vello/piet-gpu/shader/backdrop.comp

113 lines
4.1 KiB
Plaintext
Raw Normal View History

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Propagation of tile backdrop for filling.
//
// Each thread reads one path element and calculates the number of spanned tiles
// based on the bounding box.
// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel.
// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
// and propagated from the left to the right (prefix summed).
//
// Output state:
// - Each path element has an array of tiles covering the whole path based on boundig box
// - Each tile per path element contains the 'backdrop' and a list of subdivided path segments
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
#define BACKDROP_WG (1 << LG_BACKDROP_WG)
layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "annotated.h"
#include "tile.h"
shared uint sh_row_count[BACKDROP_WG];
shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG];
void main() {
if (mem_error != NO_ERROR) {
return;
}
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
// Work assignment: 1 thread : 1 path element
uint row_count = 0;
if (element_ix < conf.n_elements) {
AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
switch (tag.tag) {
case Annotated_Color:
if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
break;
}
// Fall through.
implement FillImage command and sRGB support FillImage is like Fill, except that it takes its color from one or more image atlases. kernel4 uses a single image for non-Vulkan hosts, and the dynamic sized array of image descriptors on Vulkan. A previous version of this commit used textures. I think images are a better choice for piet-gpu, for several reasons: - Texture sampling, in particular textureGrad, is slow on lower spec devices such as Google Pixel. Texture sampling is particularly slow and difficult to implement for CPU fallbacks. - Texture sampling need more parameters, in particular the full u,v transformation matrix, leading to a large increase in the command size. Since all commands use the same size, that memory penalty is paid by all scenes, not just scenes with textures. - It is unlikely that piet-gpu will support every kind of fill for every client, because each kind must be added to kernel4. With FillImage, a client will prepare the image(s) in separate shader stages, sampling and applying transformations and special effects as needed. Textures that align with the output pixel grid can be used directly, without pre-processing. Note that the pre-processing step can run concurrently with the piet-gpu pipeline; Only the last stage, kernel4, needs the images. Pre-processing most likely uses fixed function vertex/fragment programs, which on some GPUs may run in parallel with piet-gpu's compute programs. While here, fix a few validation errors: - Explicitly enable EXT_descriptor_indexing, KHR_maintenance3, KHR_get_physical_device_properties2. - Specify a vkDescriptorSetVariableDescriptorCountAllocateInfo for vkAllocateDescriptorSets. Otherwise, variable image2D arrays won't work (but sampler2D arrays do, at least on my setup). Updates #38 Signed-off-by: Elias Naur <mail@eliasnaur.com>
2020-12-29 08:02:39 +11:00
case Annotated_FillImage:
case Annotated_BeginClip:
PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
Path path = Path_read(conf.tile_alloc, path_ref);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
// Paths that don't cross tile top edges don't have backdrops.
// Don't apply the optimization to paths that may cross the y = 0
// top edge, but clipped to 1 row.
if (row_count == 1 && path.bbox.y > 0) {
// Note: this can probably be expanded to width = 2 as
// long as it doesn't cross the left edge.
row_count = 0;
}
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
sh_row_alloc[th_ix] = path_alloc;
}
}
sh_row_count[th_ix] = row_count;
// Prefix sum of sh_row_count
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
barrier();
if (th_ix >= (1 << i)) {
row_count += sh_row_count[th_ix - (1 << i)];
}
barrier();
sh_row_count[th_ix] = row_count;
}
barrier();
// Work assignment: 1 thread : 1 path element row
uint total_rows = sh_row_count[BACKDROP_WG - 1];
for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
// Binary search to find element
uint el_ix = 0;
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
uint probe = el_ix + ((BACKDROP_WG / 2) >> i);
if (row >= sh_row_count[probe - 1]) {
el_ix = probe;
}
}
uint width = sh_row_width[el_ix];
if (width > 0) {
// Process one row sequentially
// Read backdrop value per tile and prefix sum it
Alloc tiles_alloc = sh_row_alloc[el_ix];
uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width;
uint sum = read_mem(tiles_alloc, tile_el_ix);
for (uint x = 1; x < width; x++) {
tile_el_ix += 2;
sum += read_mem(tiles_alloc, tile_el_ix);
write_mem(tiles_alloc, tile_el_ix, sum);
}
}
}
}