// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Propagation of tile backdrop for filling.
//
// Each thread reads one path element and derives, from its bounding box, how
// many tile rows and tile columns the element spans.
// The row counts are then prefix summed so that the rows of all elements in
// the workgroup can be redistributed evenly over the workgroup's threads.
// In the following step, the workgroup walks those tile rows in parallel, one
// row per thread at a time.
// For each row the per-tile backdrop, as calculated in the previous coarse
// path segment kernel, is read and propagated from left to right (i.e. it is
// prefix summed along the row).
//
// Output state:
//  - Each path element has an array of tiles covering the whole path based on
//    its bounding box
//  - Each tile per path element contains the 'backdrop' and a list of
//    subdivided path segments

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
#define BACKDROP_WG (1 << LG_BACKDROP_WG)
#ifndef BACKDROP_DIST_FACTOR
// Some paths (those covering a large area) can generate a lot of backdrop
// tiles; BACKDROP_DIST_FACTOR defines how many additional threads we spawn for
// parallel row processing. The additional threads do not participate in the
// earlier stages (calculating the tile counts) but do work in the final prefix
// sum stage, which has a lot more parallelism.
// This feature is opt-in: one variant is compiled with the following default,
// while the other variant is compiled with a larger BACKDROP_DIST_FACTOR, which
// is used on GPUs supporting a larger workgroup size to improve performance.
#define BACKDROP_DIST_FACTOR 1
#endif

layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "tile.h"

// Per-element row counts; after the scan in main() this holds the inclusive
// prefix sum of the row counts.
shared uint sh_row_count[BACKDROP_WG];
// Per-element allocation covering the element's tile array.
shared Alloc sh_row_alloc[BACKDROP_WG];
// Per-element width of a tile row, in tiles.
shared uint sh_row_width[BACKDROP_WG];

void main() {
    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
        return;
    }

    uint local_ix = gl_LocalInvocationIndex;
    uint path_ix = gl_GlobalInvocationID.x;
    // Only the threads with local y == 0 own a path element and take part in
    // the scan; for them local_ix is below BACKDROP_WG.
    bool owns_element = gl_LocalInvocationID.y == 0;

    // Phase 1 -- work assignment: 1 thread : 1 path element.
    uint running_rows = 0;
    if (owns_element) {
        if (path_ix < conf.n_elements) {
            // Possible TODO: backdrops of stroked paths need not be processed.
            // There used to be logic for that, but it was removed because it
            // relied on the Annotated struct.
            PathRef ref = PathRef(conf.tile_alloc.offset + path_ix * Path_size);
            Path path = Path_read(conf.tile_alloc, ref);

            uint cols = path.bbox.z - path.bbox.x;
            uint rows = path.bbox.w - path.bbox.y;
            sh_row_width[local_ix] = cols;
            sh_row_alloc[local_ix] = new_alloc(path.tiles.offset, cols * rows * Tile_size, true);

            // A path that does not cross any tile top edge has no backdrop, so
            // a single-row path can be skipped. The exception is a path that
            // may cross the y = 0 top edge and was merely clipped to one row.
            // Note: this can probably be expanded to width = 2 as long as the
            // path doesn't cross the left edge.
            bool skip_single_row = (rows == 1 && path.bbox.y > 0);
            running_rows = skip_single_row ? 0 : rows;
        }
        sh_row_count[local_ix] = running_rows;
    }

    // Phase 2 -- inclusive prefix sum of sh_row_count, doubling the look-back
    // distance each round. Every invocation executes every barrier.
    for (uint stride = 1u; stride < uint(BACKDROP_WG); stride <<= 1) {
        barrier();
        if (owns_element && local_ix >= stride) {
            running_rows += sh_row_count[local_ix - stride];
        }
        barrier();
        if (owns_element) {
            sh_row_count[local_ix] = running_rows;
        }
    }
    barrier();

    // Phase 3 -- work assignment: 1 thread : 1 path element row.
    uint total_rows = sh_row_count[BACKDROP_WG - 1];
    for (uint row = local_ix; row < total_rows; row += BACKDROP_WG * BACKDROP_DIST_FACTOR) {
        // Binary search over the scanned counts for the element owning `row`:
        // the first index whose inclusive sum exceeds `row`.
        uint owner = 0;
        for (uint step = uint(BACKDROP_WG / 2); step > 0; step >>= 1) {
            uint candidate = owner + step;
            if (row >= sh_row_count[candidate - 1]) {
                owner = candidate;
            }
        }

        uint width = sh_row_width[owner];
        if (width == 0) {
            continue;
        }

        // Index of this row within its owning element.
        uint rows_before = 0;
        if (owner > 0) {
            rows_before = sh_row_count[owner - 1];
        }
        uint row_in_element = row - rows_before;

        // Process one row sequentially: read the backdrop value of each tile
        // and write back the running sum. The cursor starts at word 1 of the
        // row's first tile and advances 2 words per tile.
        Alloc tiles = sh_row_alloc[owner];
        uint cursor = (tiles.offset >> 2) + 1 + row_in_element * 2 * width;
        uint backdrop = read_mem(tiles, cursor);
        for (uint remaining = width - 1; remaining > 0; remaining--) {
            cursor += 2;
            backdrop += read_mem(tiles, cursor);
            write_mem(tiles, cursor, backdrop);
        }
    }
}