// Allocation and initialization of tiles for paths.
//
// Each invocation handles one annotated element. It converts the element's
// bounding box to tile coordinates, the workgroup computes a prefix sum of
// the per-element tile counts, a single atomicAdd on `alloc` reserves space
// for the whole workgroup, each element's Path record is written, and the
// reserved tile storage is cleared to zero.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "setup.h"

#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)

layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;

layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint[] annotated;
};

layout(set = 0, binding = 1) buffer AllocBuf {
    uint n_elements;
    uint n_pathseg;
    // Bump-allocation cursor for tile storage; advanced by atomicAdd below.
    uint alloc;
};

layout(set = 0, binding = 2) buffer TileBuf {
    uint[] tile;
};

#include "annotated.h"
#include "tile.h"

// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))

// Per-invocation tile counts; holds the inclusive prefix sum after the scan.
shared uint sh_tile_count[TILE_ALLOC_WG];
// Start of this workgroup's allocation, as returned by atomicAdd on `alloc`.
shared uint sh_tile_alloc;

void main() {
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathRef path_ref = PathRef(element_ix * Path_size);
    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);

    // Invocations past the end of the element list behave like a Nop:
    // they contribute zero tiles but still take part in every barrier.
    uint tag = Annotated_Nop;
    if (element_ix < n_elements) {
        tag = Annotated_tag(ref);
    }

    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Fill:
    case Annotated_FillMask:
    case Annotated_FillMaskInv:
    case Annotated_Stroke:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that fills, strokes, and
        // clips have compatible layout.
        AnnoFill fill = Annotated_Fill_read(ref);
        // Only convert well-ordered boxes. An inverted box (min > max on
        // either axis, or NaN) keeps the all-zero bounds, so the tile count
        // below can never go negative and wrap to a huge unsigned value.
        if (fill.bbox.x <= fill.bbox.z && fill.bbox.y <= fill.bbox.w) {
            x0 = int(floor(fill.bbox.x * SX));
            y0 = int(floor(fill.bbox.y * SY));
            x1 = int(ceil(fill.bbox.z * SX));
            y1 = int(ceil(fill.bbox.w * SY));
        }
        break;
    }

    // Restrict the tile bounds to the tile grid. clamp() is monotonic, so
    // x0 <= x1 and y0 <= y1 still hold afterwards.
    x0 = clamp(x0, 0, WIDTH_IN_TILES);
    y0 = clamp(y0, 0, HEIGHT_IN_TILES);
    x1 = clamp(x1, 0, WIDTH_IN_TILES);
    y1 = clamp(y1, 0, HEIGHT_IN_TILES);

    Path path;
    path.bbox = uvec4(x0, y0, x1, y1);
    // Both factors are non-negative here, so the conversion is exact.
    uint tile_count = uint((x1 - x0) * (y1 - y0));
    if (tag == Annotated_EndClip) {
        // Don't actually allocate tiles for an end clip, but we do want
        // the path structure (especially bbox) allocated for it.
        tile_count = 0;
    }

    sh_tile_count[th_ix] = tile_count;
    // Inclusive prefix sum of sh_tile_count in LG_TILE_ALLOC_WG doubling
    // steps. The first barrier makes the previous round's writes visible;
    // the second keeps any invocation from overwriting its slot while
    // others are still reading it.
    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
        uint offset = 1u << i;
        barrier();
        if (th_ix >= offset) {
            tile_count += sh_tile_count[th_ix - offset];
        }
        barrier();
        sh_tile_count[th_ix] = tile_count;
    }

    // The last invocation holds the workgroup total; it reserves space for
    // the whole workgroup with one atomic operation.
    if (th_ix == TILE_ALLOC_WG - 1) {
        sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
    }
    barrier();
    uint alloc_start = sh_tile_alloc;

    if (element_ix < n_elements) {
        // Exclusive prefix: number of tiles claimed by earlier invocations.
        uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0u;
        path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
        Path_write(path_ref, path);
    }

    // Zero out allocated tiles efficiently
    // total_count is the number of uint words to clear; alloc_start >> 2
    // turns the allocation offset into an index into tile[] (presumably a
    // byte offset, with Tile_size in bytes; confirm against tile.h).
    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
    uint start_ix = alloc_start >> 2;
    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
        // Note: this interleaving is faster than using Tile_write
        // by a significant amount.
        tile[start_ix + i] = 0;
    }
}