diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 28efd16..692eeca 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -44,16 +44,12 @@ shared uint sh_part_count[N_PART_READ];
 shared uint sh_part_elements[N_PART_READ];
 
 shared uint sh_bitmaps[N_SLICE][N_TILE];
-shared uint sh_backdrop[N_SLICE][N_TILE];
-shared uint sh_bd_sign[N_SLICE];
-shared uint sh_is_segment[N_SLICE];
-// Shared state for parallel segment output stage
-
-// Count of total number of segments in each tile, then
-// inclusive prefix sum of same.
-shared uint sh_seg_count[N_TILE];
-shared uint sh_seg_alloc;
+shared uint sh_tile_count[N_TILE];
+// The width of the tile rect for the element, intersected with this bin
+shared uint sh_tile_width[N_TILE];
+shared uint sh_tile_x0[N_TILE];
+shared uint sh_tile_y0[N_TILE];
 
 // scale factors useful for converting coordinates to tiles
 #define SX (1.0 / float(TILE_WIDTH_PX))
@@ -70,30 +66,6 @@ void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
     }
 }
 
-#define CHUNK_ALLOC_SLAB 16
-
-uint alloc_chunk_remaining;
-uint alloc_chunk_offset;
-
-SegChunkRef alloc_seg_chunk() {
-    if (alloc_chunk_remaining == 0) {
-        alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size);
-        alloc_chunk_remaining = CHUNK_ALLOC_SLAB;
-    }
-    uint offset = alloc_chunk_offset;
-    alloc_chunk_offset += SegChunk_size;
-    alloc_chunk_remaining--;
-    return SegChunkRef(offset);
-}
-
-// Accumulate delta to backdrop.
-//
-// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each
-// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1.
-int count_backdrop(uint bd_bitmap, uint bd_sign) {
-    return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
-}
-
 void main() {
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
@@ -110,13 +82,6 @@ void main() {
     CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
     uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
 
-    // Allocation and management of segment output
-    SegChunkRef first_seg_chunk = SegChunkRef(0);
-    SegChunkRef last_chunk_ref = SegChunkRef(0);
-    uint last_chunk_n = 0;
-    SegmentRef last_chunk_segs = SegmentRef(0);
-    alloc_chunk_remaining = 0;
-
     // I'm sure we can figure out how to do this with at least one fewer register...
     // Items up to rd_ix have been read from sh_elements
     uint rd_ix = 0;
@@ -125,17 +90,10 @@ void main() {
     // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
     uint part_start_ix = 0;
     uint ready_ix = 0;
-    if (th_ix < N_SLICE) {
-        sh_bd_sign[th_ix] = 0;
-    }
     int backdrop = 0;
     while (true) {
         for (uint i = 0; i < N_SLICE; i++) {
             sh_bitmaps[i][th_ix] = 0;
-            sh_backdrop[i][th_ix] = 0;
-        }
-        if (th_ix < N_SLICE) {
-            sh_is_segment[th_ix] = 0;
         }
 
         // parallel read of input partitions
@@ -204,8 +162,6 @@ void main() {
         // Bounding box of element in pixel coordinates.
         float xmin, xmax, ymin, ymax;
-        uint my_slice = th_ix / 32;
-        uint my_mask = 1 << (th_ix & 31);
         switch (tag) {
         case Annotated_Fill:
         case Annotated_Stroke:
@@ -231,15 +187,58 @@ void main() {
         int x1 = int(ceil((xmax - xy0.x) * SX));
         int y0 = int(floor((ymin - xy0.y) * SY));
         int y1 = int(ceil((ymax - xy0.y) * SY));
+
         x0 = clamp(x0, 0, N_TILE_X);
         x1 = clamp(x1, x0, N_TILE_X);
         y0 = clamp(y0, 0, N_TILE_Y);
         y1 = clamp(y1, y0, N_TILE_Y);
-        for (uint y = y0; y < y1; y++) {
-            for (uint x = x0; x < x1; x++) {
-                atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
+
+        uint tile_count = uint((x1 - x0) * (y1 - y0));
+        sh_tile_width[th_ix] = uint(x1 - x0);
+        sh_tile_x0[th_ix] = uint(x0);
+        sh_tile_y0[th_ix] = uint(y0);
+
+        // Prefix sum of sh_tile_count
+        sh_tile_count[th_ix] = tile_count;
+        for (uint i = 0; i < LG_N_TILE; i++) {
+            barrier();
+            if (th_ix >= (1 << i)) {
+                tile_count += sh_tile_count[th_ix - (1 << i)];
+            }
+            barrier();
+            sh_tile_count[th_ix] = tile_count;
+        }
+        barrier();
+        uint total_tile_count = sh_tile_count[N_TILE - 1];
+        for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
+            // Binary search to find element
+            uint el_ix = 0;
+            for (uint i = 0; i < LG_N_TILE; i++) {
+                uint probe = el_ix + ((N_TILE / 2) >> i);
+                if (ix >= sh_tile_count[probe - 1]) {
+                    el_ix = probe;
+                }
+            }
+            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
+            uint width = sh_tile_width[el_ix];
+            uint x = sh_tile_x0[el_ix] + seq_ix % width;
+            uint y = sh_tile_y0[el_ix] + seq_ix / width;
+            uint tile_x = x + gl_WorkGroupID.x * N_TILE_X;
+            uint tile_y = y + gl_WorkGroupID.y * N_TILE_Y;
+            uint element_ix = sh_elements[el_ix];
+            Path path = Path_read(PathRef(element_ix * Path_size));
+            if (tile_x >= path.bbox.x && tile_x < path.bbox.z && tile_y >= path.bbox.y && tile_y < path.bbox.w) {
+                uint stride = path.bbox.z - path.bbox.x;
+                uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x;
+                Tile tile = Tile_read(Tile_index(path.tiles, tile_subix));
+                if (tile.tile.offset != 0) {
+                    uint el_slice = el_ix / 32;
+                    uint el_mask = 1 << (el_ix & 31);
+                    atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
+                }
             }
         }
+        barrier();
 
         // We've computed coverage and other info for each element in the input, now for
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index 6b2afaf..b3a90c8 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
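
Note on the coverage change above: previously each thread looped over its own element's
tile rect and OR'ed its bit into sh_bitmaps, so one large element serialized the whole
workgroup. The new hunk load-balances instead: every thread publishes the area of its
clamped tile rect, the workgroup takes an inclusive prefix sum of those counts, and the
flat range of (element, tile) slots is striped evenly across threads, each slot recovering
its owning element by binary search over the prefix sums. The following is a minimal
standalone sketch of that idiom; the prefix-sum and search loops mirror the hunk above,
but the shader scaffolding and the per-thread count source are illustrative assumptions,
not piet-gpu code.

#version 450
#define N_TILE 256
#define LG_N_TILE 8

layout(local_size_x = N_TILE) in;

shared uint sh_count[N_TILE];

void main() {
    uint th_ix = gl_LocalInvocationID.x;
    // Hypothetical per-thread work-item count; in coarse.comp this is the
    // clamped tile-rect area of the thread's element.
    uint count = (th_ix * 7u) % 5u;

    // Hillis-Steele inclusive prefix sum in shared memory: after pass i,
    // sh_count[j] holds the sum of inputs j - 2^(i+1) + 1 through j.
    sh_count[th_ix] = count;
    for (uint i = 0; i < LG_N_TILE; i++) {
        barrier();
        if (th_ix >= (1u << i)) {
            count += sh_count[th_ix - (1u << i)];
        }
        barrier();
        sh_count[th_ix] = count;
    }
    barrier();

    // Work items are now numbered 0..total-1; stripe them across threads.
    uint total = sh_count[N_TILE - 1];
    for (uint ix = th_ix; ix < total; ix += N_TILE) {
        // Binary search for the owning element: the smallest el_ix with
        // ix < sh_count[el_ix]. Each pass halves the probe stride.
        uint el_ix = 0;
        for (uint i = 0; i < LG_N_TILE; i++) {
            uint probe = el_ix + ((N_TILE / 2u) >> i);
            if (ix >= sh_count[probe - 1]) {
                el_ix = probe;
            }
        }
        // Rank of this work item within its element's run of items.
        uint seq_ix = ix - (el_ix > 0 ? sh_count[el_ix - 1] : 0u);
        // ... in coarse.comp, (el_ix, seq_ix) is decoded to a tile (x, y) ...
    }
}

The write is also gated on real work now: a tile contributes a bit to sh_bitmaps only if
it lies inside the path's tile bounding box and its segment list is non-empty
(tile.tile.offset != 0), so a sparse path no longer marks tiles its segments never touch.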