diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index 5a4b78c..7cbda9b 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -7,9 +7,10 @@ #include "setup.h" -#define TILE_ALLOC_WG 32 +#define LG_COARSE_WG 5 +#define COARSE_WG (1 << LG_COARSE_WG) -layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in; +layout(local_size_x = COARSE_WG, local_size_y = 1) in; layout(set = 0, binding = 0) buffer PathSegBuf { uint[] pathseg; @@ -32,7 +33,22 @@ layout(set = 0, binding = 2) buffer TileBuf { #define SX (1.0 / float(TILE_WIDTH_PX)) #define SY (1.0 / float(TILE_HEIGHT_PX)) +shared uint sh_tile_count[COARSE_WG]; +shared uint sh_width[COARSE_WG]; +shared uint sh_draw_width[COARSE_WG]; +shared vec2 sh_p0[COARSE_WG]; +shared vec2 sh_p1[COARSE_WG]; +shared int sh_x0[COARSE_WG]; +shared int sh_y0[COARSE_WG]; +shared float sh_a[COARSE_WG]; +shared float sh_b[COARSE_WG]; +shared float sh_c[COARSE_WG]; +shared uint sh_base[COARSE_WG]; +shared uint sh_stride[COARSE_WG]; +shared uint sh_alloc_start; + void main() { + uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; PathSegRef ref = PathSegRef(element_ix * PathSeg_size); @@ -49,6 +65,8 @@ void main() { case PathSeg_FillLine: case PathSeg_StrokeLine: line = PathSeg_StrokeLine_read(ref); + sh_p0[th_ix] = line.p0; + sh_p1[th_ix] = line.p1; xmin = min(line.p0.x, line.p1.x) - line.stroke.x; xmax = max(line.p0.x, line.p1.x) + line.stroke.x; ymin = min(line.p0.y, line.p1.y) - line.stroke.y; @@ -60,6 +78,9 @@ void main() { c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX; b = invslope; // Note: assumes square tiles, otherwise scale. a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; + sh_a[th_ix] = a; + sh_b[th_ix] = b; + sh_c[th_ix] = c; break; } int x0 = int(floor((xmin) * SX)); @@ -74,34 +95,68 @@ void main() { y0 = clamp(y0, bbox.y, bbox.w); x1 = clamp(x1, bbox.x, bbox.z); y1 = clamp(y1, bbox.y, bbox.w); - float t = a + b * float(y0); + sh_x0[th_ix] = x0; + // TODO: can get rid of this (fold into base), with care (also need to update `a`) + sh_y0[th_ix] = y0; int stride = bbox.z - bbox.x; - int base = (y0 - bbox.y) * stride - bbox.x; - // TODO: can be tighter, use c to bound width - uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); - // Consider using subgroups to aggregate atomic add. - uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size); - TileSeg tile_seg; - tile_seg.start = line.p0; - tile_seg.end = line.p1; - for (int y = y0; y < y1; y++) { + sh_stride[th_ix] = stride; + sh_base[th_ix] = path.tiles.offset - (bbox.y * stride + bbox.x) * Tile_size; + uint width = uint(x1 - x0); + sh_width[th_ix] = width; + uint draw_width = min(width, uint(1.0 + ceil(2.0 * c))); + sh_draw_width[th_ix] = draw_width; + uint tile_count = draw_width * uint(y1 - y0); + + sh_tile_count[th_ix] = tile_count; + for (uint i = 0; i < LG_COARSE_WG; i++) { + barrier(); + if (th_ix >= (1 << i)) { + tile_count += sh_tile_count[th_ix - (1 << i)]; + } + barrier(); + sh_tile_count[th_ix] = tile_count; + } + if (th_ix == COARSE_WG - 1) { + sh_alloc_start = atomicAdd(alloc, tile_count * TileSeg_size); + } + barrier(); + uint alloc_start = sh_alloc_start; + uint total_tile_count = sh_tile_count[COARSE_WG - 1]; + + for (uint ix = th_ix; ix < total_tile_count; ix += COARSE_WG) { + // Binary search to find element + uint el_ix = 0; + for (uint i = 0; i < LG_COARSE_WG; i++) { + uint probe = el_ix + ((COARSE_WG / 2) >> i); + if (ix >= sh_tile_count[probe - 1]) { + el_ix = probe; + } + } + uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0); + uint draw_width = sh_draw_width[el_ix]; + int x0 = sh_x0[el_ix]; + int x1 = x0 + int(sh_width[el_ix]); + int dx = int(seq_ix % draw_width); + uint y = sh_y0[el_ix] + seq_ix / draw_width; + float t = sh_a[el_ix] + sh_b[el_ix] * float(y); + float c = sh_c[el_ix]; int xx0 = clamp(int(floor(t - c)), x0, x1); int xx1 = clamp(int(ceil(t + c)), x0, x1); - for (int x = xx0; x < xx1; x++) { - TileRef tile_ref = Tile_index(path.tiles, uint(base + x)); - uint tile_el = tile_ref.offset >> 2; + int x = xx0 + dx; + if (x < xx1) { + uint tile_offset = alloc_start + ix * TileSeg_size; + uint tile_el = (sh_base[el_ix] + uint(y * sh_stride[el_ix] + x) * Tile_size) >> 2; uint old; uint actual; do { old = tile[tile_el]; actual = atomicCompSwap(tile[tile_el], old, tile_offset); } while (actual != old); + TileSeg tile_seg; + tile_seg.start = sh_p0[el_ix]; + tile_seg.end = sh_p1[el_ix]; tile_seg.next.offset = old; TileSeg_write(TileSegRef(tile_offset), tile_seg); - tile_offset += TileSeg_size; } - // TODO for fills: backdrop - t += b; - base += stride; } } diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index cf0d4b9..58c2ab5 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp index 0d25274..593b87a 100644 --- a/piet-gpu/shader/tile_alloc.comp +++ b/piet-gpu/shader/tile_alloc.comp @@ -83,9 +83,11 @@ void main() { barrier(); uint alloc_start = sh_tile_alloc; - uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; - path.tiles = TileRef(alloc_start + Tile_size * tile_subix); - Path_write(path_ref, path); + if (element_ix < n_elements) { + uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; + path.tiles = TileRef(alloc_start + Tile_size * tile_subix); + Path_write(path_ref, path); + } // Zero out allocated tiles efficiently uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4); diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv index 3727647..81b3607 100644 Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/tile_alloc.spv differ