diff --git a/piet-gpu-types/src/annotated.rs b/piet-gpu-types/src/annotated.rs index 247ab12..f7a6ad6 100644 --- a/piet-gpu-types/src/annotated.rs +++ b/piet-gpu-types/src/annotated.rs @@ -3,7 +3,14 @@ use piet_gpu_derive::piet_gpu; piet_gpu! { #[gpu_write] mod annotated { - struct AnnoLineSeg { + struct AnnoFillLineSeg { + p0: [f32; 2], + p1: [f32; 2], + // A note: the layout of this struct is shared with + // AnnoStrokeLineSeg. In that case, we actually write + // [0.0, 0.0] as the stroke field, to minimize divergence. + } + struct AnnoStrokeLineSeg { p0: [f32; 2], p1: [f32; 2], // halfwidth in both x and y for binning @@ -35,8 +42,8 @@ piet_gpu! { } enum Annotated { Nop, - // The segments need a flag to indicate fill/stroke - Line(AnnoLineSeg), + FillLine(AnnoFillLineSeg), + StrokeLine(AnnoStrokeLineSeg), Quad(AnnoQuadSeg), Cubic(AnnoCubicSeg), Stroke(AnnoStroke), diff --git a/piet-gpu-types/src/bins.rs b/piet-gpu-types/src/bins.rs index 88f16f1..1ac2413 100644 --- a/piet-gpu-types/src/bins.rs +++ b/piet-gpu-types/src/bins.rs @@ -7,6 +7,9 @@ piet_gpu! { mod bins { struct BinInstance { element_ix: u32, + // Right edge of the bounding box of the associated fill + // element; used in backdrop computation. + right_edge: f32, } struct BinChunk { diff --git a/piet-gpu-types/src/scene.rs b/piet-gpu-types/src/scene.rs index 7451c9c..5792c94 100644 --- a/piet-gpu-types/src/scene.rs +++ b/piet-gpu-types/src/scene.rs @@ -85,8 +85,15 @@ piet_gpu! { } enum Element { Nop, - // The segments need a flag to indicate fill/stroke - Line(LineSeg), + // Another approach to encoding would be to use a single + // variant but have a bool for fill/stroke. This could be + // packed into the tag, so the on-the-wire representation + // would be very similar to what's here. + StrokeLine(LineSeg), + FillLine(LineSeg), + + // Note: we'll need to handle the stroke/fill distinction + // for these as well, when we do flattening on the GPU. Quad(QuadSeg), Cubic(CubicSeg), Stroke(Stroke), diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h index a3fc464..9812264 100644 --- a/piet-gpu/shader/annotated.h +++ b/piet-gpu/shader/annotated.h @@ -1,6 +1,10 @@ // Code auto-generated by piet-gpu-derive -struct AnnoLineSegRef { +struct AnnoFillLineSegRef { + uint offset; +}; + +struct AnnoStrokeLineSegRef { uint offset; }; @@ -24,16 +28,27 @@ struct AnnotatedRef { uint offset; }; -struct AnnoLineSeg { +struct AnnoFillLineSeg { + vec2 p0; + vec2 p1; +}; + +#define AnnoFillLineSeg_size 16 + +AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) { + return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size); +} + +struct AnnoStrokeLineSeg { vec2 p0; vec2 p1; vec2 stroke; }; -#define AnnoLineSeg_size 24 +#define AnnoStrokeLineSeg_size 24 -AnnoLineSegRef AnnoLineSeg_index(AnnoLineSegRef ref, uint index) { - return AnnoLineSegRef(ref.offset + index * AnnoLineSeg_size); +AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) { + return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size); } struct AnnoQuadSeg { @@ -87,18 +102,39 @@ AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) { } #define Annotated_Nop 0 -#define Annotated_Line 1 -#define Annotated_Quad 2 -#define Annotated_Cubic 3 -#define Annotated_Stroke 4 -#define Annotated_Fill 5 +#define Annotated_FillLine 1 +#define Annotated_StrokeLine 2 +#define Annotated_Quad 3 +#define Annotated_Cubic 4 +#define Annotated_Stroke 5 +#define Annotated_Fill 6 #define Annotated_size 44 AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) { return AnnotatedRef(ref.offset + index * Annotated_size); } -AnnoLineSeg AnnoLineSeg_read(AnnoLineSegRef ref) { +AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = annotated[ix + 0]; + uint raw1 = annotated[ix + 1]; + uint raw2 = annotated[ix + 2]; + uint raw3 = annotated[ix + 3]; + AnnoFillLineSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) { + uint ix = ref.offset >> 2; + annotated[ix + 0] = floatBitsToUint(s.p0.x); + annotated[ix + 1] = floatBitsToUint(s.p0.y); + annotated[ix + 2] = floatBitsToUint(s.p1.x); + annotated[ix + 3] = floatBitsToUint(s.p1.y); +} + +AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) { uint ix = ref.offset >> 2; uint raw0 = annotated[ix + 0]; uint raw1 = annotated[ix + 1]; @@ -106,14 +142,14 @@ AnnoLineSeg AnnoLineSeg_read(AnnoLineSegRef ref) { uint raw3 = annotated[ix + 3]; uint raw4 = annotated[ix + 4]; uint raw5 = annotated[ix + 5]; - AnnoLineSeg s; + AnnoStrokeLineSeg s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); return s; } -void AnnoLineSeg_write(AnnoLineSegRef ref, AnnoLineSeg s) { +void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) { uint ix = ref.offset >> 2; annotated[ix + 0] = floatBitsToUint(s.p0.x); annotated[ix + 1] = floatBitsToUint(s.p0.y); @@ -239,8 +275,12 @@ uint Annotated_tag(AnnotatedRef ref) { return annotated[ref.offset >> 2]; } -AnnoLineSeg Annotated_Line_read(AnnotatedRef ref) { - return AnnoLineSeg_read(AnnoLineSegRef(ref.offset + 4)); +AnnoFillLineSeg Annotated_FillLine_read(AnnotatedRef ref) { + return AnnoFillLineSeg_read(AnnoFillLineSegRef(ref.offset + 4)); +} + +AnnoStrokeLineSeg Annotated_StrokeLine_read(AnnotatedRef ref) { + return AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef(ref.offset + 4)); } AnnoQuadSeg Annotated_Quad_read(AnnotatedRef ref) { @@ -263,9 +303,14 @@ void Annotated_Nop_write(AnnotatedRef ref) { annotated[ref.offset >> 2] = Annotated_Nop; } -void Annotated_Line_write(AnnotatedRef ref, AnnoLineSeg s) { - annotated[ref.offset >> 2] = Annotated_Line; - AnnoLineSeg_write(AnnoLineSegRef(ref.offset + 4), s); +void Annotated_FillLine_write(AnnotatedRef ref, AnnoFillLineSeg s) { + annotated[ref.offset >> 2] = Annotated_FillLine; + AnnoFillLineSeg_write(AnnoFillLineSegRef(ref.offset + 4), s); +} + +void Annotated_StrokeLine_write(AnnotatedRef ref, AnnoStrokeLineSeg s) { + annotated[ref.offset >> 2] = Annotated_StrokeLine; + AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(ref.offset + 4), s); } void Annotated_Quad_write(AnnotatedRef ref, AnnoQuadSeg s) { diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index c3067e7..713a654 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -11,24 +11,35 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf { uint[] annotated; }; -layout(set = 0, binding = 1) buffer AllocBuf { +// This is for scanning forward for right_edge data. +layout(set = 0, binding = 1) buffer StateBuf { + uint[] state; +}; + +layout(set = 0, binding = 2) buffer AllocBuf { uint n_elements; // Will be incremented atomically to claim tiles uint tile_ix; uint alloc; }; -layout(set = 0, binding = 2) buffer BinsBuf { +layout(set = 0, binding = 3) buffer BinsBuf { uint[] bins; }; #include "annotated.h" +#include "state.h" #include "bins.h" // scale factors useful for converting coordinates to bins #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX)) +#define TSY (1.0 / float(TILE_HEIGHT_PX)) + +// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000) +#define INFINITY (1.0 / 0.0) + // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. shared uint bitmaps[N_SLICE][N_TILE]; shared uint count[N_SLICE][N_TILE]; @@ -37,6 +48,14 @@ shared uint sh_chunk_start[N_TILE]; shared uint sh_chunk_end[N_TILE]; shared uint sh_chunk_jump[N_TILE]; +shared float sh_right_edge[N_TILE]; + +#define StateBuf_stride (8 + 2 * State_size) + +uint state_right_edge_index(uint partition_ix) { + return 2 + partition_ix * (StateBuf_stride / 4); +} + void main() { BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC; @@ -65,13 +84,17 @@ void main() { tag = Annotated_tag(ref); } int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + float my_right_edge = INFINITY; + bool crosses_edge = false; switch (tag) { - case Annotated_Line: - AnnoLineSeg line = Annotated_Line_read(ref); + case Annotated_FillLine: + case Annotated_StrokeLine: + AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX)); y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY)); x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX)); y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY)); + crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY); break; case Annotated_Fill: case Annotated_Stroke: @@ -82,8 +105,41 @@ void main() { y0 = int(floor(fill.bbox.y * SY)); x1 = int(ceil(fill.bbox.z * SX)); y1 = int(ceil(fill.bbox.w * SY)); + // It probably makes more sense to track x1, to avoid having to redo + // the rounding to tile coords. + my_right_edge = fill.bbox.z; break; } + + // If the last element in this partition is a fill edge, then we need to do a + // look-forward to find the right edge of its corresponding fill. That data is + // recorded in aggregates computed in the element processing pass. + if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) { + uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO; + // This is sequential but the expectation is that the amount of + // look-forward is small (performance may degrade in the case + // of massively complex paths). + do { + my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]); + aggregate_ix++; + } while (isinf(my_right_edge)); + } + + // Now propagate right_edge backward, from fill to segment. + for (uint i = 0; i < LG_N_TILE; i++) { + // Note: we could try to cut down on write bandwidth here if the value hasn't + // changed, but not sure it's worth the complexity to track. + sh_right_edge[gl_LocalInvocationID.x] = my_right_edge; + barrier(); + if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) { + my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)]; + } + barrier(); + } + if (crosses_edge) { + x1 = int(ceil(my_right_edge * SX)); + } + // At this point, we run an iterator over the coverage area, // trying to keep divergence low. // Right now, it's just a bbox, but we'll get finer with @@ -118,9 +174,9 @@ void main() { uint chunk_new_start; // Refactor to reduce code duplication? if (chunk_n > 0) { - uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4; - if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) { - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4); + uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * BinInstance_size; + if (next_chunk + BinChunk_size + min(24, element_count * BinInstance_size) > wr_limit) { + uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * BinInstance_size); // could try to reduce fragmentation if BIN_ALLOC is only a bit above needed next_chunk = atomicAdd(alloc, alloc_amount); wr_limit = next_chunk + alloc_amount; @@ -129,10 +185,10 @@ void main() { chunk_ref = BinChunkRef(next_chunk); } BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); - if (instance_ref.offset + element_count * 4 > wr_limit) { + if (instance_ref.offset + element_count * BinInstance_size > wr_limit) { chunk_end = wr_limit; - chunk_n = (wr_limit - instance_ref.offset) / 4; - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4); + chunk_n = (wr_limit - instance_ref.offset) / BinInstance_size; + uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * BinInstance_size); chunk_new_start = atomicAdd(alloc, alloc_amount); wr_limit = chunk_new_start + alloc_amount; BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start))); @@ -141,6 +197,7 @@ void main() { chunk_n = element_count - chunk_n; } else { chunk_end = ~0; + chunk_new_start = ~0; chunk_n = element_count; } sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset; @@ -161,11 +218,11 @@ void main() { if (my_slice > 0) { idx += count[my_slice - 1][bin_ix]; } - uint out_offset = sh_chunk_start[bin_ix] + idx * 4; + uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size; if (out_offset >= sh_chunk_end[bin_ix]) { out_offset += sh_chunk_jump[bin_ix]; } - BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix)); + BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge)); } x++; if (x == x1) { diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index 76148c2..e932e4d 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h index 3ce06e0..85f7536 100644 --- a/piet-gpu/shader/bins.h +++ b/piet-gpu/shader/bins.h @@ -10,9 +10,10 @@ struct BinChunkRef { struct BinInstance { uint element_ix; + float right_edge; }; -#define BinInstance_size 4 +#define BinInstance_size 8 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { return BinInstanceRef(ref.offset + index * BinInstance_size); @@ -32,14 +33,17 @@ BinChunkRef BinChunk_index(BinChunkRef ref, uint index) { BinInstance BinInstance_read(BinInstanceRef ref) { uint ix = ref.offset >> 2; uint raw0 = bins[ix + 0]; + uint raw1 = bins[ix + 1]; BinInstance s; s.element_ix = raw0; + s.right_edge = uintBitsToFloat(raw1); return s; } void BinInstance_write(BinInstanceRef ref, BinInstance s) { uint ix = ref.offset >> 2; bins[ix + 0] = s.element_ix; + bins[ix + 1] = floatBitsToUint(s.right_edge); } BinChunk BinChunk_read(BinChunkRef ref) { diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 3b6b963..14c72aa 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -12,7 +12,7 @@ build image.spv: glsl image.comp | scene.h build elements.spv: glsl elements.comp | scene.h state.h annotated.h -build binning.spv: glsl binning.comp | annotated.h bins.h setup.h +build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index e331076..c77c6b8 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -30,6 +30,7 @@ layout(set = 0, binding = 3) buffer PtclBuf { #define N_RINGBUF 512 shared uint sh_elements[N_RINGBUF]; +shared float sh_right_edge[N_RINGBUF]; shared uint sh_chunk[N_WG]; shared uint sh_chunk_next[N_WG]; shared uint sh_chunk_n[N_WG]; @@ -41,6 +42,8 @@ shared uint sh_selected_n; shared uint sh_elements_ref; shared uint sh_bitmaps[N_SLICE][N_TILE]; +shared uint sh_backdrop[N_SLICE][N_TILE]; +shared uint sh_bd_sign[N_SLICE]; // scale factors useful for converting coordinates to tiles #define SX (1.0 / float(TILE_WIDTH_PX)) @@ -77,6 +80,14 @@ void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref, } } +// Accumulate delta to backdrop. +// +// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each +// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1. +int count_backdrop(uint bd_bitmap, uint bd_sign) { + return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign); +} + void main() { // Could use either linear or 2d layouts for both dispatch and // invocations within the workgroup. We'll use variables to abstract. @@ -109,10 +120,14 @@ void main() { sh_first_el[th_ix] = chunk.n > 0 ? BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0; } - uint count = 0; + if (th_ix < N_SLICE) { + sh_bd_sign[th_ix] = 0; + } + int backdrop = 0; while (true) { for (uint i = 0; i < N_SLICE; i++) { sh_bitmaps[i][th_ix] = 0; + sh_backdrop[i][th_ix] = 0; } while (wr_ix - rd_ix <= N_TILE) { @@ -157,8 +172,10 @@ void main() { } BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref); if (th_ix < chunk_n) { - uint el = BinInstance_read(BinInstance_index(inst_ref, th_ix)).element_ix; - sh_elements[(wr_ix + th_ix) % N_RINGBUF] = el; + BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix)); + uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF; + sh_elements[wr_el_ix] = inst.element_ix; + sh_right_edge[wr_el_ix] = inst.right_edge; } wr_ix += chunk_n; } @@ -169,8 +186,11 @@ void main() { // Read one element, compute coverage. uint tag = Annotated_Nop; AnnotatedRef ref; + float right_edge = 0.0; if (th_ix + rd_ix < wr_ix) { - uint element_ix = sh_elements[(rd_ix + th_ix) % N_RINGBUF]; + uint rd_el_ix = (rd_ix + th_ix) % N_RINGBUF; + uint element_ix = sh_elements[rd_el_ix]; + right_edge = sh_right_edge[rd_el_ix]; ref = AnnotatedRef(element_ix * Annotated_size); tag = Annotated_tag(ref); } @@ -179,15 +199,26 @@ void main() { float a, b, c; // Bounding box of element in pixel coordinates. float xmin, xmax, ymin, ymax; + uint my_slice = th_ix / 32; + uint my_mask = 1 << (th_ix & 31); switch (tag) { - case Annotated_Line: - AnnoLineSeg line = Annotated_Line_read(ref); + case Annotated_FillLine: + case Annotated_StrokeLine: + AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); xmin = min(line.p0.x, line.p1.x) - line.stroke.x; xmax = max(line.p0.x, line.p1.x) + line.stroke.x; ymin = min(line.p0.y, line.p1.y) - line.stroke.y; ymax = max(line.p0.y, line.p1.y) + line.stroke.y; float dx = line.p1.x - line.p0.x; float dy = line.p1.y - line.p0.y; + if (tag == Annotated_FillLine) { + // Set bit for backdrop sign calculation, 1 is +1, 0 is -1. + if (dy < 0) { + atomicOr(sh_bd_sign[my_slice], my_mask); + } else { + atomicAnd(sh_bd_sign[my_slice], ~my_mask); + } + } // Set up for per-scanline coverage formula, below. float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy; c = abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y) * SX; @@ -214,20 +245,20 @@ void main() { break; } - // Draw the coverage area into the bitmaks. This uses an algorithm + // Draw the coverage area into the bitmasks. This uses an algorithm // that computes the coverage of a span for given scanline. // Compute bounding box in tiles and clip to this bin. int x0 = int(floor((xmin - xy0.x) * SX)); int x1 = int(ceil((xmax - xy0.x) * SX)); + int xr = int(ceil((right_edge - xy0.x) * SX)); int y0 = int(floor((ymin - xy0.y) * SY)); int y1 = int(ceil((ymax - xy0.y) * SY)); x0 = clamp(x0, 0, N_TILE_X); x1 = clamp(x1, x0, N_TILE_X); + xr = clamp(xr, 0, N_TILE_X); y0 = clamp(y0, 0, N_TILE_Y); y1 = clamp(y1, y0, N_TILE_Y); - uint my_slice = th_ix / 32; - uint my_mask = 1 << (th_ix & 31); float t = a + b * float(y0); for (uint y = y0; y < y1; y++) { uint xx0 = clamp(int(floor(t - c)), x0, x1); @@ -235,6 +266,15 @@ void main() { for (uint x = xx0; x < xx1; x++) { atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask); } + if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) { + // Assign backdrop to all tiles to the right of the ray crossing the + // top edge of this tile, up to the right edge of the fill bbox. + float xray = t - 0.5 * b; + xx0 = max(int(ceil(xray)), 0); + for (uint x = xx0; x < xr; x++) { + atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask); + } + } t += b; } barrier(); @@ -242,13 +282,18 @@ void main() { // Output elements for this tile, based on bitmaps. uint slice_ix = 0; uint bitmap = sh_bitmaps[0][th_ix]; + uint bd_bitmap = sh_backdrop[0][th_ix]; + uint bd_sign = sh_bd_sign[0]; while (true) { if (bitmap == 0) { + backdrop += count_backdrop(bd_bitmap, bd_sign); slice_ix++; if (slice_ix == N_SLICE) { break; } bitmap = sh_bitmaps[slice_ix][th_ix]; + bd_bitmap = sh_backdrop[slice_ix][th_ix]; + bd_sign = sh_bd_sign[slice_ix]; if (bitmap == 0) { continue; } @@ -256,6 +301,13 @@ void main() { uint element_ref_ix = slice_ix * 32 + findLSB(bitmap); uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF]; + // Bits up to and including the lsb + uint bd_mask = (bitmap - 1) ^ bitmap; + backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign); + // Clear bits that have been consumed. + bd_bitmap &= ~bd_mask; + bitmap &= ~bd_mask; + // At this point, we read the element again from global memory. // If that turns out to be expensive, maybe we can pack it into // shared memory (or perhaps just the tag). @@ -263,15 +315,58 @@ void main() { tag = Annotated_tag(ref); switch (tag) { - case Annotated_Line: - AnnoLineSeg line = Annotated_Line_read(ref); + case Annotated_FillLine: + AnnoFillLineSeg fill_line = Annotated_FillLine_read(ref); + // This is basically the same logic as piet-metal, but should be made numerically robust. + vec2 tile_xy = vec2(tile_x * TILE_WIDTH_PX, tile_y * TILE_HEIGHT_PX); + float yEdge = mix(fill_line.p0.y, fill_line.p1.y, (tile_xy.x - fill_line.p0.x) / (fill_line.p1.x - fill_line.p0.x)); + if (min(fill_line.p0.x, fill_line.p1.x) < tile_xy.x && yEdge >= tile_xy.y && yEdge < tile_xy.y + TILE_HEIGHT_PX) { + Segment edge_seg; + if (fill_line.p0.x > fill_line.p1.x) { + fill_line.p1 = vec2(tile_xy.x, yEdge); + edge_seg.start = fill_line.p1; + edge_seg.end = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX); + } else { + fill_line.p0 = vec2(tile_xy.x, yEdge); + edge_seg.start = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX); + edge_seg.end = fill_line.p0; + } + alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit); + Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), edge_seg); + chunk_n_segs++; + } + Segment fill_seg = Segment(fill_line.p0, fill_line.p1); + alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit); + Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), fill_seg); + chunk_n_segs++; + break; + case Annotated_StrokeLine: + AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); Segment seg = Segment(line.p0, line.p1); alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit); Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg); chunk_n_segs++; break; case Annotated_Fill: - chunk_n_segs = 0; + if (chunk_n_segs > 0) { + AnnoFill fill = Annotated_Fill_read(ref); + SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0))); + seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs; + CmdFill cmd_fill; + cmd_fill.seg_ref = first_seg_chunk.offset; + cmd_fill.backdrop = backdrop; + cmd_fill.rgba_color = fill.rgba_color; + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Fill_write(cmd_ref, cmd_fill); + cmd_ref.offset += Cmd_size; + chunk_n_segs = 0; + } else if (backdrop != 0) { + AnnoFill fill = Annotated_Fill_read(ref); + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); + cmd_ref.offset += Cmd_size; + } + backdrop = 0; break; case Annotated_Stroke: if (chunk_n_segs > 0) { @@ -289,9 +384,6 @@ void main() { } break; } - - // clear LSB - bitmap &= bitmap - 1; } barrier(); diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index d61b227..f74d0a0 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index 8f87b87..bdb4e0d 100644 --- a/piet-gpu/shader/elements.comp +++ b/piet-gpu/shader/elements.comp @@ -10,7 +10,7 @@ #define N_ROWS 4 #define WG_SIZE 32 #define LG_WG_SIZE 5 -#define TILE_SIZE (WG_SIZE * N_ROWS) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) layout(local_size_x = WG_SIZE, local_size_y = 1) in; @@ -34,14 +34,14 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf { #include "state.h" #include "annotated.h" -#define StateBuf_stride (4 + 2 * State_size) +#define StateBuf_stride (8 + 2 * State_size) StateRef state_aggregate_ref(uint partition_ix) { - return StateRef(8 + partition_ix * StateBuf_stride); + return StateRef(12 + partition_ix * StateBuf_stride); } StateRef state_prefix_ref(uint partition_ix) { - return StateRef(8 + partition_ix * StateBuf_stride + State_size); + return StateRef(12 + partition_ix * StateBuf_stride + State_size); } uint state_flag_index(uint partition_ix) { @@ -86,7 +86,7 @@ State combine_state(State a, State b) { return c; } -State map_element(ElementRef ref) { +State map_element(ElementRef ref, inout bool is_fill) { // TODO: it would *probably* be more efficient to make the memory read patterns less // divergent, though it would be more wasted memory. uint tag = Element_tag(ref); @@ -96,9 +96,11 @@ State map_element(ElementRef ref) { c.translate = vec2(0.0, 0.0); c.linewidth = 1.0; // TODO should be 0.0 c.flags = 0; + is_fill = false; switch (tag) { - case Element_Line: - LineSeg line = Element_Line_read(ref); + case Element_FillLine: + case Element_StrokeLine: + LineSeg line = Element_FillLine_read(ref); c.bbox.xy = min(line.p0, line.p1); c.bbox.zw = max(line.p0, line.p1); break; @@ -113,6 +115,8 @@ State map_element(ElementRef ref) { c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); break; case Element_Fill: + is_fill = true; + // fall-through case Element_Stroke: c.flags = FLAG_RESET_BBOX; break; @@ -145,6 +149,8 @@ shared vec4 sh_bbox[WG_SIZE]; shared float sh_width[WG_SIZE]; shared uint sh_flags[WG_SIZE]; +shared uint sh_min_fill; + shared uint sh_tile_ix; shared State sh_prefix; @@ -154,19 +160,27 @@ void main() { // 4.4 of prefix sum paper). if (gl_LocalInvocationID.x == 0) { sh_tile_ix = atomicAdd(state[0], 1); + sh_min_fill = ~0; } barrier(); uint tile_ix = sh_tile_ix; - uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS; + uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS; ElementRef ref = ElementRef(ix * Element_size); - th_state[0] = map_element(ref); + bool is_fill; + uint my_min_fill = ~0; + th_state[0] = map_element(ref, is_fill); + if (is_fill) my_min_fill = ix; for (uint i = 1; i < N_ROWS; i++) { // discussion question: would it be faster to load using more coherent patterns // into thread memory? This is kinda strided. - th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i))); + th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill)); + if (is_fill && my_min_fill == ~0) { + my_min_fill = ix + i; + } } + atomicMin(sh_min_fill, my_min_fill); State agg = th_state[N_ROWS - 1]; sh_mat[gl_LocalInvocationID.x] = agg.mat; sh_translate[gl_LocalInvocationID.x] = agg.translate; @@ -238,6 +252,7 @@ void main() { } } barrier(); + my_min_fill = sh_min_fill; if (tile_ix != 0) { exclusive = sh_prefix; } @@ -253,8 +268,14 @@ void main() { other.flags = sh_flags[ix]; row = combine_state(row, other); } + if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) { + state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity + } for (uint i = 0; i < N_ROWS; i++) { State st = combine_state(row, th_state[i]); + if (my_min_fill == ix + i) { + state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z); + } // We write the state now for development purposes, but the // actual goal is to write transformed and annotated elements. //State_write(StateRef((ix + i) * State_size), st); @@ -266,13 +287,22 @@ void main() { AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size); uint tag = Element_tag(this_ref); switch (tag) { - case Element_Line: - LineSeg line = Element_Line_read(this_ref); - AnnoLineSeg anno_line; + case Element_FillLine: + case Element_StrokeLine: + LineSeg line = Element_StrokeLine_read(this_ref); + AnnoStrokeLineSeg anno_line; anno_line.p0 = st.mat.xz * line.p0.x + st.mat.yw * line.p0.y + st.translate; anno_line.p1 = st.mat.xz * line.p1.x + st.mat.yw * line.p1.y + st.translate; - anno_line.stroke = get_linewidth(st); - Annotated_Line_write(out_ref, anno_line); + if (tag == Element_StrokeLine) { + anno_line.stroke = get_linewidth(st); + } else { + anno_line.stroke = vec2(0.0); + } + // We do encoding a bit by hand to minimize divergence. Another approach + // would be to have a fill/stroke bool. + uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine; + annotated[out_ref.offset >> 2] = out_tag; + AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line); break; case Element_Stroke: Stroke stroke = Element_Stroke_read(this_ref); diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index 7828aa4..962bd0a 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h index 84ef80d..5bb879b 100644 --- a/piet-gpu/shader/scene.h +++ b/piet-gpu/shader/scene.h @@ -238,13 +238,14 @@ TransformRef Transform_index(TransformRef ref, uint index) { } #define Element_Nop 0 -#define Element_Line 1 -#define Element_Quad 2 -#define Element_Cubic 3 -#define Element_Stroke 4 -#define Element_Fill 5 -#define Element_SetLineWidth 6 -#define Element_Transform 7 +#define Element_StrokeLine 1 +#define Element_FillLine 2 +#define Element_Quad 3 +#define Element_Cubic 4 +#define Element_Stroke 5 +#define Element_Fill 6 +#define Element_SetLineWidth 7 +#define Element_Transform 8 #define Element_size 36 ElementRef Element_index(ElementRef ref, uint index) { @@ -446,7 +447,11 @@ uint Element_tag(ElementRef ref) { return scene[ref.offset >> 2]; } -LineSeg Element_Line_read(ElementRef ref) { +LineSeg Element_StrokeLine_read(ElementRef ref) { + return LineSeg_read(LineSegRef(ref.offset + 4)); +} + +LineSeg Element_FillLine_read(ElementRef ref) { return LineSeg_read(LineSegRef(ref.offset + 4)); } diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 5d8fb9b..b913086 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -51,9 +51,14 @@ #define N_TILE_X 16 #define N_TILE_Y 16 #define N_TILE (N_TILE_X * N_TILE_Y) +#define LG_N_TILE 8 #define N_SLICE (N_TILE / 32) // Number of workgroups for binning kernel #define N_WG 16 +// This is the ratio of the number of elements in a binning workgroup +// over the number of elements in a partition workgroup. +#define ELEMENT_BINNING_RATIO 2 + #define BIN_INITIAL_ALLOC 64 #define BIN_ALLOC 256 diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 2dca39d..70b02f5 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -185,10 +185,10 @@ impl Renderer { ]) ?; let bin_code = include_bytes!("../shader/binning.spv"); - let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?; + let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?; let bin_ds = device.create_descriptor_set( &bin_pipeline, - &[&anno_buf, &bin_alloc_buf_dev, &bin_buf], + &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf], &[], )?; diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs index 9cf5cc3..b2f054c 100644 --- a/piet-gpu/src/pico_svg.rs +++ b/piet-gpu/src/pico_svg.rs @@ -61,8 +61,8 @@ impl PicoSvg { for item in &self.items { match item { Item::Fill(fill_item) => { - //rc.fill(&fill_item.path, &fill_item.color); - rc.stroke(&fill_item.path, &fill_item.color, 1.0); + rc.fill(&fill_item.path, &fill_item.color); + //rc.stroke(&fill_item.path, &fill_item.color, 1.0); } Item::Stroke(stroke_item) => { rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width); diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index e01a6ae..da234de 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -94,7 +94,7 @@ impl RenderContext for PietGpuRenderContext { } let brush = brush.make_brush(self, || shape.bounding_box()).into_owned(); let path = shape.to_bez_path(TOLERANCE); - self.encode_path(path); + self.encode_path(path, false); match brush { PietGpuBrush::Solid(rgba_color) => { let stroke = Stroke { rgba_color }; @@ -116,7 +116,7 @@ impl RenderContext for PietGpuRenderContext { fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush) { let brush = brush.make_brush(self, || shape.bounding_box()).into_owned(); let path = shape.to_bez_path(TOLERANCE); - self.encode_path(path); + self.encode_path(path, true); match brush { PietGpuBrush::Solid(rgba_color) => { let fill = Fill { rgba_color }; @@ -198,7 +198,15 @@ impl RenderContext for PietGpuRenderContext { } impl PietGpuRenderContext { - fn encode_path(&mut self, path: impl Iterator) { + fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) { + if is_fill { + self.elements.push(Element::FillLine(seg)); + } else { + self.elements.push(Element::StrokeLine(seg)); + } + } + + fn encode_path(&mut self, path: impl Iterator, is_fill: bool) { let flatten = true; if flatten { let mut start_pt = None; @@ -207,6 +215,7 @@ impl PietGpuRenderContext { match el { PathEl::MoveTo(p) => { let scene_pt = to_f32_2(p); + start_pt = Some(scene_pt); last_pt = Some(scene_pt); } PathEl::LineTo(p) => { @@ -215,16 +224,18 @@ impl PietGpuRenderContext { p0: last_pt.unwrap(), p1: scene_pt, }; - self.elements.push(Element::Line(seg)); + self.encode_line_seg(seg, is_fill); last_pt = Some(scene_pt); } PathEl::ClosePath => { if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { - let seg = LineSeg { - p0: last, - p1: start, - }; - self.elements.push(Element::Line(seg)); + if last != start { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.encode_line_seg(seg, is_fill); + } } } _ => (), @@ -238,6 +249,7 @@ impl PietGpuRenderContext { match el { PathEl::MoveTo(p) => { let scene_pt = to_f32_2(p); + start_pt = Some(scene_pt); last_pt = Some(scene_pt); } PathEl::LineTo(p) => { @@ -246,7 +258,7 @@ impl PietGpuRenderContext { p0: last_pt.unwrap(), p1: scene_pt, }; - self.elements.push(Element::Line(seg)); + self.encode_line_seg(seg, is_fill); last_pt = Some(scene_pt); } PathEl::QuadTo(p1, p2) => { @@ -275,11 +287,13 @@ impl PietGpuRenderContext { } PathEl::ClosePath => { if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { - let seg = LineSeg { - p0: last, - p1: start, - }; - self.elements.push(Element::Line(seg)); + if last != start { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.encode_line_seg(seg, is_fill); + } } } }