diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs index b93e9f3..35076f0 100644 --- a/piet-gpu-types/src/state.rs +++ b/piet-gpu-types/src/state.rs @@ -8,7 +8,6 @@ piet_gpu! { translate: [f32; 2], bbox: [f32; 4], linewidth: f32, - right_edge: f32, flags: u32, } } diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index 138621e..713a654 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -50,10 +50,10 @@ shared uint sh_chunk_jump[N_TILE]; shared float sh_right_edge[N_TILE]; -#define StateBuf_stride (4 + 2 * State_size) +#define StateBuf_stride (8 + 2 * State_size) -StateRef state_aggregate_ref(uint partition_ix) { - return StateRef(8 + partition_ix * StateBuf_stride); +uint state_right_edge_index(uint partition_ix) { + return 2 + partition_ix * (StateBuf_stride / 4); } void main() { @@ -120,8 +120,7 @@ void main() { // look-forward is small (performance may degrade in the case // of massively complex paths). do { - StateRef agg_ref = state_aggregate_ref(aggregate_ix); - my_right_edge = State_read(agg_ref).right_edge; + my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]); aggregate_ix++; } while (isinf(my_right_edge)); } diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index dc1713b..e932e4d 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 3b6b963..14c72aa 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -12,7 +12,7 @@ build image.spv: glsl image.comp | scene.h build elements.spv: glsl elements.comp | scene.h state.h annotated.h -build binning.spv: glsl binning.comp | annotated.h bins.h setup.h +build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 2389e27..03c4535 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -310,6 +310,30 @@ void main() { switch (tag) { case Annotated_FillLine: + AnnoFillLineSeg fill_line = Annotated_FillLine_read(ref); + // This is basically the same logic as piet-metal, but should be made numerically robust. + vec2 tile_xy = vec2(tile_x * TILE_WIDTH_PX, tile_y * TILE_HEIGHT_PX); + float yEdge = mix(fill_line.p0.y, fill_line.p1.y, (tile_xy.x - fill_line.p0.x) / (fill_line.p1.x - fill_line.p0.x)); + if (min(fill_line.p0.x, fill_line.p1.x) < tile_xy.x && yEdge >= tile_xy.y && yEdge < tile_xy.y + TILE_HEIGHT_PX) { + Segment edge_seg; + if (fill_line.p0.x > fill_line.p1.x) { + fill_line.p1 = vec2(tile_xy.x, yEdge); + edge_seg.start = fill_line.p1; + edge_seg.end = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX); + } else { + fill_line.p0 = vec2(tile_xy.x, yEdge); + edge_seg.start = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX); + edge_seg.end = fill_line.p0; + } + alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit); + Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), edge_seg); + chunk_n_segs++; + } + Segment fill_seg = Segment(fill_line.p0, fill_line.p1); + alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit); + Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), fill_seg); + chunk_n_segs++; + break; case Annotated_StrokeLine: AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); Segment seg = Segment(line.p0, line.p1); diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index bc097e4..3d3f3ff 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index 341952b..bdb4e0d 100644 --- a/piet-gpu/shader/elements.comp +++ b/piet-gpu/shader/elements.comp @@ -34,14 +34,14 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf { #include "state.h" #include "annotated.h" -#define StateBuf_stride (4 + 2 * State_size) +#define StateBuf_stride (8 + 2 * State_size) StateRef state_aggregate_ref(uint partition_ix) { - return StateRef(8 + partition_ix * StateBuf_stride); + return StateRef(12 + partition_ix * StateBuf_stride); } StateRef state_prefix_ref(uint partition_ix) { - return StateRef(8 + partition_ix * StateBuf_stride + State_size); + return StateRef(12 + partition_ix * StateBuf_stride + State_size); } uint state_flag_index(uint partition_ix) { @@ -81,13 +81,12 @@ State combine_state(State a, State b) { c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x; c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y; c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; - c.right_edge = (a.flags & FLAG_SET_BBOX) != 0 ? a.right_edge : (a.flags & FLAG_RESET_BBOX) != 0 ? a.bbox.z : c.right_edge; c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags; c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1; return c; } -State map_element(ElementRef ref) { +State map_element(ElementRef ref, inout bool is_fill) { // TODO: it would *probably* be more efficient to make the memory read patterns less // divergent, though it would be more wasted memory. uint tag = Element_tag(ref); @@ -97,6 +96,7 @@ State map_element(ElementRef ref) { c.translate = vec2(0.0, 0.0); c.linewidth = 1.0; // TODO should be 0.0 c.flags = 0; + is_fill = false; switch (tag) { case Element_FillLine: case Element_StrokeLine: @@ -115,6 +115,8 @@ State map_element(ElementRef ref) { c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); break; case Element_Fill: + is_fill = true; + // fall-through case Element_Stroke: c.flags = FLAG_RESET_BBOX; break; @@ -145,9 +147,10 @@ shared vec4 sh_mat[WG_SIZE]; shared vec2 sh_translate[WG_SIZE]; shared vec4 sh_bbox[WG_SIZE]; shared float sh_width[WG_SIZE]; -shared float sh_right_edge[WG_SIZE]; shared uint sh_flags[WG_SIZE]; +shared uint sh_min_fill; + shared uint sh_tile_ix; shared State sh_prefix; @@ -157,6 +160,7 @@ void main() { // 4.4 of prefix sum paper). if (gl_LocalInvocationID.x == 0) { sh_tile_ix = atomicAdd(state[0], 1); + sh_min_fill = ~0; } barrier(); uint tile_ix = sh_tile_ix; @@ -164,18 +168,24 @@ void main() { uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS; ElementRef ref = ElementRef(ix * Element_size); - th_state[0] = map_element(ref); + bool is_fill; + uint my_min_fill = ~0; + th_state[0] = map_element(ref, is_fill); + if (is_fill) my_min_fill = ix; for (uint i = 1; i < N_ROWS; i++) { // discussion question: would it be faster to load using more coherent patterns // into thread memory? This is kinda strided. - th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i))); + th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill)); + if (is_fill && my_min_fill == ~0) { + my_min_fill = ix + i; + } } + atomicMin(sh_min_fill, my_min_fill); State agg = th_state[N_ROWS - 1]; sh_mat[gl_LocalInvocationID.x] = agg.mat; sh_translate[gl_LocalInvocationID.x] = agg.translate; sh_bbox[gl_LocalInvocationID.x] = agg.bbox; sh_width[gl_LocalInvocationID.x] = agg.linewidth; - sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge; sh_flags[gl_LocalInvocationID.x] = agg.flags; for (uint i = 0; i < LG_WG_SIZE; i++) { barrier(); @@ -194,7 +204,6 @@ void main() { sh_translate[gl_LocalInvocationID.x] = agg.translate; sh_bbox[gl_LocalInvocationID.x] = agg.bbox; sh_width[gl_LocalInvocationID.x] = agg.linewidth; - sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge; sh_flags[gl_LocalInvocationID.x] = agg.flags; } @@ -203,7 +212,6 @@ void main() { exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0); exclusive.translate = vec2(0.0, 0.0); exclusive.linewidth = 1.0; //TODO should be 0.0 - exclusive.right_edge = 0.0; exclusive.flags = 0; // Publish aggregate for this partition @@ -244,6 +252,7 @@ void main() { } } barrier(); + my_min_fill = sh_min_fill; if (tile_ix != 0) { exclusive = sh_prefix; } @@ -256,12 +265,17 @@ void main() { other.translate = sh_translate[ix]; other.bbox = sh_bbox[ix]; other.linewidth = sh_width[ix]; - other.right_edge = sh_right_edge[ix]; other.flags = sh_flags[ix]; row = combine_state(row, other); } + if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) { + state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity + } for (uint i = 0; i < N_ROWS; i++) { State st = combine_state(row, th_state[i]); + if (my_min_fill == ix + i) { + state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z); + } // We write the state now for development purposes, but the // actual goal is to write transformed and annotated elements. //State_write(StateRef((ix + i) * State_size), st); diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index c947240..962bd0a 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 6b00661..b913086 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -58,7 +58,7 @@ // This is the ratio of the number of elements in a binning workgroup // over the number of elements in a partition workgroup. -#define ELEMENT_BINNING_RATIO 4 +#define ELEMENT_BINNING_RATIO 2 #define BIN_INITIAL_ALLOC 64 #define BIN_ALLOC 256 diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h index bc6192f..2547b93 100644 --- a/piet-gpu/shader/state.h +++ b/piet-gpu/shader/state.h @@ -9,11 +9,10 @@ struct State { vec2 translate; vec4 bbox; float linewidth; - float right_edge; uint flags; }; -#define State_size 52 +#define State_size 48 StateRef State_index(StateRef ref, uint index) { return StateRef(ref.offset + index * State_size); @@ -33,14 +32,12 @@ State State_read(StateRef ref) { uint raw9 = state[ix + 9]; uint raw10 = state[ix + 10]; uint raw11 = state[ix + 11]; - uint raw12 = state[ix + 12]; State s; s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9)); s.linewidth = uintBitsToFloat(raw10); - s.right_edge = uintBitsToFloat(raw11); - s.flags = raw12; + s.flags = raw11; return s; } @@ -57,7 +54,6 @@ void State_write(StateRef ref, State s) { state[ix + 8] = floatBitsToUint(s.bbox.z); state[ix + 9] = floatBitsToUint(s.bbox.w); state[ix + 10] = floatBitsToUint(s.linewidth); - state[ix + 11] = floatBitsToUint(s.right_edge); - state[ix + 12] = s.flags; + state[ix + 11] = s.flags; } diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 65bbe5c..70b02f5 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -46,8 +46,8 @@ pub fn render_scene(rc: &mut impl RenderContext) { let circle = Circle::new(center, radius); rc.fill(circle, &color); } - /* let mut path = BezPath::new(); + /* path.move_to((100.0, 1150.0)); path.line_to((200.0, 1200.0)); path.line_to((150.0, 1250.0)); diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 8d68b0c..da234de 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -215,6 +215,7 @@ impl PietGpuRenderContext { match el { PathEl::MoveTo(p) => { let scene_pt = to_f32_2(p); + start_pt = Some(scene_pt); last_pt = Some(scene_pt); } PathEl::LineTo(p) => { @@ -228,11 +229,13 @@ impl PietGpuRenderContext { } PathEl::ClosePath => { if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { - let seg = LineSeg { - p0: last, - p1: start, - }; - self.encode_line_seg(seg, is_fill); + if last != start { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.encode_line_seg(seg, is_fill); + } } } _ => (), @@ -246,6 +249,7 @@ impl PietGpuRenderContext { match el { PathEl::MoveTo(p) => { let scene_pt = to_f32_2(p); + start_pt = Some(scene_pt); last_pt = Some(scene_pt); } PathEl::LineTo(p) => { @@ -283,11 +287,13 @@ impl PietGpuRenderContext { } PathEl::ClosePath => { if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { - let seg = LineSeg { - p0: last, - p1: start, - }; - self.encode_line_seg(seg, is_fill); + if last != start { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.encode_line_seg(seg, is_fill); + } } } }