diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs index 3faffb9..36274c4 100644 --- a/piet-gpu-types/src/ptcl.rs +++ b/piet-gpu-types/src/ptcl.rs @@ -13,8 +13,7 @@ piet_gpu! { end: [f32; 2], } struct CmdStroke { - n_segs: u32, - // Should be Ref if we had cross-module references. + // Should be Ref if we had cross-module references. seg_ref: u32, half_width: f32, rgba_color: u32, diff --git a/piet-gpu-types/src/segment.rs b/piet-gpu-types/src/segment.rs index ba5f3e2..0b18ab8 100644 --- a/piet-gpu-types/src/segment.rs +++ b/piet-gpu-types/src/segment.rs @@ -12,8 +12,7 @@ piet_gpu! { // Note: this is only suitable for strokes, fills require backdrop. struct ItemHeader { - n: u32, - segments: Ref, + segments: Ref, } // TODO: strongly consider using f16. If so, these would be @@ -23,5 +22,11 @@ piet_gpu! { start: [f32; 2], end: [f32; 2], } + + struct SegChunk { + n: u32, + next: Ref, + // Segments follow (could represent this as a variable sized array). + } } } diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 3da40c9..4a8181f 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -13,6 +13,6 @@ build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h -build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h setup.h +build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h ptcl.h setup.h -build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h +build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h setup.h diff --git a/piet-gpu/shader/kernel2s.comp b/piet-gpu/shader/kernel2s.comp index 3eb2d00..8c60b5b 100644 --- a/piet-gpu/shader/kernel2s.comp +++ b/piet-gpu/shader/kernel2s.comp @@ -44,12 +44,15 @@ void main() { InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size); ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size)); TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header)); - SegmentRef seg_ref = SegmentRef(0); + SegChunkRef seg_chunk_ref = SegChunkRef(0); uint seg_limit = 0; // Iterate through items; stroke_n holds count remaining. while (true) { if (chunk.chunk_n == 0) { chunk_ref = chunk.next; + if (chunk_ref.offset == 0) { + break; + } chunk = Chunk_read(chunk_ref); stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size); } @@ -58,16 +61,8 @@ void main() { // Process the stroke polyline item. uint max_n_segs = poly.n_points - 1; - uint reserve = max_n_segs * Segment_size; - if (seg_ref.offset + reserve > seg_limit) { - // This is a heuristic to balance atomic bandwidth and utilization. - // The output always gets a contiguous allocation. We might use - // all, some, or none of the capacity. - uint capacity_bytes = stroke_n > 1 ? reserve * 2 + 128 : reserve; - seg_ref.offset = atomicAdd(alloc, capacity_bytes); - seg_limit = seg_ref.offset + capacity_bytes; - } - uint n_segs = 0; + uint chunk_n_segs = 0; + SegChunkRef seg_chunk_ref; vec2 start = Point_read(poly.points).xy; for (uint j = 0; j < max_n_segs; j++) { poly.points.offset += Point_size; @@ -103,18 +98,31 @@ void main() { && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax) && s00 * s01 + s00 * s10 + s00 * s11 < 3.0) { + // Allocate a chunk if needed. + if (chunk_n_segs == 0) { + if (seg_chunk_ref.offset + 40 > seg_limit) { + seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC); + seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size; + } + ItemHeader_write(item_header, ItemHeader(seg_chunk_ref)); + } else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) { + uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC); + seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size; + SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref))); + seg_chunk_ref.offset = new_chunk_ref; + chunk_n_segs = 0; + } Segment seg = Segment(start, end); - Segment_write(Segment_index(seg_ref, n_segs), seg); - n_segs++; + Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg); + chunk_n_segs++; } start = end; } - ItemHeader_write(item_header, ItemHeader(n_segs, seg_ref)); - if (--stroke_n == 0) { - break; + if (chunk_n_segs > 0) { + SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0))); + seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs; } - seg_ref.offset += n_segs * Segment_size; stroke_ref.offset += Instance_size; chunk.chunk_n--; diff --git a/piet-gpu/shader/kernel2s.spv b/piet-gpu/shader/kernel2s.spv index 7c7f48f..4f249bb 100644 Binary files a/piet-gpu/shader/kernel2s.spv and b/piet-gpu/shader/kernel2s.spv differ diff --git a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp index fc4f9ea..81c24d1 100644 --- a/piet-gpu/shader/kernel3.comp +++ b/piet-gpu/shader/kernel3.comp @@ -87,10 +87,9 @@ void main() { case PietItem_Poly: ItemHeader stroke_item = ItemHeader_read(stroke_th.items); stroke_th.items.offset += ItemHeader_size; - if (stroke_item.n > 0) { + if (stroke_item.segments.offset != 0) { PietStrokePolyLine poly = PietItem_Poly_read(item_ref); CmdStroke cmd = CmdStroke( - stroke_item.n, stroke_item.segments.offset, 0.5 * poly.width, poly.rgba_color diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv index f5b83bc..3b8aa91 100644 Binary files a/piet-gpu/shader/kernel3.spv and b/piet-gpu/shader/kernel3.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 931f28b..e30372a 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -54,13 +54,18 @@ void main() { case Cmd_Stroke: CmdStroke stroke = Cmd_Stroke_read(cmd_ref); float df = 1e9; - for (int i = 0; i < stroke.n_segs; i++) { - Segment seg = Segment_read(Segment_index(SegmentRef(stroke.seg_ref), i)); - vec2 line_vec = seg.end - seg.start; - vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; - float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); - df = min(df, length(line_vec * t - dpos)); - } + SegChunkRef seg_chunk_ref = SegChunkRef(stroke.seg_ref); + do { + SegChunk seg_chunk = SegChunk_read(seg_chunk_ref); + for (int i = 0; i < seg_chunk.n; i++) { + Segment seg = Segment_read(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * i)); + vec2 line_vec = seg.end - seg.start; + vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df = min(df, length(line_vec * t - dpos)); + } + seg_chunk_ref = seg_chunk.next; + } while (seg_chunk_ref.offset != 0); fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx; alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0); rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index b931f23..99067bb 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h index 8b62538..e52f811 100644 --- a/piet-gpu/shader/ptcl.h +++ b/piet-gpu/shader/ptcl.h @@ -60,13 +60,12 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) { } struct CmdStroke { - uint n_segs; uint seg_ref; float half_width; uint rgba_color; }; -#define CmdStroke_size 16 +#define CmdStroke_size 12 CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) { return CmdStrokeRef(ref.offset + index * CmdStroke_size); @@ -187,21 +186,18 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { uint raw0 = ptcl[ix + 0]; uint raw1 = ptcl[ix + 1]; uint raw2 = ptcl[ix + 2]; - uint raw3 = ptcl[ix + 3]; CmdStroke s; - s.n_segs = raw0; - s.seg_ref = raw1; - s.half_width = uintBitsToFloat(raw2); - s.rgba_color = raw3; + s.seg_ref = raw0; + s.half_width = uintBitsToFloat(raw1); + s.rgba_color = raw2; return s; } void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.n_segs; - ptcl[ix + 1] = s.seg_ref; - ptcl[ix + 2] = floatBitsToUint(s.half_width); - ptcl[ix + 3] = s.rgba_color; + ptcl[ix + 0] = s.seg_ref; + ptcl[ix + 1] = floatBitsToUint(s.half_width); + ptcl[ix + 2] = s.rgba_color; } CmdFill CmdFill_read(CmdFillRef ref) { diff --git a/piet-gpu/shader/segment.h b/piet-gpu/shader/segment.h index 517c115..2843b64 100644 --- a/piet-gpu/shader/segment.h +++ b/piet-gpu/shader/segment.h @@ -12,6 +12,10 @@ struct SegmentRef { uint offset; }; +struct SegChunkRef { + uint offset; +}; + struct TileHeader { uint n; ItemHeaderRef items; @@ -24,11 +28,10 @@ TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) { } struct ItemHeader { - uint n; - SegmentRef segments; + SegChunkRef segments; }; -#define ItemHeader_size 8 +#define ItemHeader_size 4 ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) { return ItemHeaderRef(ref.offset + index * ItemHeader_size); @@ -45,6 +48,17 @@ SegmentRef Segment_index(SegmentRef ref, uint index) { return SegmentRef(ref.offset + index * Segment_size); } +struct SegChunk { + uint n; + SegChunkRef next; +}; + +#define SegChunk_size 8 + +SegChunkRef SegChunk_index(SegChunkRef ref, uint index) { + return SegChunkRef(ref.offset + index * SegChunk_size); +} + TileHeader TileHeader_read(TileHeaderRef ref) { uint ix = ref.offset >> 2; uint raw0 = segment[ix + 0]; @@ -64,17 +78,14 @@ void TileHeader_write(TileHeaderRef ref, TileHeader s) { ItemHeader ItemHeader_read(ItemHeaderRef ref) { uint ix = ref.offset >> 2; uint raw0 = segment[ix + 0]; - uint raw1 = segment[ix + 1]; ItemHeader s; - s.n = raw0; - s.segments = SegmentRef(raw1); + s.segments = SegChunkRef(raw0); return s; } void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) { uint ix = ref.offset >> 2; - segment[ix + 0] = s.n; - segment[ix + 1] = s.segments.offset; + segment[ix + 0] = s.segments.offset; } Segment Segment_read(SegmentRef ref) { @@ -97,3 +108,19 @@ void Segment_write(SegmentRef ref, Segment s) { segment[ix + 3] = floatBitsToUint(s.end.y); } +SegChunk SegChunk_read(SegChunkRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = segment[ix + 0]; + uint raw1 = segment[ix + 1]; + SegChunk s; + s.n = raw0; + s.next = SegChunkRef(raw1); + return s; +} + +void SegChunk_write(SegChunkRef ref, SegChunk s) { + uint ix = ref.offset >> 2; + segment[ix + 0] = s.n; + segment[ix + 1] = s.next.offset; +} + diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index a644dc0..2bebabe 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -32,3 +32,7 @@ #define TILE_HEIGHT_PX 16 #define PTCL_INITIAL_ALLOC 1024 + +// Maximum number of segments in a SegChunk +#define SEG_CHUNK_N 32 +#define SEG_CHUNK_ALLOC 512 \ No newline at end of file diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs index 9f4f25f..4416487 100644 --- a/piet-gpu/src/main.rs +++ b/piet-gpu/src/main.rs @@ -53,7 +53,7 @@ fn render_scene(rc: &mut impl RenderContext) { } fn render_cardioid(rc: &mut impl RenderContext) { - let n = 100; + let n = 91; let dth = std::f64::consts::PI * 2.0 / (n as f64); let center = Point::new(1024.0, 768.0); let r = 750.0;