diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs index 2072d8a..db9516f 100644 --- a/piet-gpu-types/src/lib.rs +++ b/piet-gpu-types/src/lib.rs @@ -1,5 +1,6 @@ pub mod encoder; pub mod ptcl; pub mod scene; +pub mod segment; pub mod test; pub mod tilegroup; diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs index d19e825..834f1b6 100644 --- a/piet-gpu-types/src/main.rs +++ b/piet-gpu-types/src/main.rs @@ -6,6 +6,7 @@ fn main() { match mod_name.as_str() { "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), + "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()), "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()), "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()), _ => println!("Oops, unknown module name"), diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs index ed72e42..3faffb9 100644 --- a/piet-gpu-types/src/ptcl.rs +++ b/piet-gpu-types/src/ptcl.rs @@ -13,8 +13,10 @@ piet_gpu! { end: [f32; 2], } struct CmdStroke { - // In existing code, this is f16. Should we have support? - halfWidth: f32, + n_segs: u32, + // Should be Ref if we had cross-module references. + seg_ref: u32, + half_width: f32, rgba_color: u32, } struct CmdFill { diff --git a/piet-gpu-types/src/segment.rs b/piet-gpu-types/src/segment.rs new file mode 100644 index 0000000..ba5f3e2 --- /dev/null +++ b/piet-gpu-types/src/segment.rs @@ -0,0 +1,27 @@ +use piet_gpu_derive::piet_gpu; + +// Structures representing segments for stroke/fill items. + +piet_gpu! { + #[gpu_write] + mod segment { + struct TileHeader { + n: u32, + items: Ref, + } + + // Note: this is only suitable for strokes, fills require backdrop. + struct ItemHeader { + n: u32, + segments: Ref, + } + + // TODO: strongly consider using f16. If so, these would be + // relative to the tile. We're doing f32 for now to minimize + // divergence from piet-metal originals. + struct Segment { + start: [f32; 2], + end: [f32; 2], + } + } +} diff --git a/piet-gpu-types/src/tilegroup.rs b/piet-gpu-types/src/tilegroup.rs index 5912154..ea295d9 100644 --- a/piet-gpu-types/src/tilegroup.rs +++ b/piet-gpu-types/src/tilegroup.rs @@ -1,5 +1,18 @@ use piet_gpu_derive::piet_gpu; +// Structures representing tilegroup instances (output of kernel 1). +// There are three outputs: the main instances, the stroke instances, +// and the fill instances. All three are conceptually a list of +// instances, but the encoding is slightly different. The first is +// encoded with Instance, Jump, and End. The other two are encoded +// as a linked list of Chunk. + +// The motivation for the difference is that the first requires fewer +// registers to track state, but the second contains information that +// is useful up front for doing dynamic allocation in kernel 2, as +// well as increasing read parallelism; the "jump" approach really is +// geared to sequential reading. + piet_gpu! { #[gpu_write] mod tilegroup { @@ -11,7 +24,11 @@ piet_gpu! { offset: [f32; 2], } struct Jump { - new_ref: u32, + new_ref: Ref, + } + struct Chunk { + chunk_n: u32, + next: Ref, } enum TileGroup { Instance(Instance), diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 7509062..3da40c9 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -11,6 +11,8 @@ build image.spv: glsl image.comp | scene.h build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h +build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h + build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h setup.h build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp index 82ccb8f..ce99005 100644 --- a/piet-gpu/shader/kernel1.comp +++ b/piet-gpu/shader/kernel1.comp @@ -7,8 +7,7 @@ // subgroups (or possibly both) to parallelize the reading of the input and // the computation of tilegroup intersection. // -// In addition, there are some features currently missing. One is the use of -// a bump allocator to extend the current fixed allocation. Another is support +// In addition, there are some features currently missing, such as support // for clipping. #version 450 @@ -46,8 +45,17 @@ void main() { StackElement stack[MAX_STACK]; uint stack_ix = 0; uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x; - TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); + TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE); uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size; + + // State for stroke references. + TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START); + ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4); + InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size); + uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_ALLOC - Instance_size; + uint stroke_chunk_n = 0; + uint stroke_n = 0; + vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX); PietItemRef root = PietItemRef(0); SimpleGroup group = PietItem_Group_read(root); @@ -60,9 +68,11 @@ void main() { bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX)) && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX)); bool is_group = false; + uint tag; if (hit) { PietItemRef item_ref = PietItem_index(group.items, tos.index); - is_group = PietItem_tag(item_ref) == PietItem_Group; + tag = PietItem_tag(item_ref); + is_group = tag == PietItem_Group; } if (hit && !is_group) { PietItemRef item_ref = PietItem_index(group.items, tos.index); @@ -70,13 +80,27 @@ void main() { if (tg_ref.offset > tg_limit) { // Allocation exceeded; do atomic bump alloc. uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC); - Jump jump = Jump(new_tg); + Jump jump = Jump(TileGroupRef(new_tg)); TileGroup_Jump_write(tg_ref, jump); tg_ref = TileGroupRef(new_tg); tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size; } TileGroup_Instance_write(tg_ref, ins); tg_ref.offset += TileGroup_size; + if (tag == PietItem_Poly) { + if (stroke_ref.offset > stroke_limit) { + uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC); + Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke))); + stroke_chunk_start = ChunkRef(new_stroke); + stroke_ref = InstanceRef(new_stroke + Chunk_size); + stroke_n += stroke_chunk_n; + stroke_chunk_n = 0; + stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size; + } + Instance_write(stroke_ref, ins); + stroke_chunk_n++; + stroke_ref.offset += Instance_size; + } } if (is_group) { PietItemRef item_ref = PietItem_index(group.items, tos.index); @@ -99,4 +123,10 @@ void main() { } } TileGroup_End_write(tg_ref); + + stroke_n += stroke_chunk_n; + if (stroke_n > 0) { + Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0))); + } + tilegroup[stroke_start.offset >> 2] = stroke_n; } diff --git a/piet-gpu/shader/kernel1.spv b/piet-gpu/shader/kernel1.spv index 9ac3593..8430d74 100644 Binary files a/piet-gpu/shader/kernel1.spv and b/piet-gpu/shader/kernel1.spv differ diff --git a/piet-gpu/shader/kernel2s.comp b/piet-gpu/shader/kernel2s.comp new file mode 100644 index 0000000..3eb2d00 --- /dev/null +++ b/piet-gpu/shader/kernel2s.comp @@ -0,0 +1,127 @@ +// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke +// (polyline) items in the scene and generates a list of segments for each, for +// each tile. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +layout(local_size_x = 32) in; + +layout(set = 0, binding = 0) readonly buffer SceneBuf { + uint[] scene; +}; + +layout(set = 0, binding = 1) buffer TilegroupBuf { + uint[] tilegroup; +}; + +layout(set = 0, binding = 2) buffer SegmentBuf { + uint[] segment; +}; + +layout(set = 0, binding = 3) buffer AllocBuf { + uint alloc; +}; + +#include "scene.h" +#include "tilegroup.h" +#include "segment.h" + +#include "setup.h" + +void main() { + uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x; + uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); + vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); + TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START); + uint stroke_n = tilegroup[stroke_start.offset >> 2]; + + TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size); + if (stroke_n > 0) { + ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4); + Chunk chunk = Chunk_read(chunk_ref); + InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size); + ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size)); + TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header)); + SegmentRef seg_ref = SegmentRef(0); + uint seg_limit = 0; + // Iterate through items; stroke_n holds count remaining. + while (true) { + if (chunk.chunk_n == 0) { + chunk_ref = chunk.next; + chunk = Chunk_read(chunk_ref); + stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size); + } + Instance ins = Instance_read(stroke_ref); + PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref)); + + // Process the stroke polyline item. + uint max_n_segs = poly.n_points - 1; + uint reserve = max_n_segs * Segment_size; + if (seg_ref.offset + reserve > seg_limit) { + // This is a heuristic to balance atomic bandwidth and utilization. + // The output always gets a contiguous allocation. We might use + // all, some, or none of the capacity. + uint capacity_bytes = stroke_n > 1 ? reserve * 2 + 128 : reserve; + seg_ref.offset = atomicAdd(alloc, capacity_bytes); + seg_limit = seg_ref.offset + capacity_bytes; + } + uint n_segs = 0; + vec2 start = Point_read(poly.points).xy; + for (uint j = 0; j < max_n_segs; j++) { + poly.points.offset += Point_size; + vec2 end = Point_read(poly.points).xy; + + // Process one segment. + + // This logic just tests for collision. What we probably want to do + // is a clipping algorithm like Liang-Barsky, and then store coords + // relative to the tile in f16. See also: + // https://tavianator.com/fast-branchless-raybounding-box-intersections/ + + // Also note that when we go to the fancy version, we want to compute + // the (horizontal projection of) the bounding box of the intersection + // once per tilegroup, so we can assign work to individual tiles. + + float a = end.y - start.y; + float b = start.x - end.x; + float c = -(a * start.x + b * start.y); + float half_width = 0.5 * poly.width; + // Tile boundaries padded by half-width. + float xmin = xy0.x - half_width; + float ymin = xy0.y - half_width; + float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width; + float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width; + float s00 = sign(b * ymin + a * xmin + c); + float s01 = sign(b * ymin + a * xmax + c); + float s10 = sign(b * ymax + a * xmin + c); + float s11 = sign(b * ymax + a * xmax + c); + // If bounding boxes intersect and not all four corners are on the same side, hit. + // Also note: this is designed to be false on NAN input. + if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax) + && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax) + && s00 * s01 + s00 * s10 + s00 * s11 < 3.0) + { + Segment seg = Segment(start, end); + Segment_write(Segment_index(seg_ref, n_segs), seg); + n_segs++; + } + + start = end; + } + ItemHeader_write(item_header, ItemHeader(n_segs, seg_ref)); + if (--stroke_n == 0) { + break; + } + seg_ref.offset += n_segs * Segment_size; + + stroke_ref.offset += Instance_size; + chunk.chunk_n--; + item_header.offset += ItemHeader_size; + } + } else { + // As an optimization, we could just write 0 for the size. + TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0))); + } +} diff --git a/piet-gpu/shader/kernel2s.spv b/piet-gpu/shader/kernel2s.spv new file mode 100644 index 0000000..7c7f48f Binary files /dev/null and b/piet-gpu/shader/kernel2s.spv differ diff --git a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp index ef3faef..fc4f9ea 100644 --- a/piet-gpu/shader/kernel3.comp +++ b/piet-gpu/shader/kernel3.comp @@ -16,16 +16,22 @@ layout(set = 0, binding = 1) buffer TilegroupBuf { uint[] tilegroup; }; -layout(set = 0, binding = 2) buffer PtclBuf { +// Used readonly +layout(set = 0, binding = 2) buffer SegmentBuf { + uint[] segment; +}; + +layout(set = 0, binding = 3) buffer PtclBuf { uint[] ptcl; }; -layout(set = 0, binding = 3) buffer AllocBuf { +layout(set = 0, binding = 4) buffer AllocBuf { uint alloc; }; #include "scene.h" #include "tilegroup.h" +#include "segment.h" #include "ptcl.h" #include "setup.h" @@ -45,17 +51,19 @@ void main() { uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); - TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); + TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE); CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size)); + while (true) { uint tg_tag = TileGroup_tag(tg_ref); if (tg_tag == TileGroup_End) { break; } if (tg_tag == TileGroup_Jump) { - tg_ref = TileGroupRef(TileGroup_Jump_read(tg_ref).new_ref); + tg_ref = TileGroup_Jump_read(tg_ref).new_ref; continue; } // Assume tg_tag is `Instance`, though there will be more cases. @@ -76,6 +84,22 @@ void main() { cmd_ref.offset += Cmd_size; } break; + case PietItem_Poly: + ItemHeader stroke_item = ItemHeader_read(stroke_th.items); + stroke_th.items.offset += ItemHeader_size; + if (stroke_item.n > 0) { + PietStrokePolyLine poly = PietItem_Poly_read(item_ref); + CmdStroke cmd = CmdStroke( + stroke_item.n, + stroke_item.segments.offset, + 0.5 * poly.width, + poly.rgba_color + ); + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Stroke_write(cmd_ref, cmd); + cmd_ref.offset += Cmd_size; + } + break; } tg_ref.offset += TileGroup_size; } diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv index cd56c48..f5b83bc 100644 Binary files a/piet-gpu/shader/kernel3.spv and b/piet-gpu/shader/kernel3.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index cdde198..931f28b 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -14,11 +14,17 @@ layout(set = 0, binding = 0) buffer PtclBuf { uint[] ptcl; }; -layout(set = 0, binding = 1) buffer ImageBuf { +// Used readonly +layout(set = 0, binding = 1) buffer SegmentBuf { + uint[] segment; +}; + +layout(set = 0, binding = 2) buffer ImageBuf { uint[] image; }; #include "ptcl.h" +#include "segment.h" #include "setup.h" @@ -41,10 +47,24 @@ void main() { CmdCircle circle = Cmd_Circle_read(cmd_ref); float r = length(xy + vec2(0.5, 0.5) - circle.center.xy); float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0); - vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color); + vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx; // TODO: sRGB rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); break; + case Cmd_Stroke: + CmdStroke stroke = Cmd_Stroke_read(cmd_ref); + float df = 1e9; + for (int i = 0; i < stroke.n_segs; i++) { + Segment seg = Segment_read(Segment_index(SegmentRef(stroke.seg_ref), i)); + vec2 line_vec = seg.end - seg.start; + vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df = min(df, length(line_vec * t - dpos)); + } + fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx; + alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0); + rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); + break; case Cmd_Jump: cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref); continue; diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index caef463..b931f23 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h index cc43594..8b62538 100644 --- a/piet-gpu/shader/ptcl.h +++ b/piet-gpu/shader/ptcl.h @@ -60,11 +60,13 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) { } struct CmdStroke { - float halfWidth; + uint n_segs; + uint seg_ref; + float half_width; uint rgba_color; }; -#define CmdStroke_size 8 +#define CmdStroke_size 16 CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) { return CmdStrokeRef(ref.offset + index * CmdStroke_size); @@ -184,16 +186,22 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { uint ix = ref.offset >> 2; uint raw0 = ptcl[ix + 0]; uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 3]; CmdStroke s; - s.halfWidth = uintBitsToFloat(raw0); - s.rgba_color = raw1; + s.n_segs = raw0; + s.seg_ref = raw1; + s.half_width = uintBitsToFloat(raw2); + s.rgba_color = raw3; return s; } void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = floatBitsToUint(s.halfWidth); - ptcl[ix + 1] = s.rgba_color; + ptcl[ix + 0] = s.n_segs; + ptcl[ix + 1] = s.seg_ref; + ptcl[ix + 2] = floatBitsToUint(s.half_width); + ptcl[ix + 3] = s.rgba_color; } CmdFill CmdFill_read(CmdFillRef ref) { diff --git a/piet-gpu/shader/segment.h b/piet-gpu/shader/segment.h new file mode 100644 index 0000000..517c115 --- /dev/null +++ b/piet-gpu/shader/segment.h @@ -0,0 +1,99 @@ +// Code auto-generated by piet-gpu-derive + +struct TileHeaderRef { + uint offset; +}; + +struct ItemHeaderRef { + uint offset; +}; + +struct SegmentRef { + uint offset; +}; + +struct TileHeader { + uint n; + ItemHeaderRef items; +}; + +#define TileHeader_size 8 + +TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) { + return TileHeaderRef(ref.offset + index * TileHeader_size); +} + +struct ItemHeader { + uint n; + SegmentRef segments; +}; + +#define ItemHeader_size 8 + +ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) { + return ItemHeaderRef(ref.offset + index * ItemHeader_size); +} + +struct Segment { + vec2 start; + vec2 end; +}; + +#define Segment_size 16 + +SegmentRef Segment_index(SegmentRef ref, uint index) { + return SegmentRef(ref.offset + index * Segment_size); +} + +TileHeader TileHeader_read(TileHeaderRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = segment[ix + 0]; + uint raw1 = segment[ix + 1]; + TileHeader s; + s.n = raw0; + s.items = ItemHeaderRef(raw1); + return s; +} + +void TileHeader_write(TileHeaderRef ref, TileHeader s) { + uint ix = ref.offset >> 2; + segment[ix + 0] = s.n; + segment[ix + 1] = s.items.offset; +} + +ItemHeader ItemHeader_read(ItemHeaderRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = segment[ix + 0]; + uint raw1 = segment[ix + 1]; + ItemHeader s; + s.n = raw0; + s.segments = SegmentRef(raw1); + return s; +} + +void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) { + uint ix = ref.offset >> 2; + segment[ix + 0] = s.n; + segment[ix + 1] = s.segments.offset; +} + +Segment Segment_read(SegmentRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = segment[ix + 0]; + uint raw1 = segment[ix + 1]; + uint raw2 = segment[ix + 2]; + uint raw3 = segment[ix + 3]; + Segment s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void Segment_write(SegmentRef ref, Segment s) { + uint ix = ref.offset >> 2; + segment[ix + 0] = floatBitsToUint(s.start.x); + segment[ix + 1] = floatBitsToUint(s.start.y); + segment[ix + 2] = floatBitsToUint(s.end.x); + segment[ix + 3] = floatBitsToUint(s.end.y); +} + diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 9ce2de6..a644dc0 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -15,6 +15,15 @@ #define TILEGROUP_INITIAL_ALLOC 1024 +// Quick note on layout of tilegroups (k1 output): in the base, +// there is a region of size TILEGROUP_STRIDE for each tilegroup. +// At offset 0 are the main instances, encoded with Jump. At offset +// TILEGROUP_STROKE_START are the stroke instances, encoded with +// Head and Link. +#define TILEGROUP_STRIDE 2048 +#define TILEGROUP_STROKE_START 1024 +#define TILEGROUP_STROKE_ALLOC 1024 + // TODO: compute all these #define WIDTH_IN_TILES 128 diff --git a/piet-gpu/shader/tilegroup.h b/piet-gpu/shader/tilegroup.h index 64b27d3..213ddc3 100644 --- a/piet-gpu/shader/tilegroup.h +++ b/piet-gpu/shader/tilegroup.h @@ -8,6 +8,10 @@ struct JumpRef { uint offset; }; +struct ChunkRef { + uint offset; +}; + struct TileGroupRef { uint offset; }; @@ -24,7 +28,7 @@ InstanceRef Instance_index(InstanceRef ref, uint index) { } struct Jump { - uint new_ref; + TileGroupRef new_ref; }; #define Jump_size 4 @@ -33,6 +37,17 @@ JumpRef Jump_index(JumpRef ref, uint index) { return JumpRef(ref.offset + index * Jump_size); } +struct Chunk { + uint chunk_n; + ChunkRef next; +}; + +#define Chunk_size 8 + +ChunkRef Chunk_index(ChunkRef ref, uint index) { + return ChunkRef(ref.offset + index * Chunk_size); +} + #define TileGroup_Instance 0 #define TileGroup_Jump 1 #define TileGroup_End 2 @@ -64,13 +79,29 @@ Jump Jump_read(JumpRef ref) { uint ix = ref.offset >> 2; uint raw0 = tilegroup[ix + 0]; Jump s; - s.new_ref = raw0; + s.new_ref = TileGroupRef(raw0); return s; } void Jump_write(JumpRef ref, Jump s) { uint ix = ref.offset >> 2; - tilegroup[ix + 0] = s.new_ref; + tilegroup[ix + 0] = s.new_ref.offset; +} + +Chunk Chunk_read(ChunkRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = tilegroup[ix + 0]; + uint raw1 = tilegroup[ix + 1]; + Chunk s; + s.chunk_n = raw0; + s.next = ChunkRef(raw1); + return s; +} + +void Chunk_write(ChunkRef ref, Chunk s) { + uint ix = ref.offset >> 2; + tilegroup[ix + 0] = s.chunk_n; + tilegroup[ix + 1] = s.next.offset; } uint TileGroup_tag(TileGroupRef ref) { diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs index 703e156..949ccc2 100644 --- a/piet-gpu/src/main.rs +++ b/piet-gpu/src/main.rs @@ -4,7 +4,7 @@ use std::path::Path; use rand::{Rng, RngCore}; -use piet::kurbo::{Circle, Point}; +use piet::kurbo::{BezPath, Circle, Line, Point, Vec2}; use piet::{Color, RenderContext}; use piet_gpu_hal::vulkan::VkInstance; @@ -22,13 +22,15 @@ const TILE_H: usize = 16; const WIDTH_IN_TILEGROUPS: usize = 4; const HEIGHT_IN_TILEGROUPS: usize = 96; -const TILEGROUP_INITIAL_ALLOC: usize = 1024; +const TILEGROUP_STRIDE: usize = 2048; -const WIDTH_IN_TILES: usize = 124; +const WIDTH_IN_TILES: usize = 128; const HEIGHT_IN_TILES: usize = 96; const PTCL_INITIAL_ALLOC: usize = 1024; -const N_CIRCLES: usize = 10_000; +const K2_PER_TILE_SIZE: usize = 8; + +const N_CIRCLES: usize = 1; fn render_scene(rc: &mut impl RenderContext) { let mut rng = rand::thread_rng(); @@ -42,6 +44,29 @@ fn render_scene(rc: &mut impl RenderContext) { let circle = Circle::new(center, radius); rc.fill(circle, &color); } + rc.stroke( + Line::new((100.0, 100.0), (200.0, 150.0)), + &Color::WHITE, + 5.0, + ); + render_cardioid(rc); +} + +fn render_cardioid(rc: &mut impl RenderContext) { + let n = 100; + let dth = std::f64::consts::PI * 2.0 / (n as f64); + let center = Point::new(1024.0, 768.0); + let r = 750.0; + let mut path = BezPath::new(); + for i in 1..n { + let p0 = center + Vec2::from_angle(i as f64 * dth) * r; + let p1 = center + Vec2::from_angle(((i * 2) % n) as f64 * dth) * r; + rc.fill(&Circle::new(p0, 8.0), &Color::WHITE); + path.move_to(p0); + path.line_to(p1); + //rc.stroke(Line::new(p0, p1), &Color::BLACK, 2.0); + } + rc.stroke(&path, &Color::BLACK, 2.0); } #[allow(unused)] @@ -80,7 +105,8 @@ fn main() { .unwrap(); device.write_buffer(&scene_buf, &scene).unwrap(); let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev).unwrap(); - let ptcl_buf = device.create_buffer(12 * 1024 * 4096, dev).unwrap(); + let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev).unwrap(); + let segment_buf = device.create_buffer(64 * 1024 * 1024, dev).unwrap(); let image_buf = device .create_buffer((WIDTH * HEIGHT * 4) as u64, host) .unwrap(); @@ -90,7 +116,7 @@ fn main() { let k1_alloc_buf_host = device.create_buffer(4, host).unwrap(); let k1_alloc_buf_dev = device.create_buffer(4, dev).unwrap(); - let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_INITIAL_ALLOC; + let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE; device .write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32]) .unwrap(); @@ -103,6 +129,21 @@ fn main() { ) .unwrap(); + let k2s_alloc_buf_host = device.create_buffer(4, host).unwrap(); + let k2s_alloc_buf_dev = device.create_buffer(4, dev).unwrap(); + let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE; + device + .write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32]) + .unwrap(); + let k2s_code = include_bytes!("../shader/kernel2s.spv"); + let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4).unwrap(); + let k2s_ds = device + .create_descriptor_set( + &k2s_pipeline, + &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev], + ) + .unwrap(); + let k3_alloc_buf_host = device.create_buffer(4, host).unwrap(); let k3_alloc_buf_dev = device.create_buffer(4, dev).unwrap(); let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; @@ -110,24 +151,32 @@ fn main() { .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32]) .unwrap(); let k3_code = include_bytes!("../shader/kernel3.spv"); - let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 4).unwrap(); + let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 5).unwrap(); let k3_ds = device .create_descriptor_set( &k3_pipeline, - &[&scene_dev, &tilegroup_buf, &ptcl_buf, &k3_alloc_buf_dev], + &[ + &scene_dev, + &tilegroup_buf, + &segment_buf, + &ptcl_buf, + &k3_alloc_buf_dev, + ], ) .unwrap(); let k4_code = include_bytes!("../shader/kernel4.spv"); - let pipeline = device.create_simple_compute_pipeline(k4_code, 2).unwrap(); - let descriptor_set = device - .create_descriptor_set(&pipeline, &[&ptcl_buf, &image_dev]) + let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3).unwrap(); + let k4_ds = device + .create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &image_dev]) .unwrap(); - let query_pool = device.create_query_pool(4).unwrap(); + + let query_pool = device.create_query_pool(5).unwrap(); let mut cmd_buf = device.create_cmd_buf().unwrap(); cmd_buf.begin(); cmd_buf.copy_buffer(&scene_buf, &scene_dev); cmd_buf.copy_buffer(&k1_alloc_buf_host, &k1_alloc_buf_dev); + cmd_buf.copy_buffer(&k2s_alloc_buf_host, &k2s_alloc_buf_dev); cmd_buf.copy_buffer(&k3_alloc_buf_host, &k3_alloc_buf_dev); cmd_buf.clear_buffer(&tilegroup_buf); cmd_buf.clear_buffer(&ptcl_buf); @@ -141,36 +190,47 @@ fn main() { cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); cmd_buf.dispatch( - &k3_pipeline, - &k3_ds, + &k2s_pipeline, + &k2s_ds, ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1), ); cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.memory_barrier(); cmd_buf.dispatch( - &pipeline, - &descriptor_set, - ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), + &k3_pipeline, + &k3_ds, + ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1), ); cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &k4_pipeline, + &k4_ds, + ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), + ); + cmd_buf.write_timestamp(&query_pool, 4); + cmd_buf.memory_barrier(); cmd_buf.copy_buffer(&image_dev, &image_buf); cmd_buf.finish(); device.run_cmd_buf(&cmd_buf).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap(); println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); println!( - "Kernel 3 time: {:.3}ms", + "Kernel 2 time: {:.3}ms", (timestamps[1] - timestamps[0]) * 1e3 ); println!( - "Render time: {:.3}ms", + "Kernel 3 time: {:.3}ms", (timestamps[2] - timestamps[1]) * 1e3 ); + println!( + "Render time: {:.3}ms", + (timestamps[3] - timestamps[2]) * 1e3 + ); /* let mut k1_data: Vec = Default::default(); - device.read_buffer(&ptcl_buf, &mut k1_data).unwrap(); + device.read_buffer(&segment_buf, &mut k1_data).unwrap(); dump_k1_data(&k1_data); */ diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index eb67132..f5b6897 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -259,7 +259,7 @@ fn flatten_shape( } _ => (), } - println!("{:?}", el); + //println!("{:?}", el); }); let n_points = points.len() as u32; let points_ref = points.encode(encoder).transmute();