Implement stroked polylines

This version seems to work but the allocation of segments has low
utilization. Probably best to allocate in chunks rather than try to
make them contiguous.
This commit is contained in:
Raph Levien 2020-04-28 11:02:19 -07:00
parent 55e35dd879
commit cb06b1bc3d
20 changed files with 502 additions and 44 deletions

View file

@ -1,5 +1,6 @@
pub mod encoder;
pub mod ptcl;
pub mod scene;
pub mod segment;
pub mod test;
pub mod tilegroup;

View file

@ -6,6 +6,7 @@ fn main() {
match mod_name.as_str() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
"ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
"test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
_ => println!("Oops, unknown module name"),

View file

@ -13,8 +13,10 @@ piet_gpu! {
end: [f32; 2],
}
struct CmdStroke {
// In existing code, this is f16. Should we have support?
halfWidth: f32,
n_segs: u32,
// Should be Ref<Segment> if we had cross-module references.
seg_ref: u32,
half_width: f32,
rgba_color: u32,
}
struct CmdFill {

View file

@ -0,0 +1,27 @@
use piet_gpu_derive::piet_gpu;
// Structures representing segments for stroke/fill items.
// This module is a piet_gpu! DSL description; the derive macro generates
// matching GLSL (segment.h) and Rust encoders. #[gpu_write] additionally
// emits GLSL *_write functions so shaders can produce these structures.
piet_gpu! {
#[gpu_write]
mod segment {
// Per-tile header: one per tile, pointing at that tile's item headers.
struct TileHeader {
// Number of ItemHeaders for this tile.
n: u32,
// Reference (byte offset in the segment buffer) to the first ItemHeader.
items: Ref<ItemHeader>,
}
// Note: this is only suitable for strokes, fills require backdrop.
struct ItemHeader {
// Number of segments for this item within the tile.
n: u32,
// Reference to a contiguous run of `n` Segments.
segments: Ref<Segment>,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct Segment {
start: [f32; 2],
end: [f32; 2],
}
}
}

View file

@ -1,5 +1,18 @@
use piet_gpu_derive::piet_gpu;
// Structures representing tilegroup instances (output of kernel 1).
// There are three outputs: the main instances, the stroke instances,
// and the fill instances. All three are conceptually a list of
// instances, but the encoding is slightly different. The first is
// encoded with Instance, Jump, and End. The other two are encoded
// as a linked list of Chunk.
// The motivation for the difference is that the first requires fewer
// registers to track state, but the second contains information that
// is useful up front for doing dynamic allocation in kernel 2, as
// well as increasing read parallelism; the "jump" approach really is
// geared to sequential reading.
piet_gpu! {
#[gpu_write]
mod tilegroup {
@ -11,7 +24,11 @@ piet_gpu! {
offset: [f32; 2],
}
struct Jump {
new_ref: u32,
new_ref: Ref<TileGroup>,
}
struct Chunk {
chunk_n: u32,
next: Ref<Chunk>,
}
enum TileGroup {
Instance(Instance),

View file

@ -11,6 +11,8 @@ build image.spv: glsl image.comp | scene.h
build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h

View file

@ -7,8 +7,7 @@
// subgroups (or possibly both) to parallelize the reading of the input and
// the computation of tilegroup intersection.
//
// In addition, there are some features currently missing. One is the use of
// a bump allocator to extend the current fixed allocation. Another is support
// In addition, there are some features currently missing, such as support
// for clipping.
#version 450
@ -46,8 +45,17 @@ void main() {
StackElement stack[MAX_STACK];
uint stack_ix = 0;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
// State for stroke references.
TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_ALLOC - Instance_size;
uint stroke_chunk_n = 0;
uint stroke_n = 0;
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
PietItemRef root = PietItemRef(0);
SimpleGroup group = PietItem_Group_read(root);
@ -60,9 +68,11 @@ void main() {
bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
&& max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
bool is_group = false;
uint tag;
if (hit) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
is_group = PietItem_tag(item_ref) == PietItem_Group;
tag = PietItem_tag(item_ref);
is_group = tag == PietItem_Group;
}
if (hit && !is_group) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
@ -70,13 +80,27 @@ void main() {
if (tg_ref.offset > tg_limit) {
// Allocation exceeded; do atomic bump alloc.
uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
Jump jump = Jump(new_tg);
Jump jump = Jump(TileGroupRef(new_tg));
TileGroup_Jump_write(tg_ref, jump);
tg_ref = TileGroupRef(new_tg);
tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
}
TileGroup_Instance_write(tg_ref, ins);
tg_ref.offset += TileGroup_size;
if (tag == PietItem_Poly) {
if (stroke_ref.offset > stroke_limit) {
uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
stroke_chunk_start = ChunkRef(new_stroke);
stroke_ref = InstanceRef(new_stroke + Chunk_size);
stroke_n += stroke_chunk_n;
stroke_chunk_n = 0;
stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
}
Instance_write(stroke_ref, ins);
stroke_chunk_n++;
stroke_ref.offset += Instance_size;
}
}
if (is_group) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
@ -99,4 +123,10 @@ void main() {
}
}
TileGroup_End_write(tg_ref);
stroke_n += stroke_chunk_n;
if (stroke_n > 0) {
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
}
tilegroup[stroke_start.offset >> 2] = stroke_n;
}

Binary file not shown.

View file

@ -0,0 +1,127 @@
// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "setup.h"
// Entry point: one invocation per tile. Walks the per-tilegroup stroke
// instance list produced by kernel 1 (a linked list of Chunks) and, for
// each stroke item, writes the segments that conservatively hit this
// tile, plus a TileHeader/ItemHeader index so kernel 3/4 can find them.
void main() {
uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
// Several tiles share one tilegroup; recover the tilegroup this tile is in.
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
// Top-left corner of this tile in pixel coordinates.
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
// Kernel 1 wrote the stroke list at a fixed offset inside the tilegroup slot:
// first a u32 total count, then the head Chunk (see tilegroup layout in setup.h).
TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
uint stroke_n = tilegroup[stroke_start.offset >> 2];
TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
if (stroke_n > 0) {
// The head chunk sits right after the u32 count; instances follow the chunk.
ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
Chunk chunk = Chunk_read(chunk_ref);
InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
// Bump-allocate one ItemHeader per stroke item up front.
ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
// Segment output cursor; (0, 0) forces an allocation on first use.
SegmentRef seg_ref = SegmentRef(0);
uint seg_limit = 0;
// Iterate through items; stroke_n holds count remaining.
while (true) {
// Exhausted the current chunk: follow the linked list to the next one.
if (chunk.chunk_n == 0) {
chunk_ref = chunk.next;
chunk = Chunk_read(chunk_ref);
stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
}
Instance ins = Instance_read(stroke_ref);
PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
// Process the stroke polyline item.
uint max_n_segs = poly.n_points - 1;
uint reserve = max_n_segs * Segment_size;
// Ensure the remaining allocation can hold the worst case for this item.
if (seg_ref.offset + reserve > seg_limit) {
// This is a heuristic to balance atomic bandwidth and utilization.
// The output always gets a contiguous allocation. We might use
// all, some, or none of the capacity.
uint capacity_bytes = stroke_n > 1 ? reserve * 2 + 128 : reserve;
seg_ref.offset = atomicAdd(alloc, capacity_bytes);
seg_limit = seg_ref.offset + capacity_bytes;
}
uint n_segs = 0;
vec2 start = Point_read(poly.points).xy;
for (uint j = 0; j < max_n_segs; j++) {
poly.points.offset += Point_size;
vec2 end = Point_read(poly.points).xy;
// Process one segment.
// This logic just tests for collision. What we probably want to do
// is a clipping algorithm like Liang-Barsky, and then store coords
// relative to the tile in f16. See also:
// https://tavianator.com/fast-branchless-raybounding-box-intersections/
// Also note that when we go to the fancy version, we want to compute
// the (horizontal projection of) the bounding box of the intersection
// once per tilegroup, so we can assign work to individual tiles.
// Implicit line equation a*x + b*y + c = 0 for the segment.
float a = end.y - start.y;
float b = start.x - end.x;
float c = -(a * start.x + b * start.y);
float half_width = 0.5 * poly.width;
// Tile boundaries padded by half-width.
float xmin = xy0.x - half_width;
float ymin = xy0.y - half_width;
float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
// Signs of the line equation at the four padded tile corners; if all
// four agree, the infinite line misses the padded tile entirely.
float s00 = sign(b * ymin + a * xmin + c);
float s01 = sign(b * ymin + a * xmax + c);
float s10 = sign(b * ymax + a * xmin + c);
float s11 = sign(b * ymax + a * xmax + c);
// If bounding boxes intersect and not all four corners are on the same side, hit.
// Also note: this is designed to be false on NAN input.
if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
&& max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
&& s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
{
Segment seg = Segment(start, end);
Segment_write(Segment_index(seg_ref, n_segs), seg);
n_segs++;
}
start = end;
}
// Record how many segments this item actually produced and where.
ItemHeader_write(item_header, ItemHeader(n_segs, seg_ref));
if (--stroke_n == 0) {
break;
}
// Advance output cursor past the segments used; unused capacity is wasted
// (see commit message: utilization is known to be low for now).
seg_ref.offset += n_segs * Segment_size;
stroke_ref.offset += Instance_size;
chunk.chunk_n--;
item_header.offset += ItemHeader_size;
}
} else {
// As an optimization, we could just write 0 for the size.
TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
}
}

Binary file not shown.

View file

@ -16,16 +16,22 @@ layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer PtclBuf {
// Used readonly
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 3) buffer PtclBuf {
uint[] ptcl;
};
layout(set = 0, binding = 3) buffer AllocBuf {
layout(set = 0, binding = 4) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "ptcl.h"
#include "setup.h"
@ -45,17 +51,19 @@ void main() {
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
while (true) {
uint tg_tag = TileGroup_tag(tg_ref);
if (tg_tag == TileGroup_End) {
break;
}
if (tg_tag == TileGroup_Jump) {
tg_ref = TileGroupRef(TileGroup_Jump_read(tg_ref).new_ref);
tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
continue;
}
// Assume tg_tag is `Instance`, though there will be more cases.
@ -76,6 +84,22 @@ void main() {
cmd_ref.offset += Cmd_size;
}
break;
case PietItem_Poly:
ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
stroke_th.items.offset += ItemHeader_size;
if (stroke_item.n > 0) {
PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
CmdStroke cmd = CmdStroke(
stroke_item.n,
stroke_item.segments.offset,
0.5 * poly.width,
poly.rgba_color
);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
}
break;
}
tg_ref.offset += TileGroup_size;
}

Binary file not shown.

View file

@ -14,11 +14,17 @@ layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
};
layout(set = 0, binding = 1) buffer ImageBuf {
// Used readonly
layout(set = 0, binding = 1) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 2) buffer ImageBuf {
uint[] image;
};
#include "ptcl.h"
#include "segment.h"
#include "setup.h"
@ -41,10 +47,24 @@ void main() {
CmdCircle circle = Cmd_Circle_read(cmd_ref);
float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
// TODO: sRGB
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
break;
case Cmd_Stroke:
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df = 1e9;
for (int i = 0; i < stroke.n_segs; i++) {
Segment seg = Segment_read(Segment_index(SegmentRef(stroke.seg_ref), i));
vec2 line_vec = seg.end - seg.start;
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df = min(df, length(line_vec * t - dpos));
}
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
continue;

Binary file not shown.

View file

@ -60,11 +60,13 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
}
struct CmdStroke {
float halfWidth;
uint n_segs;
uint seg_ref;
float half_width;
uint rgba_color;
};
#define CmdStroke_size 8
#define CmdStroke_size 16
CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
return CmdStrokeRef(ref.offset + index * CmdStroke_size);
@ -184,16 +186,22 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw3 = ptcl[ix + 3];
CmdStroke s;
s.halfWidth = uintBitsToFloat(raw0);
s.rgba_color = raw1;
s.n_segs = raw0;
s.seg_ref = raw1;
s.half_width = uintBitsToFloat(raw2);
s.rgba_color = raw3;
return s;
}
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.halfWidth);
ptcl[ix + 1] = s.rgba_color;
ptcl[ix + 0] = s.n_segs;
ptcl[ix + 1] = s.seg_ref;
ptcl[ix + 2] = floatBitsToUint(s.half_width);
ptcl[ix + 3] = s.rgba_color;
}
CmdFill CmdFill_read(CmdFillRef ref) {

99
piet-gpu/shader/segment.h Normal file
View file

@ -0,0 +1,99 @@
// Code auto-generated by piet-gpu-derive
// NOTE(review): do not hand-edit; regenerate from piet-gpu-types/src/segment.rs
// (any comments added here will be lost on regeneration).
// All *Ref types are byte offsets into the `segment[]` storage buffer;
// reads/writes shift by 2 to convert byte offsets to u32 word indices.
struct TileHeaderRef {
uint offset;
};
struct ItemHeaderRef {
uint offset;
};
struct SegmentRef {
uint offset;
};
// Per-tile header: count and reference to that tile's item headers.
struct TileHeader {
uint n;
ItemHeaderRef items;
};
#define TileHeader_size 8
TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
return TileHeaderRef(ref.offset + index * TileHeader_size);
}
// Per-item header: segment count and reference to a contiguous segment run.
struct ItemHeader {
uint n;
SegmentRef segments;
};
#define ItemHeader_size 8
ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
return ItemHeaderRef(ref.offset + index * ItemHeader_size);
}
// A line segment in pixel coordinates (f32 for now; see segment.rs TODO re f16).
struct Segment {
vec2 start;
vec2 end;
};
#define Segment_size 16
SegmentRef Segment_index(SegmentRef ref, uint index) {
return SegmentRef(ref.offset + index * Segment_size);
}
TileHeader TileHeader_read(TileHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
TileHeader s;
s.n = raw0;
s.items = ItemHeaderRef(raw1);
return s;
}
void TileHeader_write(TileHeaderRef ref, TileHeader s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.n;
segment[ix + 1] = s.items.offset;
}
ItemHeader ItemHeader_read(ItemHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
ItemHeader s;
s.n = raw0;
s.segments = SegmentRef(raw1);
return s;
}
void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.n;
segment[ix + 1] = s.segments.offset;
}
Segment Segment_read(SegmentRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
uint raw2 = segment[ix + 2];
uint raw3 = segment[ix + 3];
Segment s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void Segment_write(SegmentRef ref, Segment s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = floatBitsToUint(s.start.x);
segment[ix + 1] = floatBitsToUint(s.start.y);
segment[ix + 2] = floatBitsToUint(s.end.x);
segment[ix + 3] = floatBitsToUint(s.end.y);
}

View file

@ -15,6 +15,15 @@
#define TILEGROUP_INITIAL_ALLOC 1024
// Quick note on layout of tilegroups (k1 output): in the base,
// there is a region of size TILEGROUP_STRIDE for each tilegroup.
// At offset 0 are the main instances, encoded with Jump. At offset
// TILEGROUP_STROKE_START are the stroke instances, encoded with
// Head and Link.
#define TILEGROUP_STRIDE 2048
#define TILEGROUP_STROKE_START 1024
#define TILEGROUP_STROKE_ALLOC 1024
// TODO: compute all these
#define WIDTH_IN_TILES 128

View file

@ -8,6 +8,10 @@ struct JumpRef {
uint offset;
};
struct ChunkRef {
uint offset;
};
struct TileGroupRef {
uint offset;
};
@ -24,7 +28,7 @@ InstanceRef Instance_index(InstanceRef ref, uint index) {
}
struct Jump {
uint new_ref;
TileGroupRef new_ref;
};
#define Jump_size 4
@ -33,6 +37,17 @@ JumpRef Jump_index(JumpRef ref, uint index) {
return JumpRef(ref.offset + index * Jump_size);
}
struct Chunk {
uint chunk_n;
ChunkRef next;
};
#define Chunk_size 8
ChunkRef Chunk_index(ChunkRef ref, uint index) {
return ChunkRef(ref.offset + index * Chunk_size);
}
#define TileGroup_Instance 0
#define TileGroup_Jump 1
#define TileGroup_End 2
@ -64,13 +79,29 @@ Jump Jump_read(JumpRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tilegroup[ix + 0];
Jump s;
s.new_ref = raw0;
s.new_ref = TileGroupRef(raw0);
return s;
}
void Jump_write(JumpRef ref, Jump s) {
uint ix = ref.offset >> 2;
tilegroup[ix + 0] = s.new_ref;
tilegroup[ix + 0] = s.new_ref.offset;
}
Chunk Chunk_read(ChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tilegroup[ix + 0];
uint raw1 = tilegroup[ix + 1];
Chunk s;
s.chunk_n = raw0;
s.next = ChunkRef(raw1);
return s;
}
void Chunk_write(ChunkRef ref, Chunk s) {
uint ix = ref.offset >> 2;
tilegroup[ix + 0] = s.chunk_n;
tilegroup[ix + 1] = s.next.offset;
}
uint TileGroup_tag(TileGroupRef ref) {

View file

@ -4,7 +4,7 @@ use std::path::Path;
use rand::{Rng, RngCore};
use piet::kurbo::{Circle, Point};
use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
use piet::{Color, RenderContext};
use piet_gpu_hal::vulkan::VkInstance;
@ -22,13 +22,15 @@ const TILE_H: usize = 16;
const WIDTH_IN_TILEGROUPS: usize = 4;
const HEIGHT_IN_TILEGROUPS: usize = 96;
const TILEGROUP_INITIAL_ALLOC: usize = 1024;
const TILEGROUP_STRIDE: usize = 2048;
const WIDTH_IN_TILES: usize = 124;
const WIDTH_IN_TILES: usize = 128;
const HEIGHT_IN_TILES: usize = 96;
const PTCL_INITIAL_ALLOC: usize = 1024;
const N_CIRCLES: usize = 10_000;
const K2_PER_TILE_SIZE: usize = 8;
const N_CIRCLES: usize = 1;
fn render_scene(rc: &mut impl RenderContext) {
let mut rng = rand::thread_rng();
@ -42,6 +44,29 @@ fn render_scene(rc: &mut impl RenderContext) {
let circle = Circle::new(center, radius);
rc.fill(circle, &color);
}
rc.stroke(
Line::new((100.0, 100.0), (200.0, 150.0)),
&Color::WHITE,
5.0,
);
render_cardioid(rc);
}
/// Draws a cardioid as the envelope of chords: for each point `i` on a
/// circle, a chord is drawn to the point at angle `2*i` (mod n). The
/// chord endpoints are also marked with small white circles.
fn render_cardioid(rc: &mut impl RenderContext) {
    let n_points = 100;
    let step = std::f64::consts::PI * 2.0 / (n_points as f64);
    let origin = Point::new(1024.0, 768.0);
    let radius = 750.0;
    let mut chords = BezPath::new();
    for ix in 1..n_points {
        // Chord from angle ix*step to angle (2*ix mod n)*step.
        let p_from = origin + Vec2::from_angle(ix as f64 * step) * radius;
        let p_to = origin + Vec2::from_angle(((ix * 2) % n_points) as f64 * step) * radius;
        rc.fill(&Circle::new(p_from, 8.0), &Color::WHITE);
        chords.move_to(p_from);
        chords.line_to(p_to);
        //rc.stroke(Line::new(p_from, p_to), &Color::BLACK, 2.0);
    }
    rc.stroke(&chords, &Color::BLACK, 2.0);
}
#[allow(unused)]
@ -80,7 +105,8 @@ fn main() {
.unwrap();
device.write_buffer(&scene_buf, &scene).unwrap();
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev).unwrap();
let ptcl_buf = device.create_buffer(12 * 1024 * 4096, dev).unwrap();
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev).unwrap();
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev).unwrap();
let image_buf = device
.create_buffer((WIDTH * HEIGHT * 4) as u64, host)
.unwrap();
@ -90,7 +116,7 @@ fn main() {
let k1_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k1_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_INITIAL_ALLOC;
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
device
.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])
.unwrap();
@ -103,6 +129,21 @@ fn main() {
)
.unwrap();
let k2s_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k2s_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
.unwrap();
let k2s_code = include_bytes!("../shader/kernel2s.spv");
let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4).unwrap();
let k2s_ds = device
.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
)
.unwrap();
let k3_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k3_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
@ -110,24 +151,32 @@ fn main() {
.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
.unwrap();
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 4).unwrap();
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 5).unwrap();
let k3_ds = device
.create_descriptor_set(
&k3_pipeline,
&[&scene_dev, &tilegroup_buf, &ptcl_buf, &k3_alloc_buf_dev],
&[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
)
.unwrap();
let k4_code = include_bytes!("../shader/kernel4.spv");
let pipeline = device.create_simple_compute_pipeline(k4_code, 2).unwrap();
let descriptor_set = device
.create_descriptor_set(&pipeline, &[&ptcl_buf, &image_dev])
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3).unwrap();
let k4_ds = device
.create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &image_dev])
.unwrap();
let query_pool = device.create_query_pool(4).unwrap();
let query_pool = device.create_query_pool(5).unwrap();
let mut cmd_buf = device.create_cmd_buf().unwrap();
cmd_buf.begin();
cmd_buf.copy_buffer(&scene_buf, &scene_dev);
cmd_buf.copy_buffer(&k1_alloc_buf_host, &k1_alloc_buf_dev);
cmd_buf.copy_buffer(&k2s_alloc_buf_host, &k2s_alloc_buf_dev);
cmd_buf.copy_buffer(&k3_alloc_buf_host, &k3_alloc_buf_dev);
cmd_buf.clear_buffer(&tilegroup_buf);
cmd_buf.clear_buffer(&ptcl_buf);
@ -141,36 +190,47 @@ fn main() {
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&k3_pipeline,
&k3_ds,
&k2s_pipeline,
&k2s_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&pipeline,
&descriptor_set,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
&k3_pipeline,
&k3_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&k4_pipeline,
&k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.copy_buffer(&image_dev, &image_buf);
cmd_buf.finish();
device.run_cmd_buf(&cmd_buf).unwrap();
let timestamps = device.reap_query_pool(query_pool).unwrap();
println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
println!(
"Kernel 3 time: {:.3}ms",
"Kernel 2 time: {:.3}ms",
(timestamps[1] - timestamps[0]) * 1e3
);
println!(
"Render time: {:.3}ms",
"Kernel 3 time: {:.3}ms",
(timestamps[2] - timestamps[1]) * 1e3
);
println!(
"Render time: {:.3}ms",
(timestamps[3] - timestamps[2]) * 1e3
);
/*
let mut k1_data: Vec<u32> = Default::default();
device.read_buffer(&ptcl_buf, &mut k1_data).unwrap();
device.read_buffer(&segment_buf, &mut k1_data).unwrap();
dump_k1_data(&k1_data);
*/

View file

@ -259,7 +259,7 @@ fn flatten_shape(
}
_ => (),
}
println!("{:?}", el);
//println!("{:?}", el);
});
let n_points = points.len() as u32;
let points_ref = points.encode(encoder).transmute();