Delete old-style kernels and buffers

Pave the way for the coarse raster pass to write to the ptcl buffer.
2025-01-10 12:41:30 +11:00 · 2020-05-15 15:20:25 -07:00 · 2020-05-15 15:20:25 -07:00 · 1240da3870
parent 3a6428238b
commit 1240da3870
20 changed files with 91 additions and 965 deletions
--- a/piet-gpu-types/src/fill_seg.rs
+++ b/piet-gpu-types/src/fill_seg.rs
@ -1,37 +0,0 @@
 use piet_gpu_derive::piet_gpu;
 // Structures representing segments for fill items.
 // There is some cut'n'paste here from stroke segments, which can be
 // traced to the fact that buffers in GLSL are basically global.
 // Maybe there's a way to address that, but in the meantime living
 // with the duplication is easiest.
 piet_gpu! {
    #[gpu_write]
    mod fill_seg {
        struct FillTileHeader {
            n: u32,
            items: Ref<FillItemHeader>,
        }
        struct FillItemHeader {
            backdrop: i32,
            segments: Ref<FillSegChunk>,
        }
        // TODO: strongly consider using f16. If so, these would be
        // relative to the tile. We're doing f32 for now to minimize
        // divergence from piet-metal originals.
        struct FillSegment {
            start: [f32; 2],
            end: [f32; 2],
        }
        struct FillSegChunk {
            n: u32,
            next: Ref<FillSegChunk>,
            // Segments follow (could represent this as a variable sized array).
        }
    }
 }
--- a/piet-gpu-types/src/lib.rs
+++ b/piet-gpu-types/src/lib.rs
@ -3,10 +3,8 @@
 pub mod annotated;
 pub mod bins;
 pub mod encoder;
 pub mod fill_seg;
 pub mod ptcl;
 pub mod scene;
 pub mod segment;
 pub mod state;
 pub mod test;
 pub mod tilegroup;
--- a/piet-gpu-types/src/main.rs
+++ b/piet-gpu-types/src/main.rs
@ -9,8 +9,6 @@ fn main() {
        "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
        "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
        "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
        "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
        "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
        "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
        "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
        _ => println!("Oops, unknown module name"),
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@ -13,13 +13,13 @@ piet_gpu! {
            end: [f32; 2],
        }
        struct CmdStroke {
-            // Should be Ref<SegChunk> if we had cross-module references.
+            // Should be Ref<SegChunk>
            seg_ref: u32,
            half_width: f32,
            rgba_color: u32,
        }
        struct CmdFill {
-            // Should be Ref<FillSegChunk> if we had cross-module references.
+            // Should be Ref<FillSegChunk>
            seg_ref: u32,
            backdrop: i32,
            rgba_color: u32,
@ -51,5 +51,19 @@ piet_gpu! {
            Jump(CmdJump),
            Bail,
        }
        // TODO: strongly consider using f16. If so, these would be
        // relative to the tile. We're doing f32 for now to minimize
        // divergence from piet-metal originals.
        struct Segment {
            start: [f32; 2],
            end: [f32; 2],
        }
        struct SegChunk {
            n: u32,
            next: Ref<SegChunk>,
            // Segments follow (could represent this as a variable sized array).
        }
    }
 }
--- a/piet-gpu-types/src/segment.rs
+++ b/piet-gpu-types/src/segment.rs
@ -1,32 +0,0 @@
 use piet_gpu_derive::piet_gpu;
 // Structures representing segments for stroke/fill items.
 piet_gpu! {
    #[gpu_write]
    mod segment {
        struct TileHeader {
            n: u32,
            items: Ref<ItemHeader>,
        }
        // Note: this is only suitable for strokes, fills require backdrop.
        struct ItemHeader {
            segments: Ref<SegChunk>,
        }
        // TODO: strongly consider using f16. If so, these would be
        // relative to the tile. We're doing f32 for now to minimize
        // divergence from piet-metal originals.
        struct Segment {
            start: [f32; 2],
            end: [f32; 2],
        }
        struct SegChunk {
            n: u32,
            next: Ref<SegChunk>,
            // Segments follow (could represent this as a variable sized array).
        }
    }
 }
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -9,19 +9,11 @@ rule glsl
 build image.spv: glsl image.comp | scene.h
 build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
 build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
 build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
 build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h
 build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
 build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
--- a/piet-gpu/shader/fill_seg.h
+++ b/piet-gpu/shader/fill_seg.h
@ -1,130 +0,0 @@
 // Code auto-generated by piet-gpu-derive
 struct FillTileHeaderRef {
    uint offset;
 };
 struct FillItemHeaderRef {
    uint offset;
 };
 struct FillSegmentRef {
    uint offset;
 };
 struct FillSegChunkRef {
    uint offset;
 };
 struct FillTileHeader {
    uint n;
    FillItemHeaderRef items;
 };
 #define FillTileHeader_size 8
 FillTileHeaderRef FillTileHeader_index(FillTileHeaderRef ref, uint index) {
    return FillTileHeaderRef(ref.offset + index * FillTileHeader_size);
 }
 struct FillItemHeader {
    int backdrop;
    FillSegChunkRef segments;
 };
 #define FillItemHeader_size 8
 FillItemHeaderRef FillItemHeader_index(FillItemHeaderRef ref, uint index) {
    return FillItemHeaderRef(ref.offset + index * FillItemHeader_size);
 }
 struct FillSegment {
    vec2 start;
    vec2 end;
 };
 #define FillSegment_size 16
 FillSegmentRef FillSegment_index(FillSegmentRef ref, uint index) {
    return FillSegmentRef(ref.offset + index * FillSegment_size);
 }
 struct FillSegChunk {
    uint n;
    FillSegChunkRef next;
 };
 #define FillSegChunk_size 8
 FillSegChunkRef FillSegChunk_index(FillSegChunkRef ref, uint index) {
    return FillSegChunkRef(ref.offset + index * FillSegChunk_size);
 }
 FillTileHeader FillTileHeader_read(FillTileHeaderRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = fill_seg[ix + 0];
    uint raw1 = fill_seg[ix + 1];
    FillTileHeader s;
    s.n = raw0;
    s.items = FillItemHeaderRef(raw1);
    return s;
 }
 void FillTileHeader_write(FillTileHeaderRef ref, FillTileHeader s) {
    uint ix = ref.offset >> 2;
    fill_seg[ix + 0] = s.n;
    fill_seg[ix + 1] = s.items.offset;
 }
 FillItemHeader FillItemHeader_read(FillItemHeaderRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = fill_seg[ix + 0];
    uint raw1 = fill_seg[ix + 1];
    FillItemHeader s;
    s.backdrop = int(raw0);
    s.segments = FillSegChunkRef(raw1);
    return s;
 }
 void FillItemHeader_write(FillItemHeaderRef ref, FillItemHeader s) {
    uint ix = ref.offset >> 2;
    fill_seg[ix + 0] = uint(s.backdrop);
    fill_seg[ix + 1] = s.segments.offset;
 }
 FillSegment FillSegment_read(FillSegmentRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = fill_seg[ix + 0];
    uint raw1 = fill_seg[ix + 1];
    uint raw2 = fill_seg[ix + 2];
    uint raw3 = fill_seg[ix + 3];
    FillSegment s;
    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
 }
 void FillSegment_write(FillSegmentRef ref, FillSegment s) {
    uint ix = ref.offset >> 2;
    fill_seg[ix + 0] = floatBitsToUint(s.start.x);
    fill_seg[ix + 1] = floatBitsToUint(s.start.y);
    fill_seg[ix + 2] = floatBitsToUint(s.end.x);
    fill_seg[ix + 3] = floatBitsToUint(s.end.y);
 }
 FillSegChunk FillSegChunk_read(FillSegChunkRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = fill_seg[ix + 0];
    uint raw1 = fill_seg[ix + 1];
    FillSegChunk s;
    s.n = raw0;
    s.next = FillSegChunkRef(raw1);
    return s;
 }
 void FillSegChunk_write(FillSegChunkRef ref, FillSegChunk s) {
    uint ix = ref.offset >> 2;
    fill_seg[ix + 0] = s.n;
    fill_seg[ix + 1] = s.next.offset;
 }
--- a/piet-gpu/shader/kernel1.comp
+++ b/piet-gpu/shader/kernel1.comp
@ -1,161 +0,0 @@
 // This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
 // and outputs "instances" (references to item + translation) for each item
 // that intersects the tilegroup.
 //
 // This implementation is simplistic and leaves a lot of performance on the
 // table. A fancier implementation would use threadgroup shared memory or
 // subgroups (or possibly both) to parallelize the reading of the input and
 // the computation of tilegroup intersection.
 //
 // In addition, there are some features currently missing, such as support
 // for clipping.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 // It's possible we should lay this out with x and do our own math.
 layout(local_size_x = 1, local_size_y = 32) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 layout(set = 0, binding = 2) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "setup.h"
 #define MAX_STACK 8
 struct StackElement {
    PietItemRef group;
    uint index;
    vec2 offset;
 };
 void main() {
    StackElement stack[MAX_STACK];
    uint stack_ix = 0;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
    uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
    // State for stroke references.
    TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
    ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
    InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
    uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size;
    uint stroke_chunk_n = 0;
    uint stroke_n = 0;
    // State for fill references. All this is a bit cut'n'paste, but making a
    // proper abstraction isn't easy.
    TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START);
    ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4);
    InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size);
    uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size;
    uint fill_chunk_n = 0;
    uint fill_n = 0;
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
    PietItemRef root = PietItemRef(0);
    SimpleGroup group = PietItem_Group_read(root);
    StackElement tos = StackElement(root, 0, group.offset.xy);
    while (true) {
        if (tos.index < group.n_items) {
            Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
            vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
            bool is_group = false;
            uint tag;
            if (hit) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                tag = PietItem_tag(item_ref);
                is_group = tag == PietItem_Group;
            }
            if (hit && !is_group) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                Instance ins = Instance(item_ref.offset, tos.offset);
                if (tg_ref.offset > tg_limit) {
                    // Allocation exceeded; do atomic bump alloc.
                    uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
                    Jump jump = Jump(TileGroupRef(new_tg));
                    TileGroup_Jump_write(tg_ref, jump);
                    tg_ref = TileGroupRef(new_tg);
                    tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
                }
                TileGroup_Instance_write(tg_ref, ins);
                tg_ref.offset += TileGroup_size;
                if (tag == PietItem_Poly) {
                    if (stroke_ref.offset > stroke_limit) {
                        uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
                        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
                        stroke_chunk_start = ChunkRef(new_stroke);
                        stroke_ref = InstanceRef(new_stroke + Chunk_size);
                        stroke_n += stroke_chunk_n;
                        stroke_chunk_n = 0;
                        stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
                    }
                    Instance_write(stroke_ref, ins);
                    stroke_chunk_n++;
                    stroke_ref.offset += Instance_size;
                } else if (tag == PietItem_Fill) {
                    if (fill_ref.offset > fill_limit) {
                        uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC);
                        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill)));
                        fill_chunk_start = ChunkRef(new_fill);
                        fill_ref = InstanceRef(new_fill + Chunk_size);
                        fill_n += fill_chunk_n;
                        fill_chunk_n = 0;
                        fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size;
                    }
                    Instance_write(fill_ref, ins);
                    fill_chunk_n++;
                    fill_ref.offset += Instance_size;
                }
            }
            if (is_group) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                tos.index++;
                if (tos.index < group.n_items) {
                    stack[stack_ix++] = tos;
                }
                group = PietItem_Group_read(item_ref);
                tos = StackElement(item_ref, 0, tos.offset + group.offset.xy);
            } else {
                tos.index++;
            }
        } else {
            // processed all items in this group; pop the stack
            if (stack_ix == 0) {
                break;
            }
            tos = stack[--stack_ix];
            group = PietItem_Group_read(tos.group);
        }
    }
    TileGroup_End_write(tg_ref);
    stroke_n += stroke_chunk_n;
    if (stroke_n > 0) {
        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
    }
    tilegroup[stroke_start.offset >> 2] = stroke_n;
    fill_n += fill_chunk_n;
    if (fill_n > 0) {
        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0)));
    }
    tilegroup[fill_start.offset >> 2] = fill_n;
 }
--- a/piet-gpu/shader/kernel1.spv
+++ b/piet-gpu/shader/kernel1.spv
--- a/piet-gpu/shader/kernel2f.comp
+++ b/piet-gpu/shader/kernel2f.comp
@ -1,167 +0,0 @@
 // This is "kernel 2" (fill) in a 4-kernel pipeline. It processes the fill
 // (polyline) items in the scene and generates a list of segments for each, for
 // each tile.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 layout(local_size_x = 32) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 layout(set = 0, binding = 2) buffer FillSegBuf {
    uint[] fill_seg;
 };
 layout(set = 0, binding = 3) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "fill_seg.h"
 #include "setup.h"
 // Ensure that there is space to encode a segment.
 //
 // Call this before writing the segment at index `chunk_n_segs` of the chunk
 // at `seg_chunk_ref`. All four parameters are `inout` and may be updated:
 //   chunk_n_segs    - number of segments already written to the current
 //                     chunk; reset to 0 when a new chunk is chained.
 //   seg_chunk_ref   - chunk that the next segment will be written into.
 //   first_seg_chunk - set to `seg_chunk_ref` whenever this is called with
 //                     `chunk_n_segs == 0`.
 //   seg_limit       - upper bound used for the "does it still fit" checks;
 //                     recomputed after every bump allocation.
 void alloc_chunk(inout uint chunk_n_segs, inout FillSegChunkRef seg_chunk_ref,
    inout FillSegChunkRef first_seg_chunk, inout uint seg_limit)
 {
    if (chunk_n_segs == 0) {
        // Nothing written to the current chunk yet. If the space left before
        // `seg_limit` is too small, bump-allocate a fresh region from `alloc`.
        // NOTE(review): 40 presumably is FillSegChunk_size (8) plus two
        // FillSegment_size (16 each) - confirm before changing the structs.
        if (seg_chunk_ref.offset + 40 > seg_limit) {
            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - FillSegment_size;
        }
        first_seg_chunk = seg_chunk_ref;
    } else if (seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs > seg_limit) {
        // The current chunk is full: allocate a new region, finalize the
        // current chunk's header with a link to it, and continue there.
        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - FillSegment_size;
        FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(new_chunk_ref)));
        seg_chunk_ref.offset = new_chunk_ref;
        chunk_n_segs = 0;
    }
 }
 // Per-tile entry point. Reads the fill instances recorded for this tile's
 // tilegroup and, for each fill item, writes a FillItemHeader (backdrop and
 // first segment chunk) plus the chunked list of segments touching the tile.
 // The FillTileHeader at `tile_ix` records the item count and where the item
 // headers start.
 void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef fill_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_FILL_START);
    uint fill_n = tilegroup[fill_start.offset >> 2];
    FillTileHeaderRef tile_header_ref = FillTileHeaderRef(tile_ix * FillTileHeader_size);
    if (fill_n > 0) {
        ChunkRef chunk_ref = ChunkRef(fill_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        FillItemHeaderRef item_header = FillItemHeaderRef(atomicAdd(alloc, fill_n * FillItemHeader_size));
        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, item_header));
        // Segment allocation cursor and limit. These are shared by every item
        // processed by this invocation so that the unused tail of an
        // allocation is reused by the next item. They must NOT be redeclared
        // inside the loop: a shadowing declaration without an initializer has
        // an undefined value and discards the advanced cursor.
        FillSegChunkRef seg_chunk_ref = FillSegChunkRef(0);
        uint seg_limit = 0;
        // Iterate through items chunk by chunk; chunk.chunk_n holds the count
        // remaining in the current instance chunk.
        while (true) {
            if (chunk.chunk_n == 0) {
                chunk_ref = chunk.next;
                if (chunk_ref.offset == 0) {
                    break;
                }
                chunk = Chunk_read(chunk_ref);
                fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(fill_ref);
            PietFill fill = PietItem_Fill_read(PietItemRef(ins.item_ref));
            // Process the fill polyline item.
            uint max_n_segs = fill.n_points - 1;
            uint chunk_n_segs = 0;
            int backdrop = 0;
            FillSegChunkRef first_seg_chunk = FillSegChunkRef(0);
            vec2 start = Point_read(fill.points).xy;
            for (uint j = 0; j < max_n_segs; j++) {
                fill.points.offset += Point_size;
                vec2 end = Point_read(fill.points).xy;
                // Process one segment.
                // TODO: I think this would go more smoothly (and be easier to
                // make numerically robust) if it were based on clipping the line
                // to the tile box. See:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
                vec2 xymin = min(start, end);
                vec2 xymax = max(start, end);
                // Implicit line equation a*x + b*y + c = 0 through start/end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                vec2 xy1 = xy0 + vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
                float ytop = max(xy0.y, xymin.y);
                float ybot = min(xy1.y, xymax.y);
                float s00 = sign(b * ytop + a * xy0.x + c);
                float s01 = sign(b * ytop + a * xy1.x + c);
                float s10 = sign(b * ybot + a * xy0.x + c);
                float s11 = sign(b * ybot + a * xy1.x + c);
                float sTopLeft = sign(b * xy0.y + a * xy0.x + c);
                if (sTopLeft == sign(a) && xymin.y <= xy0.y && xymax.y > xy0.y) {
                    backdrop -= int(s00);
                }
                // This is adapted from piet-metal but could be improved.
                if (max(xymin.x, xy0.x) < min(xymax.x, xy1.x)
                    && ytop < ybot
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    // avoid overwriting `end` so that it can be used as start
                    vec2 enc_end = end;
                    if (xymin.x < xy0.x) {
                        float yEdge = mix(start.y, end.y, (start.x - xy0.x) / b);
                        if (yEdge >= xy0.y && yEdge < xy1.y) {
                            // This is encoded the same as a general fill segment, but could be
                            // special-cased, either here or in rendering. (It was special-cased
                            // in piet-metal).
                            FillSegment edge_seg;
                            if (b > 0.0) {
                                enc_end = vec2(xy0.x, yEdge);
                                edge_seg.start = enc_end;
                                edge_seg.end = vec2(xy0.x, xy1.y);
                            } else {
                                start = vec2(xy0.x, yEdge);
                                edge_seg.start = vec2(xy0.x, xy1.y);
                                edge_seg.end = start;
                            }
                            alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                            FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), edge_seg);
                            chunk_n_segs++;
                        }
                    }
                    alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                    FillSegment seg = FillSegment(start, enc_end);
                    FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), seg);
                    chunk_n_segs++;
                }
                start = end;
            }
            FillItemHeader_write(item_header, FillItemHeader(backdrop, first_seg_chunk));
            if (chunk_n_segs != 0) {
                // Terminate the item's chunk list and advance the shared
                // cursor past the segments just written.
                FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(0)));
                seg_chunk_ref.offset += FillSegChunk_size + FillSegment_size * chunk_n_segs;
            }
            fill_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += FillItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, FillItemHeaderRef(0)));
    }
 }
--- a/piet-gpu/shader/kernel2f.spv
+++ b/piet-gpu/shader/kernel2f.spv
--- a/piet-gpu/shader/kernel2s.comp
+++ b/piet-gpu/shader/kernel2s.comp
@ -1,137 +0,0 @@
 // This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
 // (polyline) items in the scene and generates a list of segments for each, for
 // each tile.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 layout(local_size_x = 32) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 layout(set = 0, binding = 2) buffer SegmentBuf {
    uint[] segment;
 };
 layout(set = 0, binding = 3) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "segment.h"
 #include "setup.h"
 // Per-tile entry point. Reads the stroke instances recorded for this tile's
 // tilegroup and, for each stroke polyline, writes an ItemHeader pointing at
 // the chunked list of segments that hit the (half-width padded) tile, or at
 // offset 0 when no segment hits. The TileHeader at `tile_ix` records the
 // item count and where the item headers start.
 void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
    uint stroke_n = tilegroup[stroke_start.offset >> 2];
    TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
    if (stroke_n > 0) {
        ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
        // Segment allocation cursor and limit. These are shared by every item
        // processed by this invocation so that the unused tail of an
        // allocation is reused by the next item. They must NOT be redeclared
        // inside the loop: a shadowing declaration without an initializer has
        // an undefined value and discards the advanced cursor.
        SegChunkRef seg_chunk_ref = SegChunkRef(0);
        uint seg_limit = 0;
        // Iterate through items chunk by chunk; chunk.chunk_n holds the count
        // remaining in the current instance chunk.
        while (true) {
            if (chunk.chunk_n == 0) {
                chunk_ref = chunk.next;
                if (chunk_ref.offset == 0) {
                    break;
                }
                chunk = Chunk_read(chunk_ref);
                stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(stroke_ref);
            PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
            // Process the stroke polyline item.
            uint max_n_segs = poly.n_points - 1;
            uint chunk_n_segs = 0;
            vec2 start = Point_read(poly.points).xy;
            for (uint j = 0; j < max_n_segs; j++) {
                poly.points.offset += Point_size;
                vec2 end = Point_read(poly.points).xy;
                // Process one segment.
                // This logic just tests for collision. What we probably want to do
                // is a clipping algorithm like Liang-Barsky, and then store coords
                // relative to the tile in f16. See also:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
                // Also note that when we go to the fancy version, we want to compute
                // the (horizontal projection of) the bounding box of the intersection
                // once per tilegroup, so we can assign work to individual tiles.
                // Implicit line equation a*x + b*y + c = 0 through start/end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                float half_width = 0.5 * poly.width;
                // Tile boundaries padded by half-width.
                float xmin = xy0.x - half_width;
                float ymin = xy0.y - half_width;
                float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
                float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
                float s00 = sign(b * ymin + a * xmin + c);
                float s01 = sign(b * ymin + a * xmax + c);
                float s10 = sign(b * ymax + a * xmin + c);
                float s11 = sign(b * ymax + a * xmax + c);
                // If bounding boxes intersect and not all four corners are on the same side, hit.
                // Also note: this is designed to be false on NAN input.
                if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
                    && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    // Allocate a chunk if needed.
                    if (chunk_n_segs == 0) {
                        // First segment of this item: reuse the tail of the
                        // current allocation if it is large enough, otherwise
                        // bump-allocate a fresh region.
                        if (seg_chunk_ref.offset + 40 > seg_limit) {
                            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
                            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size;
                        }
                        ItemHeader_write(item_header, ItemHeader(seg_chunk_ref));
                    } else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) {
                        // Current chunk is full: link it to a newly allocated one.
                        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
                        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size;
                        SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref)));
                        seg_chunk_ref.offset = new_chunk_ref;
                        chunk_n_segs = 0;
                    }
                    Segment seg = Segment(start, end);
                    Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
                    chunk_n_segs++;
                }
                start = end;
            }
            if (chunk_n_segs == 0) {
                ItemHeader_write(item_header, ItemHeader(SegChunkRef(0)));
            } else {
                // Terminate the item's chunk list and advance the shared
                // cursor past the segments just written.
                SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
                seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
            }
            stroke_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += ItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
    }
 }
--- a/piet-gpu/shader/kernel2s.spv
+++ b/piet-gpu/shader/kernel2s.spv
--- a/piet-gpu/shader/kernel3.comp
+++ b/piet-gpu/shader/kernel3.comp
@ -1,135 +0,0 @@
 // This is "kernel 3" in a 4-kernel pipeline. It walks the active items
 // for the tilegroup and produces a per-tile command list for each tile.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 layout(local_size_x = 32, local_size_y = 1) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 // TODO: this should have a `readonly` qualifier, but then inclusion
 // of tilegroup.h would fail because of the writers.
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 // Used readonly
 layout(set = 0, binding = 2) buffer SegmentBuf {
    uint[] segment;
 };
 // Used readonly
 layout(set = 0, binding = 3) buffer FillSegmentBuf {
    uint[] fill_seg;
 };
 layout(set = 0, binding = 4) buffer PtclBuf {
    uint[] ptcl;
 };
 layout(set = 0, binding = 5) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "segment.h"
 #include "fill_seg.h"
 #include "ptcl.h"
 #include "setup.h"
 // Ensure there is room to write a command at `cmd_ref`.
 //
 // If `cmd_ref` has moved past `cmd_limit`, a new region of
 // PTCL_INITIAL_ALLOC is bump-allocated from the shared `alloc` counter, a
 // Jump command to that region is written at the old position, and both
 // `cmd_ref` and `cmd_limit` are updated to refer to the new region.
 // Otherwise nothing is changed.
 void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset > cmd_limit) {
        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
        CmdJump jump = CmdJump(new_cmd);
        // Link the exhausted region to the new one so readers can follow it.
        Cmd_Jump_write(cmd_ref, jump);
        cmd_ref = CmdRef(new_cmd);
        // NOTE(review): the limit keeps 2 * Cmd_size in reserve, presumably
        // for the command about to be written plus a trailing Jump/End -
        // confirm against the callers.
        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    }
 }
 void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
    FillTileHeader fill_th = FillTileHeader_read(FillTileHeaderRef(tile_ix * FillTileHeader_size));
    while (true) {
        uint tg_tag = TileGroup_tag(tg_ref);
        if (tg_tag == TileGroup_End) {
            break;
        }
        if (tg_tag == TileGroup_Jump) {
            tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
            continue;
        }
        // Assume tg_tag is `Instance`, though there will be more cases.
        Instance ins = TileGroup_Instance_read(tg_ref);
        PietItemRef item_ref = PietItemRef(ins.item_ref);
        uint item_tag = PietItem_tag(item_ref);
        switch (item_tag) {
        case PietItem_Circle:
            PietCircle circle = PietItem_Circle_read(item_ref);
            vec2 center = ins.offset + circle.center.xy;
            float r = circle.radius;
            if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
                && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
            {
                CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Circle_write(cmd_ref, cmd);
                cmd_ref.offset += Cmd_size;
            }
            break;
        case PietItem_Poly:
            ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
            stroke_th.items.offset += ItemHeader_size;
            if (stroke_item.segments.offset != 0) {
                PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
                CmdStroke cmd = CmdStroke(
                    stroke_item.segments.offset,
                    0.5 * poly.width,
                    poly.rgba_color
                );
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Stroke_write(cmd_ref, cmd);
                cmd_ref.offset += Cmd_size;
            }
            break;
        case PietItem_Fill:
            FillItemHeader fill_item = FillItemHeader_read(fill_th.items);
            fill_th.items.offset += FillItemHeader_size;
            // TODO: handle segments == 0 but backdrop != 0 specially; it's a solid tile.
            if (fill_item.segments.offset != 0) {
                PietFill fill = PietItem_Fill_read(item_ref);
                CmdFill cmd = CmdFill(
                    fill_item.segments.offset,
                    fill_item.backdrop,
                    fill.rgba_color
                );
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Fill_write(cmd_ref, cmd);
                cmd_ref.offset += Cmd_size;
            } else if (fill_item.backdrop != 0) {
                // TODO: truncate existing cmd list if alpha is opaque
                PietFill fill = PietItem_Fill_read(item_ref);
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
                cmd_ref.offset += Cmd_size;
            }
            break;
        }
        tg_ref.offset += TileGroup_size;
    }
    Cmd_End_write(cmd_ref);
 }
--- a/piet-gpu/shader/kernel3.spv
+++ b/piet-gpu/shader/kernel3.spv
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -9,26 +9,14 @@
 layout(local_size_x = 16, local_size_y = 16) in;
-// Same concern that this should be readonly as in kernel 3.
+// This should be annotated readonly but infra doesn't support that yet.
 layout(set = 0, binding = 0) buffer PtclBuf {
    uint[] ptcl;
 };
-// Used readonly
+layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
 layout(set = 0, binding = 1) buffer SegmentBuf {
    uint[] segment;
 };
 // Used readonly
 layout(set = 0, binding = 2) buffer FillSegBuf {
    uint[] fill_seg;
 };
 layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
 #include "ptcl.h"
 #include "segment.h"
 #include "fill_seg.h"
 #include "setup.h"
@ -79,11 +67,11 @@ void main() {
            CmdFill fill = Cmd_Fill_read(cmd_ref);
            // Probably better to store as float, but conversion is no doubt cheap.
            float area = float(fill.backdrop);
-            FillSegChunkRef fill_seg_chunk_ref = FillSegChunkRef(fill.seg_ref);
+            SegChunkRef fill_seg_chunk_ref = SegChunkRef(fill.seg_ref);
            do {
-                FillSegChunk seg_chunk = FillSegChunk_read(fill_seg_chunk_ref);
+                SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref);
                for (int i = 0; i < seg_chunk.n; i++) {
-                    FillSegment seg = FillSegment_read(FillSegmentRef(fill_seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * i));
+                    Segment seg = Segment_read(SegmentRef(fill_seg_chunk_ref.offset + SegChunk_size + Segment_size * i));
                    vec2 start = seg.start - xy;
                    vec2 end = seg.end - xy;
                    vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@ -36,6 +36,14 @@ struct CmdRef {
    uint offset;
 };
 // Typed handle to a serialized Segment. Segment_read/Segment_write shift
 // `offset` right by 2 to obtain an index into the uint array `ptcl`.
 struct SegmentRef {
    uint offset;
 };
 // Typed handle to a serialized SegChunk. SegChunk_read/SegChunk_write shift
 // `offset` right by 2 to obtain an index into the uint array `ptcl`.
 struct SegChunkRef {
    uint offset;
 };
 struct CmdCircle {
    vec2 center;
    float radius;
@ -141,6 +149,28 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
    return CmdRef(ref.offset + index * Cmd_size);
 }
 // A segment described by two 2D points, `start` and `end`.
 // Segment_read/Segment_write serialize it as four consecutive uints holding
 // the float bit patterns of start.x, start.y, end.x, end.y.
 struct Segment {
    vec2 start;
    vec2 end;
 };
 // Serialized size of one Segment: 4 uints = 16 bytes.
 #define Segment_size 16
 // Returns a reference to element `index` of a packed run of Segments that
 // begins at `ref`, i.e. `ref.offset` advanced by index * Segment_size.
 SegmentRef Segment_index(SegmentRef ref, uint index) {
    return SegmentRef(ref.offset + index * Segment_size);
 }
 // Header of a chunk of segments: a count `n` plus a reference to another
 // SegChunk. The kernel4 fill loop reads `n` Segments stored directly after
 // this header (at offset + SegChunk_size + Segment_size * i).
 // NOTE(review): `next` presumably links to the following chunk of the same
 // list — confirm how the end of the list is encoded.
 struct SegChunk {
    uint n;
    SegChunkRef next;
 };
 // Serialized size of the SegChunk header: 2 uints = 8 bytes.
 #define SegChunk_size 8
 // Returns `ref` advanced by index * SegChunk_size, i.e. a reference to
 // element `index` of a packed run of SegChunk headers starting at `ref`.
 SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
    return SegChunkRef(ref.offset + index * SegChunk_size);
 }
 CmdCircle CmdCircle_read(CmdCircleRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = ptcl[ix + 0];
@ -362,3 +392,39 @@ void Cmd_Bail_write(CmdRef ref) {
    ptcl[ref.offset >> 2] = Cmd_Bail;
 }
 // Loads the Segment stored at `ref` from the uint array `ptcl`.
 Segment Segment_read(SegmentRef ref) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    uint raw0 = ptcl[ix + 0];
    uint raw1 = ptcl[ix + 1];
    uint raw2 = ptcl[ix + 2];
    uint raw3 = ptcl[ix + 3];
    Segment s;
    // The four words are raw float bit patterns: start.x, start.y, end.x, end.y.
    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
 }
 // Stores Segment `s` at `ref` in the uint array `ptcl`, as four consecutive
 // words holding the float bit patterns of start.x, start.y, end.x, end.y.
 void Segment_write(SegmentRef ref, Segment s) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    ptcl[ix + 0] = floatBitsToUint(s.start.x);
    ptcl[ix + 1] = floatBitsToUint(s.start.y);
    ptcl[ix + 2] = floatBitsToUint(s.end.x);
    ptcl[ix + 3] = floatBitsToUint(s.end.y);
 }
 // Loads the SegChunk header stored at `ref` from the uint array `ptcl`:
 // word 0 is the count `n`, word 1 is the offset wrapped into `next`.
 SegChunk SegChunk_read(SegChunkRef ref) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    uint raw0 = ptcl[ix + 0];
    uint raw1 = ptcl[ix + 1];
    SegChunk s;
    s.n = raw0;
    s.next = SegChunkRef(raw1);
    return s;
 }
 // Stores the SegChunk header `s` at `ref` in the uint array `ptcl`:
 // word 0 receives `n`, word 1 receives `next.offset`.
 void SegChunk_write(SegChunkRef ref, SegChunk s) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    ptcl[ix + 0] = s.n;
    ptcl[ix + 1] = s.next.offset;
 }
--- a/piet-gpu/shader/segment.h
+++ b/piet-gpu/shader/segment.h
@ -1,126 +0,0 @@
 // Code auto-generated by piet-gpu-derive
 // Typed handle to a serialized TileHeader. TileHeader_read/TileHeader_write
 // shift `offset` right by 2 to obtain an index into the uint array `segment`.
 struct TileHeaderRef {
    uint offset;
 };
 // Typed handle to a serialized ItemHeader. ItemHeader_read/ItemHeader_write
 // shift `offset` right by 2 to obtain an index into the uint array `segment`.
 struct ItemHeaderRef {
    uint offset;
 };
 // Typed handle to a serialized Segment. Segment_read/Segment_write shift
 // `offset` right by 2 to obtain an index into the uint array `segment`.
 struct SegmentRef {
    uint offset;
 };
 // Typed handle to a serialized SegChunk. SegChunk_read/SegChunk_write shift
 // `offset` right by 2 to obtain an index into the uint array `segment`.
 struct SegChunkRef {
    uint offset;
 };
 // Per-tile header: a count `n` and a reference to ItemHeader data.
 // Serialized by TileHeader_read/TileHeader_write as two uints
 // (n, items.offset).
 // NOTE(review): `n` is presumably the number of ItemHeaders reachable
 // through `items` — confirm against the code that fills this buffer.
 struct TileHeader {
    uint n;
    ItemHeaderRef items;
 };
 // Serialized size of a TileHeader: 2 uints = 8 bytes.
 #define TileHeader_size 8
 // Returns `ref` advanced by index * TileHeader_size, i.e. a reference to
 // element `index` of a packed run of TileHeaders starting at `ref`.
 TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
    return TileHeaderRef(ref.offset + index * TileHeader_size);
 }
 // Per-item header holding a single reference to a SegChunk.
 // Readers treat segments.offset == 0 as "no segments": the stroke path only
 // emits a command when the offset is nonzero.
 struct ItemHeader {
    SegChunkRef segments;
 };
 // Serialized size of an ItemHeader: 1 uint = 4 bytes.
 #define ItemHeader_size 4
 // Returns `ref` advanced by index * ItemHeader_size, i.e. a reference to
 // element `index` of a packed run of ItemHeaders starting at `ref`.
 ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
    return ItemHeaderRef(ref.offset + index * ItemHeader_size);
 }
 // A segment described by two 2D points, `start` and `end`.
 // Segment_read/Segment_write serialize it as four consecutive uints holding
 // the float bit patterns of start.x, start.y, end.x, end.y.
 struct Segment {
    vec2 start;
    vec2 end;
 };
 // Serialized size of one Segment: 4 uints = 16 bytes.
 #define Segment_size 16
 // Returns `ref` advanced by index * Segment_size, i.e. a reference to
 // element `index` of a packed run of Segments starting at `ref`.
 SegmentRef Segment_index(SegmentRef ref, uint index) {
    return SegmentRef(ref.offset + index * Segment_size);
 }
 // Header of a chunk of segments: a count `n` plus a reference to another
 // SegChunk. Serialized as two uints (n, next.offset).
 // NOTE(review): the Segments presumably follow this header directly and
 // `next` presumably links to the following chunk — confirm against readers.
 struct SegChunk {
    uint n;
    SegChunkRef next;
 };
 // Serialized size of the SegChunk header: 2 uints = 8 bytes.
 #define SegChunk_size 8
 // Returns `ref` advanced by index * SegChunk_size, i.e. a reference to
 // element `index` of a packed run of SegChunk headers starting at `ref`.
 SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
    return SegChunkRef(ref.offset + index * SegChunk_size);
 }
 // Loads the TileHeader stored at `ref` from the uint array `segment`:
 // word 0 is the count `n`, word 1 is the offset wrapped into `items`.
 TileHeader TileHeader_read(TileHeaderRef ref) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    uint raw0 = segment[ix + 0];
    uint raw1 = segment[ix + 1];
    TileHeader s;
    s.n = raw0;
    s.items = ItemHeaderRef(raw1);
    return s;
 }
 // Stores TileHeader `s` at `ref` in the uint array `segment`:
 // word 0 receives `n`, word 1 receives `items.offset`.
 void TileHeader_write(TileHeaderRef ref, TileHeader s) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    segment[ix + 0] = s.n;
    segment[ix + 1] = s.items.offset;
 }
 // Loads the ItemHeader stored at `ref` from the uint array `segment`;
 // its single word is the offset wrapped into `segments`.
 ItemHeader ItemHeader_read(ItemHeaderRef ref) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    uint raw0 = segment[ix + 0];
    ItemHeader s;
    s.segments = SegChunkRef(raw0);
    return s;
 }
 // Stores ItemHeader `s` at `ref` in the uint array `segment` as one word,
 // the value of `segments.offset`.
 void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    segment[ix + 0] = s.segments.offset;
 }
 // Loads the Segment stored at `ref` from the uint array `segment`.
 Segment Segment_read(SegmentRef ref) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    uint raw0 = segment[ix + 0];
    uint raw1 = segment[ix + 1];
    uint raw2 = segment[ix + 2];
    uint raw3 = segment[ix + 3];
    Segment s;
    // The four words are raw float bit patterns: start.x, start.y, end.x, end.y.
    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
 }
 // Stores Segment `s` at `ref` in the uint array `segment`, as four
 // consecutive words holding the float bit patterns of start.x, start.y,
 // end.x, end.y.
 void Segment_write(SegmentRef ref, Segment s) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    segment[ix + 0] = floatBitsToUint(s.start.x);
    segment[ix + 1] = floatBitsToUint(s.start.y);
    segment[ix + 2] = floatBitsToUint(s.end.x);
    segment[ix + 3] = floatBitsToUint(s.end.y);
 }
 // Loads the SegChunk header stored at `ref` from the uint array `segment`:
 // word 0 is the count `n`, word 1 is the offset wrapped into `next`.
 SegChunk SegChunk_read(SegChunkRef ref) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    uint raw0 = segment[ix + 0];
    uint raw1 = segment[ix + 1];
    SegChunk s;
    s.n = raw0;
    s.next = SegChunkRef(raw1);
    return s;
 }
 // Stores the SegChunk header `s` at `ref` in the uint array `segment`:
 // word 0 receives `n`, word 1 receives `next.offset`.
 void SegChunk_write(SegChunkRef ref, SegChunk s) {
    // offset >> 2 divides by 4 (dropping the low two bits) to get a uint index.
    uint ix = ref.offset >> 2;
    segment[ix + 0] = s.n;
    segment[ix + 1] = s.next.offset;
 }
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -209,16 +209,11 @@ impl<D: Device> Renderer<D> {
            &[],
        )?;
        // These will probably be combined with the ptcl buf, as they're all written by the
        // same kernel now.
        let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let k4_code = include_bytes!("../shader/kernel4.spv");
-        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
+        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
        let k4_ds = device.create_descriptor_set(
            &k4_pipeline,
-            &[&ptcl_buf, &segment_buf, &fill_seg_buf],
+            &[&ptcl_buf],
            &[&image_dev],
        )?;