mirror of https://github.com/italicsjenga/vello.git (synced 2025-01-10 12:41:30 +11:00)
Delete old-style kernels and buffers
Pave the way for the coarse raster pass to write to the ptcl buffer.
This commit is contained in:
parent 3a6428238b
commit 1240da3870
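For orientation: after this commit, per-tile segment data is meant to live in the ptcl buffer itself, as linked SegChunk lists that CmdStroke/CmdFill reference by offset. A minimal GLSL sketch of the consuming side, mirroring the fill loop in the kernel4.comp hunk below (`cmd` is a hypothetical stand-in for a command already read from the command list):

    // Sketch only: walk a segment list stored in the ptcl buffer.
    SegChunkRef chunk_ref = SegChunkRef(cmd.seg_ref);
    do {
        SegChunk chunk = SegChunk_read(chunk_ref);
        for (uint i = 0; i < chunk.n; i++) {
            // Segments are packed immediately after each chunk header.
            Segment seg = Segment_read(SegmentRef(chunk_ref.offset + SegChunk_size + Segment_size * i));
            // ... rasterize seg ...
        }
        chunk_ref = chunk.next;
    } while (chunk_ref.offset != 0);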
piet-gpu-types/src/fill_seg.rs
@@ -1,37 +0,0 @@
-use piet_gpu_derive::piet_gpu;
-
-// Structures representing segments for fill items.
-
-// There is some cut'n'paste here from stroke segments, which can be
-// traced to the fact that buffers in GLSL are basically global.
-// Maybe there's a way to address that, but in the meantime living
-// with the duplication is easiest.
-
-piet_gpu! {
-    #[gpu_write]
-    mod fill_seg {
-        struct FillTileHeader {
-            n: u32,
-            items: Ref<FillItemHeader>,
-        }
-
-        struct FillItemHeader {
-            backdrop: i32,
-            segments: Ref<FillSegChunk>,
-        }
-
-        // TODO: strongly consider using f16. If so, these would be
-        // relative to the tile. We're doing f32 for now to minimize
-        // divergence from piet-metal originals.
-        struct FillSegment {
-            start: [f32; 2],
-            end: [f32; 2],
-        }
-
-        struct FillSegChunk {
-            n: u32,
-            next: Ref<FillSegChunk>,
-            // Segments follow (could represent this as a variable sized array).
-        }
-    }
-}
piet-gpu-types/src/lib.rs
@@ -3,10 +3,8 @@
 pub mod annotated;
 pub mod bins;
 pub mod encoder;
-pub mod fill_seg;
 pub mod ptcl;
 pub mod scene;
-pub mod segment;
 pub mod state;
 pub mod test;
 pub mod tilegroup;
piet-gpu-types/src/main.rs
@@ -9,8 +9,6 @@ fn main() {
         "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
         "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
         "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
-        "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
-        "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
         "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
         "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
         _ => println!("Oops, unknown module name"),
piet-gpu-types/src/ptcl.rs
@@ -13,13 +13,13 @@ piet_gpu! {
             end: [f32; 2],
         }
         struct CmdStroke {
-            // Should be Ref<SegChunk> if we had cross-module references.
+            // Should be Ref<SegChunk>
             seg_ref: u32,
             half_width: f32,
             rgba_color: u32,
         }
         struct CmdFill {
-            // Should be Ref<FillSegChunk> if we had cross-module references.
+            // Should be Ref<FillSegChunk>
             seg_ref: u32,
             backdrop: i32,
             rgba_color: u32,
@@ -51,5 +51,19 @@ piet_gpu! {
             Jump(CmdJump),
             Bail,
         }
+
+        // TODO: strongly consider using f16. If so, these would be
+        // relative to the tile. We're doing f32 for now to minimize
+        // divergence from piet-metal originals.
+        struct Segment {
+            start: [f32; 2],
+            end: [f32; 2],
+        }
+
+        struct SegChunk {
+            n: u32,
+            next: Ref<SegChunk>,
+            // Segments follow (could represent this as a variable sized array).
+        }
     }
 }
piet-gpu-types/src/segment.rs
@@ -1,32 +0,0 @@
-use piet_gpu_derive::piet_gpu;
-
-// Structures representing segments for stroke/fill items.
-
-piet_gpu! {
-    #[gpu_write]
-    mod segment {
-        struct TileHeader {
-            n: u32,
-            items: Ref<ItemHeader>,
-        }
-
-        // Note: this is only suitable for strokes, fills require backdrop.
-        struct ItemHeader {
-            segments: Ref<SegChunk>,
-        }
-
-        // TODO: strongly consider using f16. If so, these would be
-        // relative to the tile. We're doing f32 for now to minimize
-        // divergence from piet-metal originals.
-        struct Segment {
-            start: [f32; 2],
-            end: [f32; 2],
-        }
-
-        struct SegChunk {
-            n: u32,
-            next: Ref<SegChunk>,
-            // Segments follow (could represent this as a variable sized array).
-        }
-    }
-}
piet-gpu/shader/build.ninja
@@ -9,19 +9,11 @@ rule glsl
 
 build image.spv: glsl image.comp | scene.h
 
-build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
-
-build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
-
-build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
-
-build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h
-
-build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
-
 build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 
 build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
 
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
+build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
piet-gpu/shader/fill_seg.h
@@ -1,130 +0,0 @@
-// Code auto-generated by piet-gpu-derive
-
-struct FillTileHeaderRef {
-    uint offset;
-};
-
-struct FillItemHeaderRef {
-    uint offset;
-};
-
-struct FillSegmentRef {
-    uint offset;
-};
-
-struct FillSegChunkRef {
-    uint offset;
-};
-
-struct FillTileHeader {
-    uint n;
-    FillItemHeaderRef items;
-};
-
-#define FillTileHeader_size 8
-
-FillTileHeaderRef FillTileHeader_index(FillTileHeaderRef ref, uint index) {
-    return FillTileHeaderRef(ref.offset + index * FillTileHeader_size);
-}
-
-struct FillItemHeader {
-    int backdrop;
-    FillSegChunkRef segments;
-};
-
-#define FillItemHeader_size 8
-
-FillItemHeaderRef FillItemHeader_index(FillItemHeaderRef ref, uint index) {
-    return FillItemHeaderRef(ref.offset + index * FillItemHeader_size);
-}
-
-struct FillSegment {
-    vec2 start;
-    vec2 end;
-};
-
-#define FillSegment_size 16
-
-FillSegmentRef FillSegment_index(FillSegmentRef ref, uint index) {
-    return FillSegmentRef(ref.offset + index * FillSegment_size);
-}
-
-struct FillSegChunk {
-    uint n;
-    FillSegChunkRef next;
-};
-
-#define FillSegChunk_size 8
-
-FillSegChunkRef FillSegChunk_index(FillSegChunkRef ref, uint index) {
-    return FillSegChunkRef(ref.offset + index * FillSegChunk_size);
-}
-
-FillTileHeader FillTileHeader_read(FillTileHeaderRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = fill_seg[ix + 0];
-    uint raw1 = fill_seg[ix + 1];
-    FillTileHeader s;
-    s.n = raw0;
-    s.items = FillItemHeaderRef(raw1);
-    return s;
-}
-
-void FillTileHeader_write(FillTileHeaderRef ref, FillTileHeader s) {
-    uint ix = ref.offset >> 2;
-    fill_seg[ix + 0] = s.n;
-    fill_seg[ix + 1] = s.items.offset;
-}
-
-FillItemHeader FillItemHeader_read(FillItemHeaderRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = fill_seg[ix + 0];
-    uint raw1 = fill_seg[ix + 1];
-    FillItemHeader s;
-    s.backdrop = int(raw0);
-    s.segments = FillSegChunkRef(raw1);
-    return s;
-}
-
-void FillItemHeader_write(FillItemHeaderRef ref, FillItemHeader s) {
-    uint ix = ref.offset >> 2;
-    fill_seg[ix + 0] = uint(s.backdrop);
-    fill_seg[ix + 1] = s.segments.offset;
-}
-
-FillSegment FillSegment_read(FillSegmentRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = fill_seg[ix + 0];
-    uint raw1 = fill_seg[ix + 1];
-    uint raw2 = fill_seg[ix + 2];
-    uint raw3 = fill_seg[ix + 3];
-    FillSegment s;
-    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    return s;
-}
-
-void FillSegment_write(FillSegmentRef ref, FillSegment s) {
-    uint ix = ref.offset >> 2;
-    fill_seg[ix + 0] = floatBitsToUint(s.start.x);
-    fill_seg[ix + 1] = floatBitsToUint(s.start.y);
-    fill_seg[ix + 2] = floatBitsToUint(s.end.x);
-    fill_seg[ix + 3] = floatBitsToUint(s.end.y);
-}
-
-FillSegChunk FillSegChunk_read(FillSegChunkRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = fill_seg[ix + 0];
-    uint raw1 = fill_seg[ix + 1];
-    FillSegChunk s;
-    s.n = raw0;
-    s.next = FillSegChunkRef(raw1);
-    return s;
-}
-
-void FillSegChunk_write(FillSegChunkRef ref, FillSegChunk s) {
-    uint ix = ref.offset >> 2;
-    fill_seg[ix + 0] = s.n;
-    fill_seg[ix + 1] = s.next.offset;
-}
piet-gpu/shader/kernel1.comp
@@ -1,161 +0,0 @@
-// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
-// and outputs "instances" (references to item + translation) for each item
-// that intersects the tilegroup.
-//
-// This implementation is simplistic and leaves a lot of performance on the
-// table. A fancier implementation would use threadgroup shared memory or
-// subgroups (or possibly both) to parallelize the reading of the input and
-// the computation of tilegroup intersection.
-//
-// In addition, there are some features currently missing, such as support
-// for clipping.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-// It's possible we should lay this out with x and do our own math.
-layout(local_size_x = 1, local_size_y = 32) in;
-
-layout(set = 0, binding = 0) readonly buffer SceneBuf {
-    uint[] scene;
-};
-
-layout(set = 0, binding = 1) buffer TilegroupBuf {
-    uint[] tilegroup;
-};
-
-layout(set = 0, binding = 2) buffer AllocBuf {
-    uint alloc;
-};
-
-#include "scene.h"
-#include "tilegroup.h"
-
-#include "setup.h"
-
-#define MAX_STACK 8
-
-struct StackElement {
-    PietItemRef group;
-    uint index;
-    vec2 offset;
-};
-
-void main() {
-    StackElement stack[MAX_STACK];
-    uint stack_ix = 0;
-    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
-    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
-    uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
-
-    // State for stroke references.
-    TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
-    ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
-    InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
-    uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size;
-    uint stroke_chunk_n = 0;
-    uint stroke_n = 0;
-
-    // State for fill references. All this is a bit cut'n'paste, but making a
-    // proper abstraction isn't easy.
-    TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START);
-    ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4);
-    InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size);
-    uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size;
-    uint fill_chunk_n = 0;
-    uint fill_n = 0;
-
-    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
-    PietItemRef root = PietItemRef(0);
-    SimpleGroup group = PietItem_Group_read(root);
-    StackElement tos = StackElement(root, 0, group.offset.xy);
-
-    while (true) {
-        if (tos.index < group.n_items) {
-            Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
-            vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
-            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
-                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
-            bool is_group = false;
-            uint tag;
-            if (hit) {
-                PietItemRef item_ref = PietItem_index(group.items, tos.index);
-                tag = PietItem_tag(item_ref);
-                is_group = tag == PietItem_Group;
-            }
-            if (hit && !is_group) {
-                PietItemRef item_ref = PietItem_index(group.items, tos.index);
-                Instance ins = Instance(item_ref.offset, tos.offset);
-                if (tg_ref.offset > tg_limit) {
-                    // Allocation exceeded; do atomic bump alloc.
-                    uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
-                    Jump jump = Jump(TileGroupRef(new_tg));
-                    TileGroup_Jump_write(tg_ref, jump);
-                    tg_ref = TileGroupRef(new_tg);
-                    tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
-                }
-                TileGroup_Instance_write(tg_ref, ins);
-                tg_ref.offset += TileGroup_size;
-                if (tag == PietItem_Poly) {
-                    if (stroke_ref.offset > stroke_limit) {
-                        uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
-                        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
-                        stroke_chunk_start = ChunkRef(new_stroke);
-                        stroke_ref = InstanceRef(new_stroke + Chunk_size);
-                        stroke_n += stroke_chunk_n;
-                        stroke_chunk_n = 0;
-                        stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
-                    }
-                    Instance_write(stroke_ref, ins);
-                    stroke_chunk_n++;
-                    stroke_ref.offset += Instance_size;
-                } else if (tag == PietItem_Fill) {
-                    if (fill_ref.offset > fill_limit) {
-                        uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC);
-                        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill)));
-                        fill_chunk_start = ChunkRef(new_fill);
-                        fill_ref = InstanceRef(new_fill + Chunk_size);
-                        fill_n += fill_chunk_n;
-                        fill_chunk_n = 0;
-                        fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size;
-                    }
-                    Instance_write(fill_ref, ins);
-                    fill_chunk_n++;
-                    fill_ref.offset += Instance_size;
-
-                }
-            }
-            if (is_group) {
-                PietItemRef item_ref = PietItem_index(group.items, tos.index);
-                tos.index++;
-                if (tos.index < group.n_items) {
-                    stack[stack_ix++] = tos;
-                }
-                group = PietItem_Group_read(item_ref);
-                tos = StackElement(item_ref, 0, tos.offset + group.offset.xy);
-            } else {
-                tos.index++;
-            }
-        } else {
-            // processed all items in this group; pop the stack
-            if (stack_ix == 0) {
-                break;
-            }
-            tos = stack[--stack_ix];
-            group = PietItem_Group_read(tos.group);
-        }
-    }
-    TileGroup_End_write(tg_ref);
-
-    stroke_n += stroke_chunk_n;
-    if (stroke_n > 0) {
-        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
-    }
-    tilegroup[stroke_start.offset >> 2] = stroke_n;
-
-    fill_n += fill_chunk_n;
-    if (fill_n > 0) {
-        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0)));
-    }
-    tilegroup[fill_start.offset >> 2] = fill_n;
-}

piet-gpu/shader/kernel1.spv
Binary file not shown.
piet-gpu/shader/kernel2f.comp
@@ -1,167 +0,0 @@
-// This is "kernel 2" (fill) in a 4-kernel pipeline. It processes the fill
-// (polyline) items in the scene and generates a list of segments for each, for
-// each tile.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-layout(local_size_x = 32) in;
-
-layout(set = 0, binding = 0) readonly buffer SceneBuf {
-    uint[] scene;
-};
-
-layout(set = 0, binding = 1) buffer TilegroupBuf {
-    uint[] tilegroup;
-};
-
-layout(set = 0, binding = 2) buffer FillSegBuf {
-    uint[] fill_seg;
-};
-
-layout(set = 0, binding = 3) buffer AllocBuf {
-    uint alloc;
-};
-
-#include "scene.h"
-#include "tilegroup.h"
-#include "fill_seg.h"
-
-#include "setup.h"
-
-// Ensure that there is space to encode a segment.
-void alloc_chunk(inout uint chunk_n_segs, inout FillSegChunkRef seg_chunk_ref,
-    inout FillSegChunkRef first_seg_chunk, inout uint seg_limit)
-{
-    if (chunk_n_segs == 0) {
-        if (seg_chunk_ref.offset + 40 > seg_limit) {
-            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
-            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - FillSegment_size;
-        }
-        first_seg_chunk = seg_chunk_ref;
-    } else if (seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs > seg_limit) {
-        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
-        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - FillSegment_size;
-        FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(new_chunk_ref)));
-        seg_chunk_ref.offset = new_chunk_ref;
-        chunk_n_segs = 0;
-    }
-
-}
-
-void main() {
-    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
-    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
-        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
-    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
-    TileGroupRef fill_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_FILL_START);
-    uint fill_n = tilegroup[fill_start.offset >> 2];
-
-    FillTileHeaderRef tile_header_ref = FillTileHeaderRef(tile_ix * FillTileHeader_size);
-    if (fill_n > 0) {
-        ChunkRef chunk_ref = ChunkRef(fill_start.offset + 4);
-        Chunk chunk = Chunk_read(chunk_ref);
-        InstanceRef fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
-        FillItemHeaderRef item_header = FillItemHeaderRef(atomicAdd(alloc, fill_n * FillItemHeader_size));
-        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, item_header));
-        FillSegChunkRef seg_chunk_ref = FillSegChunkRef(0);
-        uint seg_limit = 0;
-        // Iterate through items; fill_n holds count remaining.
-        while (true) {
-            if (chunk.chunk_n == 0) {
-                chunk_ref = chunk.next;
-                if (chunk_ref.offset == 0) {
-                    break;
-                }
-                chunk = Chunk_read(chunk_ref);
-                fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
-            }
-            Instance ins = Instance_read(fill_ref);
-            PietFill fill = PietItem_Fill_read(PietItemRef(ins.item_ref));
-
-            // Process the fill polyline item.
-            uint max_n_segs = fill.n_points - 1;
-            uint chunk_n_segs = 0;
-            int backdrop = 0;
-            FillSegChunkRef seg_chunk_ref;
-            FillSegChunkRef first_seg_chunk = FillSegChunkRef(0);
-            vec2 start = Point_read(fill.points).xy;
-            for (uint j = 0; j < max_n_segs; j++) {
-                fill.points.offset += Point_size;
-                vec2 end = Point_read(fill.points).xy;
-
-                // Process one segment.
-
-                // TODO: I think this would go more smoothly (and be easier to
-                // make numerically robust) if it were based on clipping the line
-                // to the tile box. See:
-                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
-                vec2 xymin = min(start, end);
-                vec2 xymax = max(start, end);
-                float a = end.y - start.y;
-                float b = start.x - end.x;
-                float c = -(a * start.x + b * start.y);
-                vec2 xy1 = xy0 + vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
-                float ytop = max(xy0.y, xymin.y);
-                float ybot = min(xy1.y, xymax.y);
-                float s00 = sign(b * ytop + a * xy0.x + c);
-                float s01 = sign(b * ytop + a * xy1.x + c);
-                float s10 = sign(b * ybot + a * xy0.x + c);
-                float s11 = sign(b * ybot + a * xy1.x + c);
-                float sTopLeft = sign(b * xy0.y + a * xy0.x + c);
-                if (sTopLeft == sign(a) && xymin.y <= xy0.y && xymax.y > xy0.y) {
-                    backdrop -= int(s00);
-                }
-
-                // This is adapted from piet-metal but could be improved.
-                if (max(xymin.x, xy0.x) < min(xymax.x, xy1.x)
-                    && ytop < ybot
-                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
-                {
-                    // avoid overwriting `end` so that it can be used as start
-                    vec2 enc_end = end;
-                    if (xymin.x < xy0.x) {
-                        float yEdge = mix(start.y, end.y, (start.x - xy0.x) / b);
-                        if (yEdge >= xy0.y && yEdge < xy1.y) {
-                            // This is encoded the same as a general fill segment, but could be
-                            // special-cased, either here or in rendering. (It was special-cased
-                            // in piet-metal).
-                            FillSegment edge_seg;
-                            if (b > 0.0) {
-                                enc_end = vec2(xy0.x, yEdge);
-                                edge_seg.start = enc_end;
-                                edge_seg.end = vec2(xy0.x, xy1.y);
-                            } else {
-                                start = vec2(xy0.x, yEdge);
-                                edge_seg.start = vec2(xy0.x, xy1.y);
-                                edge_seg.end = start;
-                            }
-                            alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
-                            FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), edge_seg);
-                            chunk_n_segs++;
-                        }
-                    }
-                    alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
-                    FillSegment seg = FillSegment(start, enc_end);
-                    FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), seg);
-                    chunk_n_segs++;
-                }
-
-                start = end;
-            }
-            FillItemHeader_write(item_header, FillItemHeader(backdrop, first_seg_chunk));
-            if (chunk_n_segs != 0) {
-                FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(0)));
-                seg_chunk_ref.offset += FillSegChunk_size + FillSegment_size * chunk_n_segs;
-            }
-
-            fill_ref.offset += Instance_size;
-            chunk.chunk_n--;
-            item_header.offset += FillItemHeader_size;
-        }
-    } else {
-        // As an optimization, we could just write 0 for the size.
-        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, FillItemHeaderRef(0)));
-    }
-}

piet-gpu/shader/kernel2f.spv
Binary file not shown.
piet-gpu/shader/kernel2s.comp
@@ -1,137 +0,0 @@
-// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
-// (polyline) items in the scene and generates a list of segments for each, for
-// each tile.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-layout(local_size_x = 32) in;
-
-layout(set = 0, binding = 0) readonly buffer SceneBuf {
-    uint[] scene;
-};
-
-layout(set = 0, binding = 1) buffer TilegroupBuf {
-    uint[] tilegroup;
-};
-
-layout(set = 0, binding = 2) buffer SegmentBuf {
-    uint[] segment;
-};
-
-layout(set = 0, binding = 3) buffer AllocBuf {
-    uint alloc;
-};
-
-#include "scene.h"
-#include "tilegroup.h"
-#include "segment.h"
-
-#include "setup.h"
-
-void main() {
-    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
-    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
-        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
-    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
-    TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
-    uint stroke_n = tilegroup[stroke_start.offset >> 2];
-
-    TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
-    if (stroke_n > 0) {
-        ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
-        Chunk chunk = Chunk_read(chunk_ref);
-        InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
-        ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
-        TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
-        SegChunkRef seg_chunk_ref = SegChunkRef(0);
-        uint seg_limit = 0;
-        // Iterate through items; stroke_n holds count remaining.
-        while (true) {
-            if (chunk.chunk_n == 0) {
-                chunk_ref = chunk.next;
-                if (chunk_ref.offset == 0) {
-                    break;
-                }
-                chunk = Chunk_read(chunk_ref);
-                stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
-            }
-            Instance ins = Instance_read(stroke_ref);
-            PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
-
-            // Process the stroke polyline item.
-            uint max_n_segs = poly.n_points - 1;
-            uint chunk_n_segs = 0;
-            SegChunkRef seg_chunk_ref;
-            vec2 start = Point_read(poly.points).xy;
-            for (uint j = 0; j < max_n_segs; j++) {
-                poly.points.offset += Point_size;
-                vec2 end = Point_read(poly.points).xy;
-
-                // Process one segment.
-
-                // This logic just tests for collision. What we probably want to do
-                // is a clipping algorithm like Liang-Barsky, and then store coords
-                // relative to the tile in f16. See also:
-                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
-
-                // Also note that when we go to the fancy version, we want to compute
-                // the (horizontal projection of) the bounding box of the intersection
-                // once per tilegroup, so we can assign work to individual tiles.
-
-                float a = end.y - start.y;
-                float b = start.x - end.x;
-                float c = -(a * start.x + b * start.y);
-                float half_width = 0.5 * poly.width;
-                // Tile boundaries padded by half-width.
-                float xmin = xy0.x - half_width;
-                float ymin = xy0.y - half_width;
-                float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
-                float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
-                float s00 = sign(b * ymin + a * xmin + c);
-                float s01 = sign(b * ymin + a * xmax + c);
-                float s10 = sign(b * ymax + a * xmin + c);
-                float s11 = sign(b * ymax + a * xmax + c);
-                // If bounding boxes intersect and not all four corners are on the same side, hit.
-                // Also note: this is designed to be false on NAN input.
-                if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
-                    && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
-                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
-                {
-                    // Allocate a chunk if needed.
-                    if (chunk_n_segs == 0) {
-                        if (seg_chunk_ref.offset + 40 > seg_limit) {
-                            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
-                            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size;
-                        }
-                        ItemHeader_write(item_header, ItemHeader(seg_chunk_ref));
-                    } else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) {
-                        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
-                        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size;
-                        SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref)));
-                        seg_chunk_ref.offset = new_chunk_ref;
-                        chunk_n_segs = 0;
-                    }
-                    Segment seg = Segment(start, end);
-                    Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
-                    chunk_n_segs++;
-                }
-
-                start = end;
-            }
-            if (chunk_n_segs == 0) {
-                ItemHeader_write(item_header, ItemHeader(SegChunkRef(0)));
-            } else {
-                SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
-                seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
-            }
-
-            stroke_ref.offset += Instance_size;
-            chunk.chunk_n--;
-            item_header.offset += ItemHeader_size;
-        }
-    } else {
-        // As an optimization, we could just write 0 for the size.
-        TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
-    }
-}

piet-gpu/shader/kernel2s.spv
Binary file not shown.
piet-gpu/shader/kernel3.comp
@@ -1,135 +0,0 @@
-// This is "kernel 3" in a 4-kernel pipeline. It walks the active items
-// for the tilegroup and produces a per-tile command list for each tile.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-layout(local_size_x = 32, local_size_y = 1) in;
-
-layout(set = 0, binding = 0) readonly buffer SceneBuf {
-    uint[] scene;
-};
-
-// TODO: this should have a `readonly` qualifier, but then inclusion
-// of ptcl.h would fail because of the writers.
-layout(set = 0, binding = 1) buffer TilegroupBuf {
-    uint[] tilegroup;
-};
-
-// Used readonly
-layout(set = 0, binding = 2) buffer SegmentBuf {
-    uint[] segment;
-};
-
-// Used readonly
-layout(set = 0, binding = 3) buffer FillSegmentBuf {
-    uint[] fill_seg;
-};
-
-layout(set = 0, binding = 4) buffer PtclBuf {
-    uint[] ptcl;
-};
-
-layout(set = 0, binding = 5) buffer AllocBuf {
-    uint alloc;
-};
-
-#include "scene.h"
-#include "tilegroup.h"
-#include "segment.h"
-#include "fill_seg.h"
-#include "ptcl.h"
-
-#include "setup.h"
-
-void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
-    if (cmd_ref.offset > cmd_limit) {
-        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
-        CmdJump jump = CmdJump(new_cmd);
-        Cmd_Jump_write(cmd_ref, jump);
-        cmd_ref = CmdRef(new_cmd);
-        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
-    }
-}
-
-void main() {
-    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
-    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
-        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
-    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
-    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
-    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
-    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
-
-    TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
-    FillTileHeader fill_th = FillTileHeader_read(FillTileHeaderRef(tile_ix * FillTileHeader_size));
-
-    while (true) {
-        uint tg_tag = TileGroup_tag(tg_ref);
-        if (tg_tag == TileGroup_End) {
-            break;
-        }
-        if (tg_tag == TileGroup_Jump) {
-            tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
-            continue;
-        }
-        // Assume tg_tag is `Instance`, though there will be more cases.
-        Instance ins = TileGroup_Instance_read(tg_ref);
-        PietItemRef item_ref = PietItemRef(ins.item_ref);
-        uint item_tag = PietItem_tag(item_ref);
-        switch (item_tag) {
-        case PietItem_Circle:
-            PietCircle circle = PietItem_Circle_read(item_ref);
-            vec2 center = ins.offset + circle.center.xy;
-            float r = circle.radius;
-            if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
-                && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
-            {
-                CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
-                alloc_cmd(cmd_ref, cmd_limit);
-                Cmd_Circle_write(cmd_ref, cmd);
-                cmd_ref.offset += Cmd_size;
-            }
-            break;
-        case PietItem_Poly:
-            ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
-            stroke_th.items.offset += ItemHeader_size;
-            if (stroke_item.segments.offset != 0) {
-                PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
-                CmdStroke cmd = CmdStroke(
-                    stroke_item.segments.offset,
-                    0.5 * poly.width,
-                    poly.rgba_color
-                );
-                alloc_cmd(cmd_ref, cmd_limit);
-                Cmd_Stroke_write(cmd_ref, cmd);
-                cmd_ref.offset += Cmd_size;
-            }
-            break;
-        case PietItem_Fill:
-            FillItemHeader fill_item = FillItemHeader_read(fill_th.items);
-            fill_th.items.offset += FillItemHeader_size;
-            // TODO: handle segments == 0 but backdrop != specially, it's a solid tile.
-            if (fill_item.segments.offset != 0) {
-                PietFill fill = PietItem_Fill_read(item_ref);
-                CmdFill cmd = CmdFill(
-                    fill_item.segments.offset,
-                    fill_item.backdrop,
-                    fill.rgba_color
-                );
-                alloc_cmd(cmd_ref, cmd_limit);
-                Cmd_Fill_write(cmd_ref, cmd);
-                cmd_ref.offset += Cmd_size;
-            } else if (fill_item.backdrop != 0) {
-                // TODO: truncate existing cmd list if alpha is opaque
-                PietFill fill = PietItem_Fill_read(item_ref);
-                alloc_cmd(cmd_ref, cmd_limit);
-                Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
-                cmd_ref.offset += Cmd_size;
-            }
-            break;
-        }
-        tg_ref.offset += TileGroup_size;
-    }
-    Cmd_End_write(cmd_ref);
-}

piet-gpu/shader/kernel3.spv
Binary file not shown.
piet-gpu/shader/kernel4.comp
@@ -9,26 +9,14 @@
 
 layout(local_size_x = 16, local_size_y = 16) in;
 
-// Same concern that this should be readonly as in kernel 3.
+// This should be annotated readonly but infra doesn't support that yet.
 layout(set = 0, binding = 0) buffer PtclBuf {
     uint[] ptcl;
 };
 
-// Used readonly
-layout(set = 0, binding = 1) buffer SegmentBuf {
-    uint[] segment;
-};
-
-// Used readonly
-layout(set = 0, binding = 2) buffer FillSegBuf {
-    uint[] fill_seg;
-};
-
-layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
+layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
 
 #include "ptcl.h"
-#include "segment.h"
-#include "fill_seg.h"
 
 #include "setup.h"
@@ -79,11 +67,11 @@ void main() {
             CmdFill fill = Cmd_Fill_read(cmd_ref);
             // Probably better to store as float, but conversion is no doubt cheap.
             float area = float(fill.backdrop);
-            FillSegChunkRef fill_seg_chunk_ref = FillSegChunkRef(fill.seg_ref);
+            SegChunkRef fill_seg_chunk_ref = SegChunkRef(fill.seg_ref);
             do {
-                FillSegChunk seg_chunk = FillSegChunk_read(fill_seg_chunk_ref);
+                SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref);
                 for (int i = 0; i < seg_chunk.n; i++) {
-                    FillSegment seg = FillSegment_read(FillSegmentRef(fill_seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * i));
+                    Segment seg = Segment_read(SegmentRef(fill_seg_chunk_ref.offset + SegChunk_size + Segment_size * i));
                     vec2 start = seg.start - xy;
                     vec2 end = seg.end - xy;
                     vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);

piet-gpu/shader/kernel4.spv
Binary file not shown.
piet-gpu/shader/ptcl.h
@@ -36,6 +36,14 @@ struct CmdRef {
     uint offset;
 };
 
+struct SegmentRef {
+    uint offset;
+};
+
+struct SegChunkRef {
+    uint offset;
+};
+
 struct CmdCircle {
     vec2 center;
     float radius;
@@ -141,6 +149,28 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
     return CmdRef(ref.offset + index * Cmd_size);
 }
 
+struct Segment {
+    vec2 start;
+    vec2 end;
+};
+
+#define Segment_size 16
+
+SegmentRef Segment_index(SegmentRef ref, uint index) {
+    return SegmentRef(ref.offset + index * Segment_size);
+}
+
+struct SegChunk {
+    uint n;
+    SegChunkRef next;
+};
+
+#define SegChunk_size 8
+
+SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
+    return SegChunkRef(ref.offset + index * SegChunk_size);
+}
+
 CmdCircle CmdCircle_read(CmdCircleRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = ptcl[ix + 0];
@@ -362,3 +392,39 @@ void Cmd_Bail_write(CmdRef ref) {
     ptcl[ref.offset >> 2] = Cmd_Bail;
 }
 
+Segment Segment_read(SegmentRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    Segment s;
+    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+void Segment_write(SegmentRef ref, Segment s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.start.x);
+    ptcl[ix + 1] = floatBitsToUint(s.start.y);
+    ptcl[ix + 2] = floatBitsToUint(s.end.x);
+    ptcl[ix + 3] = floatBitsToUint(s.end.y);
+}
+
+SegChunk SegChunk_read(SegChunkRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    SegChunk s;
+    s.n = raw0;
+    s.next = SegChunkRef(raw1);
+    return s;
+}
+
+void SegChunk_write(SegChunkRef ref, SegChunk s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = s.n;
+    ptcl[ix + 1] = s.next.offset;
+}
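The Segment/SegChunk readers and writers added to ptcl.h above are what will let the coarse pass emit segments straight into the ptcl buffer. A hedged sketch of the producer side, reusing the atomic bump-allocation idiom the deleted kernels used (`n_segs` and `segs` are hypothetical locals; `alloc` and SEG_CHUNK_ALLOC follow the existing setup.h conventions):

    // Sketch only: write one terminated chunk of segments into ptcl.
    uint chunk_offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
    SegChunk_write(SegChunkRef(chunk_offset), SegChunk(n_segs, SegChunkRef(0)));
    for (uint i = 0; i < n_segs; i++) {
        Segment_write(SegmentRef(chunk_offset + SegChunk_size + Segment_size * i), segs[i]);
    }
    // A command such as CmdStroke/CmdFill then records chunk_offset as its seg_ref.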
piet-gpu/shader/segment.h
@@ -1,126 +0,0 @@
-// Code auto-generated by piet-gpu-derive
-
-struct TileHeaderRef {
-    uint offset;
-};
-
-struct ItemHeaderRef {
-    uint offset;
-};
-
-struct SegmentRef {
-    uint offset;
-};
-
-struct SegChunkRef {
-    uint offset;
-};
-
-struct TileHeader {
-    uint n;
-    ItemHeaderRef items;
-};
-
-#define TileHeader_size 8
-
-TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
-    return TileHeaderRef(ref.offset + index * TileHeader_size);
-}
-
-struct ItemHeader {
-    SegChunkRef segments;
-};
-
-#define ItemHeader_size 4
-
-ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
-    return ItemHeaderRef(ref.offset + index * ItemHeader_size);
-}
-
-struct Segment {
-    vec2 start;
-    vec2 end;
-};
-
-#define Segment_size 16
-
-SegmentRef Segment_index(SegmentRef ref, uint index) {
-    return SegmentRef(ref.offset + index * Segment_size);
-}
-
-struct SegChunk {
-    uint n;
-    SegChunkRef next;
-};
-
-#define SegChunk_size 8
-
-SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
-    return SegChunkRef(ref.offset + index * SegChunk_size);
-}
-
-TileHeader TileHeader_read(TileHeaderRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = segment[ix + 0];
-    uint raw1 = segment[ix + 1];
-    TileHeader s;
-    s.n = raw0;
-    s.items = ItemHeaderRef(raw1);
-    return s;
-}
-
-void TileHeader_write(TileHeaderRef ref, TileHeader s) {
-    uint ix = ref.offset >> 2;
-    segment[ix + 0] = s.n;
-    segment[ix + 1] = s.items.offset;
-}
-
-ItemHeader ItemHeader_read(ItemHeaderRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = segment[ix + 0];
-    ItemHeader s;
-    s.segments = SegChunkRef(raw0);
-    return s;
-}
-
-void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
-    uint ix = ref.offset >> 2;
-    segment[ix + 0] = s.segments.offset;
-}
-
-Segment Segment_read(SegmentRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = segment[ix + 0];
-    uint raw1 = segment[ix + 1];
-    uint raw2 = segment[ix + 2];
-    uint raw3 = segment[ix + 3];
-    Segment s;
-    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    return s;
-}
-
-void Segment_write(SegmentRef ref, Segment s) {
-    uint ix = ref.offset >> 2;
-    segment[ix + 0] = floatBitsToUint(s.start.x);
-    segment[ix + 1] = floatBitsToUint(s.start.y);
-    segment[ix + 2] = floatBitsToUint(s.end.x);
-    segment[ix + 3] = floatBitsToUint(s.end.y);
-}
-
-SegChunk SegChunk_read(SegChunkRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = segment[ix + 0];
-    uint raw1 = segment[ix + 1];
-    SegChunk s;
-    s.n = raw0;
-    s.next = SegChunkRef(raw1);
-    return s;
-}
-
-void SegChunk_write(SegChunkRef ref, SegChunk s) {
-    uint ix = ref.offset >> 2;
-    segment[ix + 0] = s.n;
-    segment[ix + 1] = s.next.offset;
-}
piet-gpu/src/lib.rs
@@ -209,16 +209,11 @@ impl<D: Device> Renderer<D> {
             &[],
         )?;
 
-        // These will probably be combined with the ptcl buf, as they're all written by the
-        // same kernel now.
-        let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
-        let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
-
        let k4_code = include_bytes!("../shader/kernel4.spv");
-        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
+        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
         let k4_ds = device.create_descriptor_set(
             &k4_pipeline,
-            &[&ptcl_buf, &segment_buf, &fill_seg_buf],
+            &[&ptcl_buf],
             &[&image_dev],
         )?;
 