Merge pull request #19 from linebender/sort_middle

Bring sort_middle branch to master
Raph Levien, 2020-06-11 16:16:10 -07:00 (committed by GitHub)
commit 73df5534a1
41 changed files with 2673 additions and 1310 deletions

Cargo.lock (generated), 72 lines changed

@@ -26,6 +26,15 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "000444226fcff248f2bc4c7625be32c63caccfecc2723a2b9f78a7487a49c407"
[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
dependencies = [
"winapi 0.3.8",
]
[[package]]
name = "approx"
version = "0.3.2"
@@ -59,6 +68,17 @@ dependencies = [
"raw-window-handle",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi 0.3.8",
]
[[package]]
name = "autocfg"
version = "1.0.0"
@@ -106,6 +126,21 @@ version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
[[package]]
name = "clap"
version = "2.33.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129"
dependencies = [
"ansi_term",
"atty",
"bitflags",
"strsim",
"textwrap",
"unicode-width",
"vec_map",
]
[[package]]
name = "cloudabi"
version = "0.0.3"
@@ -259,6 +294,15 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f36b5f248235f45773d4944f555f83ea61fe07b18b561ccf99d7483d7381e54d"
[[package]]
name = "hermit-abi"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71"
dependencies = [
"libc",
]
[[package]]
name = "inflate"
version = "0.4.5"
@@ -525,6 +569,7 @@ dependencies = [
name = "piet-gpu"
version = "0.1.0"
dependencies = [
"clap",
"piet",
"piet-gpu-hal",
"piet-gpu-types",
@@ -758,6 +803,12 @@ dependencies = [
"byteorder",
]
[[package]]
name = "strsim"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "syn"
version = "1.0.17"
@@ -769,6 +820,21 @@ dependencies = [
"unicode-xid 0.2.0",
]
[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]]
name = "unicode-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
[[package]]
name = "unicode-xid"
version = "0.1.0"
@@ -781,6 +847,12 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
[[package]]
name = "vec_map"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "void"
version = "1.0.2"

View file

@@ -0,0 +1,53 @@
use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod annotated {
struct AnnoFillLineSeg {
p0: [f32; 2],
p1: [f32; 2],
// A note: the layout of this struct is shared with
// AnnoStrokeLineSeg. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
}
struct AnnoStrokeLineSeg {
p0: [f32; 2],
p1: [f32; 2],
// halfwidth in both x and y for binning
stroke: [f32; 2],
}
struct AnnoQuadSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
stroke: [f32; 2],
}
struct AnnoCubicSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
stroke: [f32; 2],
}
struct AnnoFill {
rgba_color: u32,
bbox: [f32; 4],
}
struct AnnoStroke {
rgba_color: u32,
bbox: [f32; 4],
// For the nonuniform scale case, this needs to be a 2x2 matrix.
// That's expected to be uncommon, so we could special-case it.
linewidth: f32,
}
enum Annotated {
Nop,
FillLine(AnnoFillLineSeg),
StrokeLine(AnnoStrokeLineSeg),
Quad(AnnoQuadSeg),
Cubic(AnnoCubicSeg),
Stroke(AnnoStroke),
Fill(AnnoFill),
}
}
}
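
A note on the generated layout (see piet-gpu/shader/annotated.h later in this diff): each Annotated element occupies a fixed 44-byte slot, a 4-byte tag word followed by the payload of the largest variant (AnnoCubicSeg, 40 bytes). A minimal Rust sketch of that addressing, with hypothetical helper names:

// Hypothetical helpers mirroring the generated annotated.h addressing.
const ANNOTATED_SIZE: u32 = 44; // Annotated_size in the generated header

/// Byte offset of the tag word for element `ix`.
fn anno_tag_offset(ix: u32) -> u32 {
    ix * ANNOTATED_SIZE
}

/// Byte offset of the variant payload for element `ix` (skips the tag word).
fn anno_payload_offset(ix: u32) -> u32 {
    ix * ANNOTATED_SIZE + 4
}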

View file

@@ -0,0 +1,22 @@
use piet_gpu_derive::piet_gpu;
// The output of the binning stage, organized as a linked list of chunks.
piet_gpu! {
#[gpu_write]
mod bins {
struct BinInstance {
element_ix: u32,
// Right edge of the bounding box of the associated fill
// element; used in backdrop computation.
right_edge: f32,
}
struct BinChunk {
// First chunk can have n = 0, subsequent ones not.
n: u32,
next: Ref<BinChunk>,
// Instances follow
}
}
}
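
The declared chunk layout is a word for n, a word for next, then n two-word BinInstances. A hedged CPU-side sketch of walking one bin's list from a u32 view of the bins buffer, assuming exactly that layout:

// Hedged sketch: walk one bin's chunk list. Per BinChunk: n, next.offset,
// then n BinInstances of two words each (element_ix, right_edge).
// A next offset of 0 ends the list.
fn walk_bin(buf: &[u32], mut chunk_offset: usize) {
    loop {
        let base = chunk_offset / 4;
        let n = buf[base] as usize;
        let next = buf[base + 1] as usize;
        for i in 0..n {
            let element_ix = buf[base + 2 + i * 2];
            let right_edge = f32::from_bits(buf[base + 2 + i * 2 + 1]);
            println!("element {} right_edge {}", element_ix, right_edge);
        }
        if next == 0 {
            break;
        }
        chunk_offset = next;
    }
}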

View file

@@ -1,37 +0,0 @@
use piet_gpu_derive::piet_gpu;
// Structures representing segments for fill items.
// There is some cut'n'paste here from stroke segments, which can be
// traced to the fact that buffers in GLSL are basically global.
// Maybe there's a way to address that, but in the meantime living
// with the duplication is easiest.
piet_gpu! {
#[gpu_write]
mod fill_seg {
struct FillTileHeader {
n: u32,
items: Ref<FillItemHeader>,
}
struct FillItemHeader {
backdrop: i32,
segments: Ref<FillSegChunk>,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct FillSegment {
start: [f32; 2],
end: [f32; 2],
}
struct FillSegChunk {
n: u32,
next: Ref<FillSegChunk>,
// Segments follow (could represent this as a variable sized array).
}
}
}

View file

@@ -1,7 +1,10 @@
+// Structures used only internally probably don't need to be pub.
+pub mod annotated;
+pub mod bins;
pub mod encoder;
-pub mod fill_seg;
pub mod ptcl;
pub mod scene;
-pub mod segment;
+pub mod state;
pub mod test;
pub mod tilegroup;

View file

@@ -5,9 +5,10 @@ fn main() {
.expect("provide a module name");
match mod_name.as_str() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
"annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
"bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
"fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
"ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
"test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
_ => println!("Oops, unknown module name"),

View file

@@ -13,14 +13,13 @@ piet_gpu! {
end: [f32; 2],
}
struct CmdStroke {
-// Should be Ref<SegChunk> if we had cross-module references.
-seg_ref: u32,
+// Consider a specialization to one segment.
+seg_ref: Ref<SegChunk>,
half_width: f32,
rgba_color: u32,
}
struct CmdFill {
-// Should be Ref<FillSegChunk> if we had cross-module references.
-seg_ref: u32,
+seg_ref: Ref<SegChunk>,
backdrop: i32,
rgba_color: u32,
}
@@ -51,5 +50,24 @@ piet_gpu! {
Jump(CmdJump),
Bail,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct Segment {
start: [f32; 2],
end: [f32; 2],
// This is used for fills only, but we're including it in
// the general structure for simplicity.
y_edge: f32,
}
struct SegChunk {
n: u32,
next: Ref<SegChunk>,
// Actually a reference to a variable-sized slice.
segs: Ref<Segment>,
}
}
}
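
These SegChunk lists are what CmdFill and CmdStroke point at; trace_ptcl in piet-gpu's main.rs (later in this diff) decodes the same layout word by word. A hedged Rust sketch of that walk:

// Hedged sketch: visit every Segment reachable from a CmdFill/CmdStroke
// seg_ref. SegChunk is [n, next, segs]; each Segment is five f32 words
// (x0, y0, x1, y1, y_edge). A chunk offset of 0 ends the list.
fn for_each_segment(buf: &[u32], mut chunk: usize, mut f: impl FnMut([f32; 5])) {
    while chunk != 0 {
        let n = buf[chunk / 4] as usize;
        let next = buf[chunk / 4 + 1] as usize;
        let segs = buf[chunk / 4 + 2] as usize;
        for i in 0..n {
            let mut s = [0.0f32; 5];
            for k in 0..5 {
                s[k] = f32::from_bits(buf[segs / 4 + i * 5 + k]);
            }
            f(s);
        }
        chunk = next;
    }
}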

View file

@@ -4,6 +4,8 @@ pub use self::scene::{
Bbox, PietCircle, PietFill, PietItem, PietStrokeLine, PietStrokePolyLine, Point, SimpleGroup,
};
pub use self::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform};
piet_gpu! {
#[rust_encode]
mod scene {
@@ -51,5 +53,53 @@ piet_gpu! {
Fill(PietFill),
Poly(PietStrokePolyLine),
}
// New approach follows (above to be deleted)
struct LineSeg {
p0: [f32; 2],
p1: [f32; 2],
}
struct QuadSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
}
struct CubicSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
}
struct Fill {
rgba_color: u32,
}
struct Stroke {
rgba_color: u32,
}
struct SetLineWidth {
width: f32,
}
struct Transform {
mat: [f32; 4],
translate: [f32; 2],
}
enum Element {
Nop,
// Another approach to encoding would be to use a single
// variant but have a bool for fill/stroke. This could be
// packed into the tag, so the on-the-wire representation
// would be very similar to what's here.
StrokeLine(LineSeg),
FillLine(LineSeg),
// Note: we'll need to handle the stroke/fill distinction
// for these as well, when we do flattening on the GPU.
Quad(QuadSeg),
Cubic(CubicSeg),
Stroke(Stroke),
Fill(Fill),
SetLineWidth(SetLineWidth),
Transform(Transform),
}
}
}
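
Since the pub use above re-exports these as ordinary Rust types, a scene under the new encoding is a flat sequence of Element variants: segments first, then the Stroke or Fill element that carries the color and closes the path (it also resets the bbox accumulation, per FLAG_RESET_BBOX in elements.comp). A hedged sketch of the order, not the generated encoder API:

// Hedged sketch of the element order for one stroked line.
use piet_gpu_types::scene::{Element, LineSeg, SetLineWidth, Stroke};

fn stroked_line_elements() -> Vec<Element> {
    vec![
        Element::SetLineWidth(SetLineWidth { width: 2.0 }),
        Element::StrokeLine(LineSeg {
            p0: [0.0, 0.0],
            p1: [100.0, 100.0],
        }),
        // Stroke closes the path and carries the color.
        Element::Stroke(Stroke { rgba_color: 0xff00_00ff }),
    ]
}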

View file

@@ -1,32 +0,0 @@
use piet_gpu_derive::piet_gpu;
// Structures representing segments for stroke/fill items.
piet_gpu! {
#[gpu_write]
mod segment {
struct TileHeader {
n: u32,
items: Ref<ItemHeader>,
}
// Note: this is only suitable for strokes, fills require backdrop.
struct ItemHeader {
segments: Ref<SegChunk>,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct Segment {
start: [f32; 2],
end: [f32; 2],
}
struct SegChunk {
n: u32,
next: Ref<SegChunk>,
// Segments follow (could represent this as a variable sized array).
}
}
}

View file

@@ -0,0 +1,14 @@
use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod state {
struct State {
mat: [f32; 4],
translate: [f32; 2],
bbox: [f32; 4],
linewidth: f32,
flags: u32,
}
}
}
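
State is the running value of the scan in elements.comp (later in this diff): mat plus translate is the current affine transform, bbox accumulates the current path's extent, and flags records what has been set. Applying the transform to a point, as the element kernel does when annotating segments, has this hedged Rust mirror:

// Rust mirror of the point transform in elements.comp:
// st.mat.xy * p.x + st.mat.zw * p.y + st.translate
fn apply(mat: [f32; 4], translate: [f32; 2], p: [f32; 2]) -> [f32; 2] {
    [
        mat[0] * p[0] + mat[2] * p[1] + translate[0],
        mat[1] * p[0] + mat[3] * p[1] + translate[1],
    ]
}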

View file

@ -26,3 +26,4 @@ png = "0.16.2"
rand = "0.7.3"
roxmltree = "0.11"
winit = "0.22"
clap = "2.33"

View file

@@ -2,10 +2,12 @@ use std::fs::File;
use std::io::BufWriter;
use std::path::Path;
+use clap::{Arg, App};
use piet_gpu_hal::vulkan::VkInstance;
use piet_gpu_hal::{CmdBuf, Device, Error, MemFlags};
-use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
+use piet_gpu::{render_scene, render_svg, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
#[allow(unused)]
fn dump_scene(buf: &[u8]) {
@@ -16,22 +18,179 @@ fn dump_scene(buf: &[u8]) {
}
}
#[allow(unused)]
fn dump_state(buf: &[u8]) {
for i in 0..(buf.len() / 48) {
let j = i * 48;
let floats = (0..11).map(|k| {
let mut buf_f32 = [0u8; 4];
buf_f32.copy_from_slice(&buf[j + k * 4..j + k * 4 + 4]);
f32::from_le_bytes(buf_f32)
}).collect::<Vec<_>>();
println!("{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}",
i,
floats[0], floats[1], floats[2], floats[3], floats[4], floats[5],
floats[6], floats[7], floats[8], floats[9],
floats[10], buf[j + 44]);
}
}
/// Interpret the output of the binning stage, for diagnostic purposes.
#[allow(unused)]
fn trace_merge(buf: &[u32]) {
for bin in 0..256 {
println!("bin {}:", bin);
let mut starts = (0..16).map(|i| Some((bin * 16 + i) * 64)).collect::<Vec<Option<usize>>>();
loop {
let min_start = starts.iter().map(|st|
st.map(|st|
if buf[st / 4] == 0 {
!0
} else {
buf[st / 4 + 2]
}).unwrap_or(!0)).min().unwrap();
if min_start == !0 {
break;
}
let mut selected = !0;
for i in 0..16 {
if let Some(st) = starts[i] {
if buf[st/4] != 0 && buf[st/4 + 2] == min_start {
selected = i;
break;
}
}
}
let st = starts[selected].unwrap();
println!("selected {}, start {:x}", selected, st);
for j in 0..buf[st/4] {
println!("{:x}", buf[st/4 + 2 + j as usize])
}
if buf[st/4 + 1] == 0 {
starts[selected] = None;
} else {
starts[selected] = Some(buf[st/4 + 1] as usize);
}
}
}
}
/// Interpret the output of the coarse raster stage, for diagnostic purposes.
#[allow(unused)]
fn trace_ptcl(buf: &[u32]) {
for y in 0..96 {
for x in 0..128 {
let tile_ix = y * 128 + x;
println!("tile {} @({}, {})", tile_ix, x, y);
let mut tile_offset = tile_ix * 1024;
loop {
let tag = buf[tile_offset / 4];
match tag {
0 => break,
3 => {
let backdrop = buf[tile_offset / 4 + 2];
let rgba_color = buf[tile_offset / 4 + 3];
println!(" {:x}: fill {:x} {}", tile_offset, rgba_color, backdrop);
let mut seg_chunk = buf[tile_offset / 4 + 1] as usize;
let n = buf[seg_chunk / 4] as usize;
let segs = buf[seg_chunk / 4 + 2] as usize;
println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
for i in 0..n {
let x0 = f32::from_bits(buf[segs / 4 + i * 5]);
let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]);
let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]);
let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]);
let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]);
println!(" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge);
}
loop {
seg_chunk = buf[seg_chunk / 4 + 1] as usize;
if seg_chunk == 0 {
break;
}
}
}
4 => {
let line_width = f32::from_bits(buf[tile_offset / 4 + 2]);
let rgba_color = buf[tile_offset / 4 + 3];
println!(" {:x}: stroke {:x} {}", tile_offset, rgba_color, line_width);
let mut seg_chunk = buf[tile_offset / 4 + 1] as usize;
let n = buf[seg_chunk / 4] as usize;
let segs = buf[seg_chunk / 4 + 2] as usize;
println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
for i in 0..n {
let x0 = f32::from_bits(buf[segs / 4 + i * 5]);
let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]);
let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]);
let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]);
let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]);
println!(" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge);
}
loop {
seg_chunk = buf[seg_chunk / 4 + 1] as usize;
if seg_chunk == 0 {
break;
}
}
}
_ => {
println!("{:x}: {}", tile_offset, tag);
}
}
if tag == 0 {
break;
}
if tag == 8 {
tile_offset = buf[tile_offset / 4 + 1] as usize;
} else {
tile_offset += 20;
}
}
}
}
}
fn main() -> Result<(), Error> {
let matches = App::new("piet-gpu test")
.arg(Arg::with_name("INPUT")
.index(1))
.arg(Arg::with_name("flip")
.short("f")
.long("flip"))
.arg(Arg::with_name("scale")
.short("s")
.long("scale")
.takes_value(true))
.get_matches();
let (instance, _) = VkInstance::new(None)?;
unsafe {
let device = instance.device(None)?;
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
-let query_pool = device.create_query_pool(6)?;
+let query_pool = device.create_query_pool(5)?;
let mut ctx = PietGpuRenderContext::new();
-render_scene(&mut ctx);
+if let Some(input) = matches.value_of("INPUT") {
+let mut scale = matches.value_of("scale")
+.map(|scale| scale.parse().unwrap())
+.unwrap_or(8.0);
+if matches.is_present("flip") {
+scale = -scale;
+}
+render_svg(&mut ctx, input, scale);
+} else {
+render_scene(&mut ctx);
+}
let scene = ctx.get_scene_buf();
//dump_scene(&scene);
let renderer = Renderer::new(&device, scene)?;
-let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
+let image_buf =
+device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
cmd_buf.begin();
renderer.record(&mut cmd_buf, &query_pool);
@@ -39,29 +198,17 @@ fn main() -> Result<(), Error> {
cmd_buf.finish();
device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
device.wait_and_reset(&[fence])?;
-let timestamps = device.reap_query_pool(&query_pool).unwrap();
-println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
-println!(
-"Kernel 2s time: {:.3}ms",
-(timestamps[1] - timestamps[0]) * 1e3
-);
-println!(
-"Kernel 2f time: {:.3}ms",
-(timestamps[2] - timestamps[1]) * 1e3
-);
-println!(
-"Kernel 3 time: {:.3}ms",
-(timestamps[3] - timestamps[2]) * 1e3
-);
-println!(
-"Render time: {:.3}ms",
-(timestamps[4] - timestamps[3]) * 1e3
-);
+let ts = device.reap_query_pool(&query_pool).unwrap();
+println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
+println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
+println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
+println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
/*
-let mut k1_data: Vec<u32> = Default::default();
-device.read_buffer(&segment_buf, &mut k1_data).unwrap();
-dump_k1_data(&k1_data);
+let mut data: Vec<u32> = Default::default();
+device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+piet_gpu::dump_k1_data(&data);
+//trace_ptcl(&data);
*/
let mut img_data: Vec<u8> = Default::default();

View file

@@ -1,7 +1,7 @@
use piet_gpu_hal::vulkan::VkInstance;
use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout};
-use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
+use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
use winit::{
event::{Event, WindowEvent},
@@ -37,7 +37,7 @@ fn main() -> Result<(), Error> {
.map(|_| device.create_cmd_buf())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
-.map(|_| device.create_query_pool(6))
+.map(|_| device.create_query_pool(5))
.collect::<Result<Vec<_>, Error>>()?;
let mut ctx = PietGpuRenderContext::new();
@@ -69,12 +69,12 @@
device.wait_and_reset(&[frame_fences[frame_idx]]).unwrap();
let timestamps = device.reap_query_pool(query_pool).unwrap();
window.set_title(&format!("k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms",
window.set_title(&format!(
"e: {:.3}ms, b: {:.3}ms, c: {:.3}ms, f: {:.3}ms",
timestamps[0] * 1e3,
(timestamps[1] - timestamps[0]) * 1e3,
(timestamps[2] - timestamps[1]) * 1e3,
(timestamps[3] - timestamps[2]) * 1e3,
(timestamps[4] - timestamps[3]) * 1e3,
));
}
@@ -93,11 +93,7 @@
ImageLayout::BlitDst,
);
cmd_buf.blit_image(&renderer.image_dev, &swap_image);
-cmd_buf.image_barrier(
-&swap_image,
-ImageLayout::BlitDst,
-ImageLayout::Present,
-);
+cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish();
device

piet-gpu/shader/annotated.h (new file, 335 lines)

@@ -0,0 +1,335 @@
// Code auto-generated by piet-gpu-derive
struct AnnoFillLineSegRef {
uint offset;
};
struct AnnoStrokeLineSegRef {
uint offset;
};
struct AnnoQuadSegRef {
uint offset;
};
struct AnnoCubicSegRef {
uint offset;
};
struct AnnoFillRef {
uint offset;
};
struct AnnoStrokeRef {
uint offset;
};
struct AnnotatedRef {
uint offset;
};
struct AnnoFillLineSeg {
vec2 p0;
vec2 p1;
};
#define AnnoFillLineSeg_size 16
AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size);
}
struct AnnoStrokeLineSeg {
vec2 p0;
vec2 p1;
vec2 stroke;
};
#define AnnoStrokeLineSeg_size 24
AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) {
return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size);
}
struct AnnoQuadSeg {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 stroke;
};
#define AnnoQuadSeg_size 32
AnnoQuadSegRef AnnoQuadSeg_index(AnnoQuadSegRef ref, uint index) {
return AnnoQuadSegRef(ref.offset + index * AnnoQuadSeg_size);
}
struct AnnoCubicSeg {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
vec2 stroke;
};
#define AnnoCubicSeg_size 40
AnnoCubicSegRef AnnoCubicSeg_index(AnnoCubicSegRef ref, uint index) {
return AnnoCubicSegRef(ref.offset + index * AnnoCubicSeg_size);
}
struct AnnoFill {
uint rgba_color;
vec4 bbox;
};
#define AnnoFill_size 20
AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) {
return AnnoFillRef(ref.offset + index * AnnoFill_size);
}
struct AnnoStroke {
uint rgba_color;
vec4 bbox;
float linewidth;
};
#define AnnoStroke_size 24
AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
return AnnoStrokeRef(ref.offset + index * AnnoStroke_size);
}
#define Annotated_Nop 0
#define Annotated_FillLine 1
#define Annotated_StrokeLine 2
#define Annotated_Quad 3
#define Annotated_Cubic 4
#define Annotated_Stroke 5
#define Annotated_Fill 6
#define Annotated_size 44
AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
return AnnotatedRef(ref.offset + index * Annotated_size);
}
AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
AnnoFillLineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.p0.x);
annotated[ix + 1] = floatBitsToUint(s.p0.y);
annotated[ix + 2] = floatBitsToUint(s.p1.x);
annotated[ix + 3] = floatBitsToUint(s.p1.y);
}
AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
uint raw5 = annotated[ix + 5];
AnnoStrokeLineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
return s;
}
void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.p0.x);
annotated[ix + 1] = floatBitsToUint(s.p0.y);
annotated[ix + 2] = floatBitsToUint(s.p1.x);
annotated[ix + 3] = floatBitsToUint(s.p1.y);
annotated[ix + 4] = floatBitsToUint(s.stroke.x);
annotated[ix + 5] = floatBitsToUint(s.stroke.y);
}
AnnoQuadSeg AnnoQuadSeg_read(AnnoQuadSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
uint raw5 = annotated[ix + 5];
uint raw6 = annotated[ix + 6];
uint raw7 = annotated[ix + 7];
AnnoQuadSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.stroke = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
return s;
}
void AnnoQuadSeg_write(AnnoQuadSegRef ref, AnnoQuadSeg s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.p0.x);
annotated[ix + 1] = floatBitsToUint(s.p0.y);
annotated[ix + 2] = floatBitsToUint(s.p1.x);
annotated[ix + 3] = floatBitsToUint(s.p1.y);
annotated[ix + 4] = floatBitsToUint(s.p2.x);
annotated[ix + 5] = floatBitsToUint(s.p2.y);
annotated[ix + 6] = floatBitsToUint(s.stroke.x);
annotated[ix + 7] = floatBitsToUint(s.stroke.y);
}
AnnoCubicSeg AnnoCubicSeg_read(AnnoCubicSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
uint raw5 = annotated[ix + 5];
uint raw6 = annotated[ix + 6];
uint raw7 = annotated[ix + 7];
uint raw8 = annotated[ix + 8];
uint raw9 = annotated[ix + 9];
AnnoCubicSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
s.stroke = vec2(uintBitsToFloat(raw8), uintBitsToFloat(raw9));
return s;
}
void AnnoCubicSeg_write(AnnoCubicSegRef ref, AnnoCubicSeg s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.p0.x);
annotated[ix + 1] = floatBitsToUint(s.p0.y);
annotated[ix + 2] = floatBitsToUint(s.p1.x);
annotated[ix + 3] = floatBitsToUint(s.p1.y);
annotated[ix + 4] = floatBitsToUint(s.p2.x);
annotated[ix + 5] = floatBitsToUint(s.p2.y);
annotated[ix + 6] = floatBitsToUint(s.p3.x);
annotated[ix + 7] = floatBitsToUint(s.p3.y);
annotated[ix + 8] = floatBitsToUint(s.stroke.x);
annotated[ix + 9] = floatBitsToUint(s.stroke.y);
}
AnnoFill AnnoFill_read(AnnoFillRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
AnnoFill s;
s.rgba_color = raw0;
s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
return s;
}
void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = s.rgba_color;
annotated[ix + 1] = floatBitsToUint(s.bbox.x);
annotated[ix + 2] = floatBitsToUint(s.bbox.y);
annotated[ix + 3] = floatBitsToUint(s.bbox.z);
annotated[ix + 4] = floatBitsToUint(s.bbox.w);
}
AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
uint raw5 = annotated[ix + 5];
AnnoStroke s;
s.rgba_color = raw0;
s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
s.linewidth = uintBitsToFloat(raw5);
return s;
}
void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = s.rgba_color;
annotated[ix + 1] = floatBitsToUint(s.bbox.x);
annotated[ix + 2] = floatBitsToUint(s.bbox.y);
annotated[ix + 3] = floatBitsToUint(s.bbox.z);
annotated[ix + 4] = floatBitsToUint(s.bbox.w);
annotated[ix + 5] = floatBitsToUint(s.linewidth);
}
uint Annotated_tag(AnnotatedRef ref) {
return annotated[ref.offset >> 2];
}
AnnoFillLineSeg Annotated_FillLine_read(AnnotatedRef ref) {
return AnnoFillLineSeg_read(AnnoFillLineSegRef(ref.offset + 4));
}
AnnoStrokeLineSeg Annotated_StrokeLine_read(AnnotatedRef ref) {
return AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef(ref.offset + 4));
}
AnnoQuadSeg Annotated_Quad_read(AnnotatedRef ref) {
return AnnoQuadSeg_read(AnnoQuadSegRef(ref.offset + 4));
}
AnnoCubicSeg Annotated_Cubic_read(AnnotatedRef ref) {
return AnnoCubicSeg_read(AnnoCubicSegRef(ref.offset + 4));
}
AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) {
return AnnoStroke_read(AnnoStrokeRef(ref.offset + 4));
}
AnnoFill Annotated_Fill_read(AnnotatedRef ref) {
return AnnoFill_read(AnnoFillRef(ref.offset + 4));
}
void Annotated_Nop_write(AnnotatedRef ref) {
annotated[ref.offset >> 2] = Annotated_Nop;
}
void Annotated_FillLine_write(AnnotatedRef ref, AnnoFillLineSeg s) {
annotated[ref.offset >> 2] = Annotated_FillLine;
AnnoFillLineSeg_write(AnnoFillLineSegRef(ref.offset + 4), s);
}
void Annotated_StrokeLine_write(AnnotatedRef ref, AnnoStrokeLineSeg s) {
annotated[ref.offset >> 2] = Annotated_StrokeLine;
AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(ref.offset + 4), s);
}
void Annotated_Quad_write(AnnotatedRef ref, AnnoQuadSeg s) {
annotated[ref.offset >> 2] = Annotated_Quad;
AnnoQuadSeg_write(AnnoQuadSegRef(ref.offset + 4), s);
}
void Annotated_Cubic_write(AnnotatedRef ref, AnnoCubicSeg s) {
annotated[ref.offset >> 2] = Annotated_Cubic;
AnnoCubicSeg_write(AnnoCubicSegRef(ref.offset + 4), s);
}
void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) {
annotated[ref.offset >> 2] = Annotated_Stroke;
AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s);
}
void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) {
annotated[ref.offset >> 2] = Annotated_Fill;
AnnoFill_write(AnnoFillRef(ref.offset + 4), s);
}

View file

@@ -0,0 +1,193 @@
// The binning stage of the pipeline.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
// This is for scanning forward for right_edge data.
layout(set = 0, binding = 1) buffer StateBuf {
uint[] state;
};
layout(set = 0, binding = 2) buffer AllocBuf {
uint n_elements;
// Will be incremented atomically to claim tiles
uint tile_ix;
uint alloc;
};
layout(set = 0, binding = 3) buffer BinsBuf {
uint[] bins;
};
#include "annotated.h"
#include "state.h"
#include "bins.h"
// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
#define TSY (1.0 / float(TILE_HEIGHT_PX))
// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared uint sh_chunk_start[N_TILE];
shared float sh_right_edge[N_TILE];
#define StateBuf_stride (8 + 2 * State_size)
uint state_right_edge_index(uint partition_ix) {
return 2 + partition_ix * (StateBuf_stride / 4);
}
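// Layout of the state buffer, as written by elements.comp: word 0 is the
// atomic partition counter; each partition then gets a flag word and a
// right_edge word (the 8-byte header in StateBuf_stride), followed by its
// aggregate and prefix State structs.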
void main() {
uint chunk_n = 0;
uint my_n_elements = n_elements;
uint my_partition = gl_WorkGroupID.x;
for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0;
}
barrier();
// Read inputs and determine coverage of bins
uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
uint tag = Annotated_Nop;
if (element_ix < my_n_elements) {
tag = Annotated_tag(ref);
}
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
float my_right_edge = INFINITY;
bool crosses_edge = false;
switch (tag) {
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY);
break;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
x0 = int(floor(fill.bbox.x * SX));
y0 = int(floor(fill.bbox.y * SY));
x1 = int(ceil(fill.bbox.z * SX));
y1 = int(ceil(fill.bbox.w * SY));
// It probably makes more sense to track x1, to avoid having to redo
// the rounding to tile coords.
my_right_edge = fill.bbox.z;
break;
}
// If the last element in this partition is a fill edge, then we need to do a
// look-forward to find the right edge of its corresponding fill. That data is
// recorded in aggregates computed in the element processing pass.
if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
// This is sequential but the expectation is that the amount of
// look-forward is small (performance may degrade in the case
// of massively complex paths).
do {
my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
aggregate_ix++;
} while (isinf(my_right_edge));
}
// Now propagate right_edge backward, from fill to segment.
for (uint i = 0; i < LG_N_TILE; i++) {
// Note: we could try to cut down on write bandwidth here if the value hasn't
// changed, but not sure it's worth the complexity to track.
sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
barrier();
if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
}
barrier();
}
if (crosses_edge) {
x1 = int(ceil(my_right_edge * SX));
}
// At this point, we run an iterator over the coverage area,
// trying to keep divergence low.
// Right now, it's just a bbox, but we'll get finer with
// segments.
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
if (x0 == x1) y1 = y0;
int x = x0, y = y0;
uint my_slice = gl_LocalInvocationID.x / 32;
uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
while (y < y1) {
atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask);
x++;
if (x == x1) {
x = x0;
y++;
}
}
barrier();
// Allocate output segments.
uint element_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
count[i][gl_LocalInvocationID.x] = element_count;
}
// element_count is number of elements covering bin for this invocation.
uint chunk_start = 0;
if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest)
chunk_start = atomicAdd(alloc, element_count * BinInstance_size);
sh_chunk_start[gl_LocalInvocationID.x] = chunk_start;
}
// Note: it might be more efficient for reading to do this in the
// other order (each bin is a contiguous sequence of partitions)
uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
bins[out_ix] = element_count;
bins[out_ix + 1] = chunk_start;
barrier();
// Use similar strategy as Laine & Karras paper; loop over bbox of bins
// touched by this element
x = x0;
y = y0;
while (y < y1) {
uint bin_ix = y * N_TILE_X + x;
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0) {
uint idx = bitCount(out_mask & (my_mask - 1));
if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix];
}
uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge));
}
x++;
if (x == x1) {
x = x0;
y++;
}
}
}
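
The bin assignment above is a straight bbox-to-grid snap: scale by SX/SY, floor the min corner, ceil the max corner, then clamp to the grid. A hedged Rust rendering of the same math; the tile constants live in setup.h, which this diff doesn't show, so the values below are illustrative assumptions:

// Hedged mirror of binning.comp's bbox-to-bin-range math. Grid constants
// are assumptions standing in for setup.h.
const N_TILE_X: i32 = 16;
const N_TILE_Y: i32 = 16;
const TILE_WIDTH_PX: f32 = 16.0;
const TILE_HEIGHT_PX: f32 = 16.0;

fn bin_range(bbox: [f32; 4]) -> (i32, i32, i32, i32) {
    let sx = 1.0 / (N_TILE_X as f32 * TILE_WIDTH_PX);
    let sy = 1.0 / (N_TILE_Y as f32 * TILE_HEIGHT_PX);
    let x0 = ((bbox[0] * sx).floor() as i32).clamp(0, N_TILE_X);
    let y0 = ((bbox[1] * sy).floor() as i32).clamp(0, N_TILE_Y);
    let x1 = ((bbox[2] * sx).ceil() as i32).clamp(x0, N_TILE_X);
    let y1 = ((bbox[3] * sy).ceil() as i32).clamp(y0, N_TILE_Y);
    (x0, y0, x1, y1)
}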

piet-gpu/shader/binning.spv (new binary file, not shown)

piet-gpu/shader/bins.h (new file, 64 lines)

@@ -0,0 +1,64 @@
// Code auto-generated by piet-gpu-derive
struct BinInstanceRef {
uint offset;
};
struct BinChunkRef {
uint offset;
};
struct BinInstance {
uint element_ix;
float right_edge;
};
#define BinInstance_size 8
BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
return BinInstanceRef(ref.offset + index * BinInstance_size);
}
struct BinChunk {
uint n;
BinChunkRef next;
};
#define BinChunk_size 8
BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
return BinChunkRef(ref.offset + index * BinChunk_size);
}
BinInstance BinInstance_read(BinInstanceRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = bins[ix + 0];
uint raw1 = bins[ix + 1];
BinInstance s;
s.element_ix = raw0;
s.right_edge = uintBitsToFloat(raw1);
return s;
}
void BinInstance_write(BinInstanceRef ref, BinInstance s) {
uint ix = ref.offset >> 2;
bins[ix + 0] = s.element_ix;
bins[ix + 1] = floatBitsToUint(s.right_edge);
}
BinChunk BinChunk_read(BinChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = bins[ix + 0];
uint raw1 = bins[ix + 1];
BinChunk s;
s.n = raw0;
s.next = BinChunkRef(raw1);
return s;
}
void BinChunk_write(BinChunkRef ref, BinChunk s) {
uint ix = ref.offset >> 2;
bins[ix + 0] = s.n;
bins[ix + 1] = s.next.offset;
}

View file

@@ -9,12 +9,11 @@ rule glsl
build image.spv: glsl image.comp | scene.h
-build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
-build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
+build elements.spv: glsl elements.comp | scene.h state.h annotated.h
-build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
+build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
-build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h
+build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
-build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
+build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h

piet-gpu/shader/coarse.comp (new file, 526 lines)

@@ -0,0 +1,526 @@
// The coarse rasterizer stage of the pipeline.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
layout(set = 0, binding = 2) buffer AllocBuf {
uint n_elements;
uint alloc;
};
layout(set = 0, binding = 3) buffer PtclBuf {
uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
#include "ptcl.h"
#define LG_N_PART_READ 8
#define N_PART_READ (1 << LG_N_PART_READ)
shared uint sh_elements[N_TILE];
shared float sh_right_edge[N_TILE];
// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
shared uint sh_part_elements[N_PART_READ];
shared uint sh_bitmaps[N_SLICE][N_TILE];
shared uint sh_backdrop[N_SLICE][N_TILE];
shared uint sh_bd_sign[N_SLICE];
shared uint sh_is_segment[N_SLICE];
// Shared state for parallel segment output stage
// Count of total number of segments in each tile, then
// inclusive prefix sum of same.
shared uint sh_seg_count[N_TILE];
shared uint sh_seg_alloc;
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset > cmd_limit) {
uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_ref, jump);
cmd_ref = CmdRef(new_cmd);
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
}
#define CHUNK_ALLOC_SLAB 16
uint alloc_chunk_remaining;
uint alloc_chunk_offset;
SegChunkRef alloc_seg_chunk() {
if (alloc_chunk_remaining == 0) {
alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size);
alloc_chunk_remaining = CHUNK_ALLOC_SLAB;
}
uint offset = alloc_chunk_offset;
alloc_chunk_offset += SegChunk_size;
alloc_chunk_remaining--;
return SegChunkRef(offset);
}
// Accumulate delta to backdrop.
//
// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each
// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1.
int count_backdrop(uint bd_bitmap, uint bd_sign) {
return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
}
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
uint partition_ix = 0;
uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
// Top left coordinates of this bin.
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
uint th_ix = gl_LocalInvocationID.x;
uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
// Allocation and management of segment output
SegChunkRef first_seg_chunk = SegChunkRef(0);
SegChunkRef last_chunk_ref = SegChunkRef(0);
uint last_chunk_n = 0;
SegmentRef last_chunk_segs = SegmentRef(0);
alloc_chunk_remaining = 0;
// I'm sure we can figure out how to do this with at least one fewer register...
// Items up to rd_ix have been read from sh_elements
uint rd_ix = 0;
// Items up to wr_ix have been written into sh_elements
uint wr_ix = 0;
// Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
uint part_start_ix = 0;
uint ready_ix = 0;
if (th_ix < N_SLICE) {
sh_bd_sign[th_ix] = 0;
}
int backdrop = 0;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
sh_backdrop[i][th_ix] = 0;
}
if (th_ix < N_SLICE) {
sh_is_segment[th_ix] = 0;
}
// parallel read of input partitions
do {
if (ready_ix == wr_ix && partition_ix < n_partitions) {
part_start_ix = ready_ix;
uint count = 0;
if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = bins[in_ix];
sh_part_elements[th_ix] = bins[in_ix + 1];
}
// prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) {
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = count;
}
barrier();
if (th_ix < N_PART_READ) {
if (th_ix >= (1 << i)) {
count += sh_part_count[th_ix - (1 << i)];
}
}
barrier();
}
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = part_start_ix + count;
}
barrier();
ready_ix = sh_part_count[N_PART_READ - 1];
partition_ix += N_PART_READ;
}
// use binary search to find element to read
uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + ((N_PART_READ / 2) >> i);
if (ix >= sh_part_count[probe - 1]) {
part_ix = probe;
}
}
ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
sh_elements[th_ix] = inst.element_ix;
sh_right_edge[th_ix] = inst.right_edge;
}
barrier();
wr_ix = min(rd_ix + N_TILE, ready_ix);
} while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
// We've done the merge and filled the buffer.
// Read one element, compute coverage.
uint tag = Annotated_Nop;
AnnotatedRef ref;
float right_edge = 0.0;
if (th_ix + rd_ix < wr_ix) {
uint element_ix = sh_elements[th_ix];
right_edge = sh_right_edge[th_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
}
// Setup for coverage algorithm.
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
uint my_slice = th_ix / 32;
uint my_mask = 1 << (th_ix & 31);
switch (tag) {
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
float dx = line.p1.x - line.p0.x;
float dy = line.p1.y - line.p0.y;
if (tag == Annotated_FillLine) {
// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
if (dy < 0) {
atomicOr(sh_bd_sign[my_slice], my_mask);
} else {
atomicAnd(sh_bd_sign[my_slice], ~my_mask);
}
}
atomicOr(sh_is_segment[my_slice], my_mask);
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
break;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
xmin = fill.bbox.x;
xmax = fill.bbox.z;
ymin = fill.bbox.y;
ymax = fill.bbox.w;
// Just let the clamping to xmin and xmax determine the bounds.
a = 0.0;
b = 0.0;
c = 1e9;
break;
default:
ymin = 0;
ymax = 0;
break;
}
// Draw the coverage area into the bitmasks. This uses an algorithm
// that computes the coverage of a span for given scanline.
// Compute bounding box in tiles and clip to this bin.
int x0 = int(floor((xmin - xy0.x) * SX));
int x1 = int(ceil((xmax - xy0.x) * SX));
int xr = int(ceil((right_edge - xy0.x) * SX));
int y0 = int(floor((ymin - xy0.y) * SY));
int y1 = int(ceil((ymax - xy0.y) * SY));
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
xr = clamp(xr, 0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
float t = a + b * float(y0);
for (uint y = y0; y < y1; y++) {
uint xx0 = clamp(int(floor(t - c)), x0, x1);
uint xx1 = clamp(int(ceil(t + c)), x0, x1);
for (uint x = xx0; x < xx1; x++) {
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
}
if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
// Assign backdrop to all tiles to the right of the ray crossing the
// top edge of this tile, up to the right edge of the fill bbox.
float xray = t - 0.5 * b;
xx0 = max(int(ceil(xray)), 0);
for (uint x = xx0; x < xr; x++) {
atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
}
}
t += b;
}
barrier();
// We've computed coverage and other info for each element in the input, now for
// the output stage. We'll do segments first using a more parallel algorithm.
uint seg_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
}
sh_seg_count[th_ix] = seg_count;
// Prefix sum of sh_seg_count
for (uint i = 0; i < LG_N_TILE; i++) {
barrier();
if (th_ix >= (1 << i)) {
seg_count += sh_seg_count[th_ix - (1 << i)];
}
barrier();
sh_seg_count[th_ix] = seg_count;
}
if (th_ix == N_TILE - 1) {
sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size);
}
barrier();
uint total_seg_count = sh_seg_count[N_TILE - 1];
uint seg_alloc = sh_seg_alloc;
// Output buffer is allocated as segments for each tile laid end-to-end.
for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) {
// Find the work item; this thread is now not bound to an element or tile.
// First find the tile (by binary search)
uint tile_ix = 0;
for (uint i = 0; i < LG_N_TILE; i++) {
uint probe = tile_ix + ((N_TILE / 2) >> i);
if (ix >= sh_seg_count[probe - 1]) {
tile_ix = probe;
}
}
// Now, sh_seg_count[tile_ix - 1] <= ix < sh_seg_count[tile_ix].
// (considering sh_seg_count[-1] == 0)
// Index of segment within tile's segments
uint seq_ix = ix;
// Maybe consider a sentinel value to avoid the conditional?
if (tile_ix > 0) {
seq_ix -= sh_seg_count[tile_ix - 1];
}
// Find the segment. This is done by linear scan through the bitmaps of the
// tile, accelerated by bit counting. Binary search might help, maybe not.
uint slice_ix = 0;
uint seq_bits;
while (true) {
seq_bits = sh_bitmaps[slice_ix][tile_ix] & sh_is_segment[slice_ix];
uint this_count = bitCount(seq_bits);
if (this_count > seq_ix) {
break;
}
seq_ix -= this_count;
slice_ix++;
}
// Now find position of nth bit set (n = seq_ix) in seq_bits; binary search
uint bit_ix = 0;
for (int i = 0; i < 5; i++) {
uint probe = bit_ix + (16 >> i);
if (seq_ix >= bitCount(seq_bits & ((1 << probe) - 1))) {
bit_ix = probe;
}
}
uint out_offset = seg_alloc + Segment_size * ix + SegChunk_size;
uint rd_el_ix = slice_ix * 32 + bit_ix;
uint element_ix = sh_elements[rd_el_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
AnnoFillLineSeg line = Annotated_FillLine_read(ref);
float y_edge = 0.0;
// This is basically the same logic as piet-metal, but should be made numerically robust.
if (Annotated_tag(ref) == Annotated_FillLine) {
vec2 tile_xy = xy0 + vec2((tile_ix % N_TILE_X) * TILE_WIDTH_PX, (tile_ix / N_TILE_X) * TILE_HEIGHT_PX);
y_edge = mix(line.p0.y, line.p1.y, (tile_xy.x - line.p0.x) / (line.p1.x - line.p0.x));
if (min(line.p0.x, line.p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) {
if (line.p0.x > line.p1.x) {
line.p1 = vec2(tile_xy.x, y_edge);
} else {
line.p0 = vec2(tile_xy.x, y_edge);
}
} else {
y_edge = 1e9;
}
}
Segment seg = Segment(line.p0, line.p1, y_edge);
Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
}
// Output non-segment elements for this tile. The thread does a sequential walk
// through the non-segment elements, and for segments, count and backdrop are
// aggregated using bit counting.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
uint bd_bitmap = sh_backdrop[0][th_ix];
uint bd_sign = sh_bd_sign[0];
uint is_segment = sh_is_segment[0];
uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
seg_count = 0;
while (true) {
uint nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
backdrop += count_backdrop(bd_bitmap, bd_sign);
seg_count += bitCount(bitmap & is_segment);
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
bd_bitmap = sh_backdrop[slice_ix][th_ix];
bd_sign = sh_bd_sign[slice_ix];
is_segment = sh_is_segment[slice_ix];
nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
continue;
}
}
uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
uint element_ix = sh_elements[element_ref_ix];
// Bits up to and including the lsb
uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
seg_count += bitCount(bitmap & bd_mask & is_segment);
// Clear bits that have been consumed.
bd_bitmap &= ~bd_mask;
bitmap &= ~bd_mask;
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag).
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
switch (tag) {
case Annotated_Fill:
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
if (seg_count > 0) {
chunk_ref = alloc_seg_chunk();
SegChunk chunk;
chunk.n = seg_count;
chunk.next = SegChunkRef(0);
uint seg_offset = seg_alloc + seg_start * Segment_size;
chunk.segs = SegmentRef(seg_offset);
SegChunk_write(chunk_ref, chunk);
}
if (last_chunk_n > 0) {
SegChunk chunk;
chunk.n = last_chunk_n;
chunk.next = chunk_ref;
chunk.segs = last_chunk_segs;
SegChunk_write(last_chunk_ref, chunk);
} else {
first_seg_chunk = chunk_ref;
}
AnnoFill fill = Annotated_Fill_read(ref);
CmdFill cmd_fill;
cmd_fill.seg_ref = first_seg_chunk;
cmd_fill.backdrop = backdrop;
cmd_fill.rgba_color = fill.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Fill_write(cmd_ref, cmd_fill);
cmd_ref.offset += Cmd_size;
last_chunk_n = 0;
} else if (backdrop != 0) {
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
cmd_ref.offset += Cmd_size;
}
seg_start += seg_count;
seg_count = 0;
backdrop = 0;
break;
case Annotated_Stroke:
// TODO: reduce divergence & code duplication? Much of the
// fill and stroke processing is in common.
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
if (seg_count > 0) {
chunk_ref = alloc_seg_chunk();
SegChunk chunk;
chunk.n = seg_count;
chunk.next = SegChunkRef(0);
uint seg_offset = seg_alloc + seg_start * Segment_size;
chunk.segs = SegmentRef(seg_offset);
SegChunk_write(chunk_ref, chunk);
}
if (last_chunk_n > 0) {
SegChunk chunk;
chunk.n = last_chunk_n;
chunk.next = chunk_ref;
chunk.segs = last_chunk_segs;
SegChunk_write(last_chunk_ref, chunk);
} else {
first_seg_chunk = chunk_ref;
}
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.seg_ref = first_seg_chunk;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
last_chunk_n = 0;
}
seg_start += seg_count;
seg_count = 0;
break;
default:
// This shouldn't happen, but just in case.
seg_start++;
break;
}
}
if (seg_count > 0) {
SegChunkRef chunk_ref = alloc_seg_chunk();
if (last_chunk_n > 0) {
SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
} else {
first_seg_chunk = chunk_ref;
}
// TODO: free two registers by writing count and segments ref now,
// as opposed to deferring SegChunk write until all fields are known.
last_chunk_ref = chunk_ref;
last_chunk_n = seg_count;
uint seg_offset = seg_alloc + seg_start * Segment_size;
last_chunk_segs = SegmentRef(seg_offset);
}
barrier();
rd_ix += N_TILE;
if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
}
Cmd_End_write(cmd_ref);
}
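
One detail worth pulling out of the shader above: backdrop accumulation reduces to two popcounts per 32-element slice, with the per-segment winding direction stored as a bit in sh_bd_sign. A direct Rust mirror of count_backdrop:

// Rust mirror of coarse.comp's count_backdrop: bits set in both bitmaps
// count +1, bits set in bd_bitmap but not in bd_sign count -1.
fn count_backdrop(bd_bitmap: u32, bd_sign: u32) -> i32 {
    (bd_bitmap & bd_sign).count_ones() as i32
        - (bd_bitmap & !bd_sign).count_ones() as i32
}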

piet-gpu/shader/coarse.spv (new binary file, not shown)

View file

@@ -0,0 +1,328 @@
// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.
#version 450
#extension GL_GOOGLE_include_directive : enable
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
layout(set = 0, binding = 1) volatile buffer StateBuf {
uint[] state;
};
// The annotated results are stored here.
layout(set = 0, binding = 2) buffer AnnotatedBuf {
uint[] annotated;
};
#include "scene.h"
#include "state.h"
#include "annotated.h"
#define StateBuf_stride (8 + 2 * State_size)
StateRef state_aggregate_ref(uint partition_ix) {
return StateRef(12 + partition_ix * StateBuf_stride);
}
StateRef state_prefix_ref(uint partition_ix) {
return StateRef(12 + partition_ix * StateBuf_stride + State_size);
}
uint state_flag_index(uint partition_ix) {
return 1 + partition_ix * (StateBuf_stride / 4);
}
// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2
#define FLAG_SET_LINEWIDTH 1
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
State combine_state(State a, State b) {
State c;
c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
c.bbox = a.bbox;
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
{
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
}
// It would be more concise to cast to matrix types; ah well.
c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
return c;
}
State map_element(ElementRef ref, inout bool is_fill) {
// TODO: it would *probably* be more efficient to make the memory read patterns less
// divergent, though it would be more wasted memory.
uint tag = Element_tag(ref);
State c;
c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
c.mat = vec4(1.0, 0.0, 0.0, 1.0);
c.translate = vec2(0.0, 0.0);
c.linewidth = 1.0; // TODO should be 0.0
c.flags = 0;
is_fill = false;
switch (tag) {
case Element_FillLine:
case Element_StrokeLine:
LineSeg line = Element_FillLine_read(ref);
c.bbox.xy = min(line.p0, line.p1);
c.bbox.zw = max(line.p0, line.p1);
break;
case Element_Quad:
QuadSeg quad = Element_Quad_read(ref);
c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
break;
case Element_Cubic:
CubicSeg cubic = Element_Cubic_read(ref);
c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
break;
case Element_Fill:
is_fill = true;
// fall-through
case Element_Stroke:
c.flags = FLAG_RESET_BBOX;
break;
case Element_SetLineWidth:
SetLineWidth lw = Element_SetLineWidth_read(ref);
c.linewidth = lw.width;
c.flags = FLAG_SET_LINEWIDTH;
break;
case Element_Transform:
Transform t = Element_Transform_read(ref);
c.mat = t.mat;
c.translate = t.translate;
break;
}
return c;
}
// Get the bounding box of a circle transformed by the matrix into an ellipse.
vec2 get_linewidth(State st) {
// See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
}
// We should be able to use an array of structs but the NV shader compiler
// doesn't seem to like it :/
//shared State sh_state[WG_SIZE];
shared vec4 sh_mat[WG_SIZE];
shared vec2 sh_translate[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
shared uint sh_min_fill;
shared uint sh_tile_ix;
shared State sh_prefix;
void main() {
State th_state[N_ROWS];
// Determine partition to process by atomic counter (described in Section
// 4.4 of prefix sum paper).
if (gl_LocalInvocationID.x == 0) {
sh_tile_ix = atomicAdd(state[0], 1);
sh_min_fill = ~0;
}
barrier();
uint tile_ix = sh_tile_ix;
uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
ElementRef ref = ElementRef(ix * Element_size);
bool is_fill;
uint my_min_fill = ~0;
th_state[0] = map_element(ref, is_fill);
if (is_fill) my_min_fill = ix;
for (uint i = 1; i < N_ROWS; i++) {
// discussion question: would it be faster to load using more coherent patterns
// into thread memory? This is kinda strided.
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
if (is_fill && my_min_fill == ~0) {
my_min_fill = ix + i;
}
}
atomicMin(sh_min_fill, my_min_fill);
State agg = th_state[N_ROWS - 1];
sh_mat[gl_LocalInvocationID.x] = agg.mat;
sh_translate[gl_LocalInvocationID.x] = agg.translate;
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1 << i)) {
State other;
uint ix = gl_LocalInvocationID.x - (1 << i);
other.mat = sh_mat[ix];
other.translate = sh_translate[ix];
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
agg = combine_state(other, agg);
}
barrier();
sh_mat[gl_LocalInvocationID.x] = agg.mat;
sh_translate[gl_LocalInvocationID.x] = agg.translate;
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
}
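// The loop above is a Hillis-Steele inclusive scan: after step i, each lane
// holds the combination of up to 2^(i+1) trailing elements, so LG_WG_SIZE
// steps cover the workgroup. The same pattern over plain uints, assuming a
// shared array `sh` and lane index `lane` (sketch, not part of this shader):
//   for (uint i = 0; i < LG_WG_SIZE; i++) {
//     barrier();
//     uint v = sh[lane];
//     if (lane >= (1u << i)) v += sh[lane - (1u << i)];
//     barrier();
//     sh[lane] = v;
//   }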
State exclusive;
exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
exclusive.translate = vec2(0.0, 0.0);
exclusive.linewidth = 1.0; // TODO should be 0.0
exclusive.flags = 0;
// Publish aggregate for this partition
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
// Note: with the Vulkan memory model, we'd want to generate the atomic-store
// version of this.
State_write(state_aggregate_ref(tile_ix), agg);
uint flag = FLAG_AGGREGATE_READY;
memoryBarrierBuffer();
if (tile_ix == 0) {
State_write(state_prefix_ref(tile_ix), agg);
flag = FLAG_PREFIX_READY;
}
state[state_flag_index(tile_ix)] = flag;
if (tile_ix != 0) {
// step 4 of paper: decoupled lookback
uint look_back_ix = tile_ix - 1;
while (true) {
flag = state[state_flag_index(look_back_ix)];
if (flag == FLAG_PREFIX_READY) {
State their_prefix = State_read(state_prefix_ref(look_back_ix));
exclusive = combine_state(their_prefix, exclusive);
break;
} else if (flag == FLAG_AGGREGATE_READY) {
State their_agg = State_read(state_aggregate_ref(look_back_ix));
exclusive = combine_state(their_agg, exclusive);
look_back_ix--;
}
// else spin
}
// step 5 of paper: compute inclusive prefix
State inclusive_prefix = combine_state(exclusive, agg);
sh_prefix = exclusive;
State_write(state_prefix_ref(tile_ix), inclusive_prefix);
memoryBarrierBuffer();
flag = FLAG_PREFIX_READY;
state[state_flag_index(tile_ix)] = flag;
}
}
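// Decoupled look-back in brief: each partition publishes its AGGREGATE as
// soon as its local scan finishes, and its PREFIX once the exclusive prefix
// is known. A later partition walks backwards, folding in aggregates until
// it reaches a PREFIX, so the spin above is typically short even though
// partitions complete out of order (hence the atomic tile counter in main).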
barrier();
my_min_fill = sh_min_fill;
if (tile_ix != 0) {
exclusive = sh_prefix;
}
State row = exclusive;
if (gl_LocalInvocationID.x > 0) {
uint ix = gl_LocalInvocationID.x - 1;
State other;
other.mat = sh_mat[ix];
other.translate = sh_translate[ix];
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
row = combine_state(row, other);
}
if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
}
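// 0x7f800000 is the IEEE 754 bit pattern of +infinity; it marks "no fill
// element in this partition" in the same slot that otherwise receives the
// first fill's bbox right edge (below).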
for (uint i = 0; i < N_ROWS; i++) {
State st = combine_state(row, th_state[i]);
if (my_min_fill == ix + i) {
state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
}
// We write the state now for development purposes, but the
// actual goal is to write transformed and annotated elements.
//State_write(StateRef((ix + i) * State_size), st);
// Here we read again from the original scene. There may be
// gains to be had from stashing in shared memory or possibly
// registers (though register pressure is an issue).
ElementRef this_ref = Element_index(ref, i);
AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size);
uint tag = Element_tag(this_ref);
switch (tag) {
case Element_FillLine:
case Element_StrokeLine:
LineSeg line = Element_StrokeLine_read(this_ref);
AnnoStrokeLineSeg anno_line;
anno_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
anno_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
if (tag == Element_StrokeLine) {
anno_line.stroke = get_linewidth(st);
} else {
anno_line.stroke = vec2(0.0);
}
// We do the encoding a bit by hand to minimize divergence. Another approach
// would be to have a fill/stroke bool.
uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine;
annotated[out_ref.offset >> 2] = out_tag;
AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line);
break;
case Element_Stroke:
Stroke stroke = Element_Stroke_read(this_ref);
AnnoStroke anno_stroke;
anno_stroke.rgba_color = stroke.rgba_color;
vec2 lw = get_linewidth(st);
anno_stroke.bbox = st.bbox + vec4(-lw, lw);
anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z);
Annotated_Stroke_write(out_ref, anno_stroke);
break;
case Element_Fill:
Fill fill = Element_Fill_read(this_ref);
AnnoFill anno_fill;
anno_fill.rgba_color = fill.rgba_color;
anno_fill.bbox = st.bbox;
Annotated_Fill_write(out_ref, anno_fill);
break;
default:
Annotated_Nop_write(out_ref);
break;
}
}
}


@ -1,130 +0,0 @@
// Code auto-generated by piet-gpu-derive
struct FillTileHeaderRef {
uint offset;
};
struct FillItemHeaderRef {
uint offset;
};
struct FillSegmentRef {
uint offset;
};
struct FillSegChunkRef {
uint offset;
};
struct FillTileHeader {
uint n;
FillItemHeaderRef items;
};
#define FillTileHeader_size 8
FillTileHeaderRef FillTileHeader_index(FillTileHeaderRef ref, uint index) {
return FillTileHeaderRef(ref.offset + index * FillTileHeader_size);
}
struct FillItemHeader {
int backdrop;
FillSegChunkRef segments;
};
#define FillItemHeader_size 8
FillItemHeaderRef FillItemHeader_index(FillItemHeaderRef ref, uint index) {
return FillItemHeaderRef(ref.offset + index * FillItemHeader_size);
}
struct FillSegment {
vec2 start;
vec2 end;
};
#define FillSegment_size 16
FillSegmentRef FillSegment_index(FillSegmentRef ref, uint index) {
return FillSegmentRef(ref.offset + index * FillSegment_size);
}
struct FillSegChunk {
uint n;
FillSegChunkRef next;
};
#define FillSegChunk_size 8
FillSegChunkRef FillSegChunk_index(FillSegChunkRef ref, uint index) {
return FillSegChunkRef(ref.offset + index * FillSegChunk_size);
}
FillTileHeader FillTileHeader_read(FillTileHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
FillTileHeader s;
s.n = raw0;
s.items = FillItemHeaderRef(raw1);
return s;
}
void FillTileHeader_write(FillTileHeaderRef ref, FillTileHeader s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = s.n;
fill_seg[ix + 1] = s.items.offset;
}
FillItemHeader FillItemHeader_read(FillItemHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
FillItemHeader s;
s.backdrop = int(raw0);
s.segments = FillSegChunkRef(raw1);
return s;
}
void FillItemHeader_write(FillItemHeaderRef ref, FillItemHeader s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = uint(s.backdrop);
fill_seg[ix + 1] = s.segments.offset;
}
FillSegment FillSegment_read(FillSegmentRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
uint raw2 = fill_seg[ix + 2];
uint raw3 = fill_seg[ix + 3];
FillSegment s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void FillSegment_write(FillSegmentRef ref, FillSegment s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = floatBitsToUint(s.start.x);
fill_seg[ix + 1] = floatBitsToUint(s.start.y);
fill_seg[ix + 2] = floatBitsToUint(s.end.x);
fill_seg[ix + 3] = floatBitsToUint(s.end.y);
}
FillSegChunk FillSegChunk_read(FillSegChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
FillSegChunk s;
s.n = raw0;
s.next = FillSegChunkRef(raw1);
return s;
}
void FillSegChunk_write(FillSegChunkRef ref, FillSegChunk s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = s.n;
fill_seg[ix + 1] = s.next.offset;
}


@ -1,161 +0,0 @@
// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
// and outputs "instances" (references to item + translation) for each item
// that intersects the tilegroup.
//
// This implementation is simplistic and leaves a lot of performance on the
// table. A fancier implementation would use threadgroup shared memory or
// subgroups (or possibly both) to parallelize the reading of the input and
// the computation of tilegroup intersection.
//
// In addition, there are some features currently missing, such as support
// for clipping.
#version 450
#extension GL_GOOGLE_include_directive : enable
// It's possible we should lay this out with only an x dimension and do our
// own index math.
layout(local_size_x = 1, local_size_y = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "setup.h"
#define MAX_STACK 8
struct StackElement {
PietItemRef group;
uint index;
vec2 offset;
};
void main() {
StackElement stack[MAX_STACK];
uint stack_ix = 0;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
// State for stroke references.
TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size;
uint stroke_chunk_n = 0;
uint stroke_n = 0;
// State for fill references. All this is a bit cut'n'paste, but making a
// proper abstraction isn't easy.
TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START);
ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4);
InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size);
uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size;
uint fill_chunk_n = 0;
uint fill_n = 0;
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
PietItemRef root = PietItemRef(0);
SimpleGroup group = PietItem_Group_read(root);
StackElement tos = StackElement(root, 0, group.offset.xy);
while (true) {
if (tos.index < group.n_items) {
Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
&& max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
bool is_group = false;
uint tag;
if (hit) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
tag = PietItem_tag(item_ref);
is_group = tag == PietItem_Group;
}
if (hit && !is_group) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
Instance ins = Instance(item_ref.offset, tos.offset);
if (tg_ref.offset > tg_limit) {
// Allocation exceeded; do atomic bump alloc.
uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
Jump jump = Jump(TileGroupRef(new_tg));
TileGroup_Jump_write(tg_ref, jump);
tg_ref = TileGroupRef(new_tg);
tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
}
TileGroup_Instance_write(tg_ref, ins);
tg_ref.offset += TileGroup_size;
if (tag == PietItem_Poly) {
if (stroke_ref.offset > stroke_limit) {
uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
stroke_chunk_start = ChunkRef(new_stroke);
stroke_ref = InstanceRef(new_stroke + Chunk_size);
stroke_n += stroke_chunk_n;
stroke_chunk_n = 0;
stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
}
Instance_write(stroke_ref, ins);
stroke_chunk_n++;
stroke_ref.offset += Instance_size;
} else if (tag == PietItem_Fill) {
if (fill_ref.offset > fill_limit) {
uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC);
Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill)));
fill_chunk_start = ChunkRef(new_fill);
fill_ref = InstanceRef(new_fill + Chunk_size);
fill_n += fill_chunk_n;
fill_chunk_n = 0;
fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size;
}
Instance_write(fill_ref, ins);
fill_chunk_n++;
fill_ref.offset += Instance_size;
}
}
if (is_group) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
tos.index++;
if (tos.index < group.n_items) {
stack[stack_ix++] = tos;
}
group = PietItem_Group_read(item_ref);
tos = StackElement(item_ref, 0, tos.offset + group.offset.xy);
} else {
tos.index++;
}
} else {
// processed all items in this group; pop the stack
if (stack_ix == 0) {
break;
}
tos = stack[--stack_ix];
group = PietItem_Group_read(tos.group);
}
}
TileGroup_End_write(tg_ref);
stroke_n += stroke_chunk_n;
if (stroke_n > 0) {
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
}
tilegroup[stroke_start.offset >> 2] = stroke_n;
fill_n += fill_chunk_n;
if (fill_n > 0) {
Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0)));
}
tilegroup[fill_start.offset >> 2] = fill_n;
}


@ -1,167 +0,0 @@
// This is "kernel 2" (fill) in a 4-kernel pipeline. It processes the fill
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer FillSegBuf {
uint[] fill_seg;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "fill_seg.h"
#include "setup.h"
// Ensure that there is space to encode a segment.
void alloc_chunk(inout uint chunk_n_segs, inout FillSegChunkRef seg_chunk_ref,
inout FillSegChunkRef first_seg_chunk, inout uint seg_limit)
{
if (chunk_n_segs == 0) {
if (seg_chunk_ref.offset + 40 > seg_limit) {
seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - FillSegment_size;
}
first_seg_chunk = seg_chunk_ref;
} else if (seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs > seg_limit) {
uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - FillSegment_size;
FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(new_chunk_ref)));
seg_chunk_ref.offset = new_chunk_ref;
chunk_n_segs = 0;
}
}
void main() {
uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef fill_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_FILL_START);
uint fill_n = tilegroup[fill_start.offset >> 2];
FillTileHeaderRef tile_header_ref = FillTileHeaderRef(tile_ix * FillTileHeader_size);
if (fill_n > 0) {
ChunkRef chunk_ref = ChunkRef(fill_start.offset + 4);
Chunk chunk = Chunk_read(chunk_ref);
InstanceRef fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
FillItemHeaderRef item_header = FillItemHeaderRef(atomicAdd(alloc, fill_n * FillItemHeader_size));
FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, item_header));
FillSegChunkRef seg_chunk_ref = FillSegChunkRef(0);
uint seg_limit = 0;
// Iterate through items; fill_n holds count remaining.
while (true) {
if (chunk.chunk_n == 0) {
chunk_ref = chunk.next;
if (chunk_ref.offset == 0) {
break;
}
chunk = Chunk_read(chunk_ref);
fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
}
Instance ins = Instance_read(fill_ref);
PietFill fill = PietItem_Fill_read(PietItemRef(ins.item_ref));
// Process the fill polyline item.
uint max_n_segs = fill.n_points - 1;
uint chunk_n_segs = 0;
int backdrop = 0;
FillSegChunkRef seg_chunk_ref;
FillSegChunkRef first_seg_chunk = FillSegChunkRef(0);
vec2 start = Point_read(fill.points).xy;
for (uint j = 0; j < max_n_segs; j++) {
fill.points.offset += Point_size;
vec2 end = Point_read(fill.points).xy;
// Process one segment.
// TODO: I think this would go more smoothly (and be easier to
// make numerically robust) if it were based on clipping the line
// to the tile box. See:
// https://tavianator.com/fast-branchless-raybounding-box-intersections/
vec2 xymin = min(start, end);
vec2 xymax = max(start, end);
float a = end.y - start.y;
float b = start.x - end.x;
float c = -(a * start.x + b * start.y);
vec2 xy1 = xy0 + vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
float ytop = max(xy0.y, xymin.y);
float ybot = min(xy1.y, xymax.y);
float s00 = sign(b * ytop + a * xy0.x + c);
float s01 = sign(b * ytop + a * xy1.x + c);
float s10 = sign(b * ybot + a * xy0.x + c);
float s11 = sign(b * ybot + a * xy1.x + c);
float sTopLeft = sign(b * xy0.y + a * xy0.x + c);
if (sTopLeft == sign(a) && xymin.y <= xy0.y && xymax.y > xy0.y) {
backdrop -= int(s00);
}
// This is adapted from piet-metal but could be improved.
if (max(xymin.x, xy0.x) < min(xymax.x, xy1.x)
&& ytop < ybot
&& s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
{
// avoid overwriting `end` so that it can be used as start
vec2 enc_end = end;
if (xymin.x < xy0.x) {
float yEdge = mix(start.y, end.y, (start.x - xy0.x) / b);
if (yEdge >= xy0.y && yEdge < xy1.y) {
// This is encoded the same as a general fill segment, but could be
// special-cased, either here or in rendering. (It was special-cased
// in piet-metal).
FillSegment edge_seg;
if (b > 0.0) {
enc_end = vec2(xy0.x, yEdge);
edge_seg.start = enc_end;
edge_seg.end = vec2(xy0.x, xy1.y);
} else {
start = vec2(xy0.x, yEdge);
edge_seg.start = vec2(xy0.x, xy1.y);
edge_seg.end = start;
}
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), edge_seg);
chunk_n_segs++;
}
}
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
FillSegment seg = FillSegment(start, enc_end);
FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), seg);
chunk_n_segs++;
}
start = end;
}
FillItemHeader_write(item_header, FillItemHeader(backdrop, first_seg_chunk));
if (chunk_n_segs != 0) {
FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(0)));
seg_chunk_ref.offset += FillSegChunk_size + FillSegment_size * chunk_n_segs;
}
fill_ref.offset += Instance_size;
chunk.chunk_n--;
item_header.offset += FillItemHeader_size;
}
} else {
// As an optimization, we could just write 0 for the size.
FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, FillItemHeaderRef(0)));
}
}


@ -1,137 +0,0 @@
// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "setup.h"
void main() {
uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
uint stroke_n = tilegroup[stroke_start.offset >> 2];
TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
if (stroke_n > 0) {
ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
Chunk chunk = Chunk_read(chunk_ref);
InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
SegChunkRef seg_chunk_ref = SegChunkRef(0);
uint seg_limit = 0;
// Iterate through items; stroke_n holds count remaining.
while (true) {
if (chunk.chunk_n == 0) {
chunk_ref = chunk.next;
if (chunk_ref.offset == 0) {
break;
}
chunk = Chunk_read(chunk_ref);
stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
}
Instance ins = Instance_read(stroke_ref);
PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
// Process the stroke polyline item.
uint max_n_segs = poly.n_points - 1;
uint chunk_n_segs = 0;
SegChunkRef seg_chunk_ref;
vec2 start = Point_read(poly.points).xy;
for (uint j = 0; j < max_n_segs; j++) {
poly.points.offset += Point_size;
vec2 end = Point_read(poly.points).xy;
// Process one segment.
// This logic just tests for collision. What we probably want to do
// is a clipping algorithm like Liang-Barsky, and then store coords
// relative to the tile in f16. See also:
// https://tavianator.com/fast-branchless-raybounding-box-intersections/
// Also note that when we go to the fancy version, we want to compute
// the (horizontal projection of the) bounding box of the intersection
// once per tilegroup, so we can assign work to individual tiles.
float a = end.y - start.y;
float b = start.x - end.x;
float c = -(a * start.x + b * start.y);
float half_width = 0.5 * poly.width;
// Tile boundaries padded by half-width.
float xmin = xy0.x - half_width;
float ymin = xy0.y - half_width;
float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
float s00 = sign(b * ymin + a * xmin + c);
float s01 = sign(b * ymin + a * xmax + c);
float s10 = sign(b * ymax + a * xmin + c);
float s11 = sign(b * ymax + a * xmax + c);
// If bounding boxes intersect and not all four corners are on the same side, hit.
// Also note: this is designed to be false on NaN input.
if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
&& max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
&& s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
{
// Allocate a chunk if needed.
if (chunk_n_segs == 0) {
if (seg_chunk_ref.offset + 40 > seg_limit) {
seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size;
}
ItemHeader_write(item_header, ItemHeader(seg_chunk_ref));
} else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) {
uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size;
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref)));
seg_chunk_ref.offset = new_chunk_ref;
chunk_n_segs = 0;
}
Segment seg = Segment(start, end);
Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
chunk_n_segs++;
}
start = end;
}
if (chunk_n_segs == 0) {
ItemHeader_write(item_header, ItemHeader(SegChunkRef(0)));
} else {
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
}
stroke_ref.offset += Instance_size;
chunk.chunk_n--;
item_header.offset += ItemHeader_size;
}
} else {
// As an optimization, we could just write 0 for the size.
TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
}
}


@ -1,135 +0,0 @@
// This is "kernel 3" in a 4-kernel pipeline. It walks the active items
// for the tilegroup and produces a per-tile command list for each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32, local_size_y = 1) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// TODO: this should have a `readonly` qualifier, but then inclusion
// of ptcl.h would fail because of the writers.
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
// Used readonly
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
// Used readonly
layout(set = 0, binding = 3) buffer FillSegmentBuf {
uint[] fill_seg;
};
layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
};
layout(set = 0, binding = 5) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "fill_seg.h"
#include "ptcl.h"
#include "setup.h"
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset > cmd_limit) {
uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_ref, jump);
cmd_ref = CmdRef(new_cmd);
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
}
void main() {
uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
FillTileHeader fill_th = FillTileHeader_read(FillTileHeaderRef(tile_ix * FillTileHeader_size));
while (true) {
uint tg_tag = TileGroup_tag(tg_ref);
if (tg_tag == TileGroup_End) {
break;
}
if (tg_tag == TileGroup_Jump) {
tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
continue;
}
// Assume tg_tag is `Instance`, though there will be more cases.
Instance ins = TileGroup_Instance_read(tg_ref);
PietItemRef item_ref = PietItemRef(ins.item_ref);
uint item_tag = PietItem_tag(item_ref);
switch (item_tag) {
case PietItem_Circle:
PietCircle circle = PietItem_Circle_read(item_ref);
vec2 center = ins.offset + circle.center.xy;
float r = circle.radius;
if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
&& max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
{
CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Circle_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
}
break;
case PietItem_Poly:
ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
stroke_th.items.offset += ItemHeader_size;
if (stroke_item.segments.offset != 0) {
PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
CmdStroke cmd = CmdStroke(
stroke_item.segments.offset,
0.5 * poly.width,
poly.rgba_color
);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
}
break;
case PietItem_Fill:
FillItemHeader fill_item = FillItemHeader_read(fill_th.items);
fill_th.items.offset += FillItemHeader_size;
// TODO: handle segments == 0 but backdrop != 0 specially; it's a solid tile.
if (fill_item.segments.offset != 0) {
PietFill fill = PietItem_Fill_read(item_ref);
CmdFill cmd = CmdFill(
fill_item.segments.offset,
fill_item.backdrop,
fill.rgba_color
);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Fill_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
} else if (fill_item.backdrop != 0) {
// TODO: truncate existing cmd list if alpha is opaque
PietFill fill = PietItem_Fill_read(item_ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
cmd_ref.offset += Cmd_size;
}
break;
}
tg_ref.offset += TileGroup_size;
}
Cmd_End_write(cmd_ref);
}


@ -6,29 +6,20 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#extension GL_KHR_shader_subgroup_basic : enable
layout(local_size_x = 16, local_size_y = 16) in;
#define CHUNK 8
#define CHUNK_DY (16 / CHUNK)
layout(local_size_x = 16, local_size_y = 2) in;
// Same concern as in kernel 3: this should be readonly.
layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
};
// Used readonly
layout(set = 0, binding = 1) buffer SegmentBuf {
uint[] segment;
};
// Used readonly
layout(set = 0, binding = 2) buffer FillSegBuf {
uint[] fill_seg;
};
layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
#include "ptcl.h"
#include "segment.h"
#include "fill_seg.h"
#include "setup.h"
@ -36,10 +27,14 @@ void main() {
uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uvec2 xy_uint = gl_GlobalInvocationID.xy;
uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
vec2 xy = vec2(xy_uint);
vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
vec3 rgb = uv.xyy;
//vec3 rgb = uv.xyy;
vec3 rgb[CHUNK];
for (uint i = 0; i < CHUNK; i++) {
rgb[i] = vec3(0.5);
}
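// Each invocation now shades CHUNK pixels of one 16-wide tile column: pixel
// k sits k * CHUNK_DY rows below xy, so local_size_y = 2 rows times
// CHUNK = 8 pixels covers the 16-pixel tile height.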
while (true) {
uint tag = Cmd_tag(cmd_ref);
@ -49,65 +44,85 @@ void main() {
switch (tag) {
case Cmd_Circle:
CmdCircle circle = Cmd_Circle_read(cmd_ref);
float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
// TODO: sRGB
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
for (uint i = 0; i < CHUNK; i++) {
float dy = float(i * CHUNK_DY);
float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
// TODO: sRGB
rgb[i] = mix(rgb[i], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
case Cmd_Stroke:
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df = 1e9;
SegChunkRef seg_chunk_ref = SegChunkRef(stroke.seg_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
SegChunkRef seg_chunk_ref = stroke.seg_ref;
do {
SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
SegmentRef segs = seg_chunk.segs;
for (int i = 0; i < seg_chunk.n; i++) {
Segment seg = Segment_read(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * i));
Segment seg = Segment_read(Segment_index(segs, i));
vec2 line_vec = seg.end - seg.start;
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df = min(df, length(line_vec * t - dpos));
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
}
seg_chunk_ref = seg_chunk.next;
} while (seg_chunk_ref.offset != 0);
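// df[k] now holds the distance from pixel center k to the nearest stroke
// segment; clamping t projects onto the segment rather than the infinite
// line, so endpoints get round caps for free.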
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
for (uint k = 0; k < CHUNK; k++) {
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_ref);
// Probably better to store as float, but conversion is no doubt cheap.
float area = float(fill.backdrop);
FillSegChunkRef fill_seg_chunk_ref = FillSegChunkRef(fill.seg_ref);
float area[CHUNK];
for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
SegChunkRef fill_seg_chunk_ref = fill.seg_ref;
do {
FillSegChunk seg_chunk = FillSegChunk_read(fill_seg_chunk_ref);
SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref);
SegmentRef segs = seg_chunk.segs;
for (int i = 0; i < seg_chunk.n; i++) {
FillSegment seg = FillSegment_read(FillSegmentRef(fill_seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * i));
vec2 start = seg.start - xy;
vec2 end = seg.end - xy;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / (end.y - start.y);
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area += a * (window.x - window.y);
Segment seg = Segment_read(Segment_index(segs, i));
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
vec2 start = seg.start - my_xy;
vec2 end = seg.end - my_xy;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / (end.y - start.y);
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[k] += a * (window.x - window.y);
}
area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
}
fill_seg_chunk_ref = seg_chunk.next;
} while (fill_seg_chunk_ref.offset != 0);
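// The accumulation above is signed trapezoidal coverage: each segment,
// clipped to the pixel's unit y-window, adds its signed area contribution,
// and the y_edge term (as I read the new Segment struct) restores winding
// for geometry clipped off at the tile's left edge.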
fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
alpha = min(abs(area), 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
for (uint k = 0; k < CHUNK; k++) {
float alpha = min(abs(area[k]), 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
case Cmd_Solid:
CmdSolid solid = Cmd_Solid_read(cmd_ref);
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
rgb = mix(rgb, fg_rgba.rgb, fg_rgba.a);
for (uint k = 0; k < CHUNK; k++) {
rgb[k] = mix(rgb[k], fg_rgba.rgb, fg_rgba.a);
}
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
@ -116,5 +131,8 @@ void main() {
cmd_ref.offset += Cmd_size;
}
imageStore(image, ivec2(xy_uint), vec4(rgb, 1.0));
// TODO: sRGB
for (uint i = 0; i < CHUNK; i++) {
imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(rgb[i], 1.0));
}
}


@ -36,6 +36,14 @@ struct CmdRef {
uint offset;
};
struct SegmentRef {
uint offset;
};
struct SegChunkRef {
uint offset;
};
struct CmdCircle {
vec2 center;
float radius;
@ -60,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
}
struct CmdStroke {
uint seg_ref;
SegChunkRef seg_ref;
float half_width;
uint rgba_color;
};
@ -72,7 +80,7 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
}
struct CmdFill {
uint seg_ref;
SegChunkRef seg_ref;
int backdrop;
uint rgba_color;
};
@ -141,6 +149,30 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
return CmdRef(ref.offset + index * Cmd_size);
}
struct Segment {
vec2 start;
vec2 end;
float y_edge;
};
#define Segment_size 20
SegmentRef Segment_index(SegmentRef ref, uint index) {
return SegmentRef(ref.offset + index * Segment_size);
}
struct SegChunk {
uint n;
SegChunkRef next;
SegmentRef segs;
};
#define SegChunk_size 12
SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
return SegChunkRef(ref.offset + index * SegChunk_size);
}
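// Unlike the old layout, segments no longer trail the chunk header in
// memory; `segs` points at them explicitly, which is what lets kernel4
// iterate with Segment_index.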
CmdCircle CmdCircle_read(CmdCircleRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
@ -188,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdStroke s;
s.seg_ref = raw0;
s.seg_ref = SegChunkRef(raw0);
s.half_width = uintBitsToFloat(raw1);
s.rgba_color = raw2;
return s;
@ -196,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.seg_ref;
ptcl[ix + 0] = s.seg_ref.offset;
ptcl[ix + 1] = floatBitsToUint(s.half_width);
ptcl[ix + 2] = s.rgba_color;
}
@ -207,7 +239,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdFill s;
s.seg_ref = raw0;
s.seg_ref = SegChunkRef(raw0);
s.backdrop = int(raw1);
s.rgba_color = raw2;
return s;
@ -215,7 +247,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
void CmdFill_write(CmdFillRef ref, CmdFill s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.seg_ref;
ptcl[ix + 0] = s.seg_ref.offset;
ptcl[ix + 1] = uint(s.backdrop);
ptcl[ix + 2] = s.rgba_color;
}
@ -362,3 +394,45 @@ void Cmd_Bail_write(CmdRef ref) {
ptcl[ref.offset >> 2] = Cmd_Bail;
}
Segment Segment_read(SegmentRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw3 = ptcl[ix + 3];
uint raw4 = ptcl[ix + 4];
Segment s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.y_edge = uintBitsToFloat(raw4);
return s;
}
void Segment_write(SegmentRef ref, Segment s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.start.x);
ptcl[ix + 1] = floatBitsToUint(s.start.y);
ptcl[ix + 2] = floatBitsToUint(s.end.x);
ptcl[ix + 3] = floatBitsToUint(s.end.y);
ptcl[ix + 4] = floatBitsToUint(s.y_edge);
}
SegChunk SegChunk_read(SegChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
SegChunk s;
s.n = raw0;
s.next = SegChunkRef(raw1);
s.segs = SegmentRef(raw2);
return s;
}
void SegChunk_write(SegChunkRef ref, SegChunk s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.n;
ptcl[ix + 1] = s.next.offset;
ptcl[ix + 2] = s.segs.offset;
}


@ -32,6 +32,38 @@ struct PietItemRef {
uint offset;
};
struct LineSegRef {
uint offset;
};
struct QuadSegRef {
uint offset;
};
struct CubicSegRef {
uint offset;
};
struct FillRef {
uint offset;
};
struct StrokeRef {
uint offset;
};
struct SetLineWidthRef {
uint offset;
};
struct TransformRef {
uint offset;
};
struct ElementRef {
uint offset;
};
struct Bbox {
ivec4 bbox;
};
@ -128,6 +160,98 @@ PietItemRef PietItem_index(PietItemRef ref, uint index) {
return PietItemRef(ref.offset + index * PietItem_size);
}
struct LineSeg {
vec2 p0;
vec2 p1;
};
#define LineSeg_size 16
LineSegRef LineSeg_index(LineSegRef ref, uint index) {
return LineSegRef(ref.offset + index * LineSeg_size);
}
struct QuadSeg {
vec2 p0;
vec2 p1;
vec2 p2;
};
#define QuadSeg_size 24
QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
return QuadSegRef(ref.offset + index * QuadSeg_size);
}
struct CubicSeg {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
};
#define CubicSeg_size 32
CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
return CubicSegRef(ref.offset + index * CubicSeg_size);
}
struct Fill {
uint rgba_color;
};
#define Fill_size 4
FillRef Fill_index(FillRef ref, uint index) {
return FillRef(ref.offset + index * Fill_size);
}
struct Stroke {
uint rgba_color;
};
#define Stroke_size 4
StrokeRef Stroke_index(StrokeRef ref, uint index) {
return StrokeRef(ref.offset + index * Stroke_size);
}
struct SetLineWidth {
float width;
};
#define SetLineWidth_size 4
SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
return SetLineWidthRef(ref.offset + index * SetLineWidth_size);
}
struct Transform {
vec4 mat;
vec2 translate;
};
#define Transform_size 24
TransformRef Transform_index(TransformRef ref, uint index) {
return TransformRef(ref.offset + index * Transform_size);
}
#define Element_Nop 0
#define Element_StrokeLine 1
#define Element_FillLine 2
#define Element_Quad 3
#define Element_Cubic 4
#define Element_Stroke 5
#define Element_Fill 6
#define Element_SetLineWidth 7
#define Element_Transform 8
#define Element_size 36
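// 36 bytes = 4-byte tag + payload of the largest variant (CubicSeg, 32
// bytes); the Element_*_read accessors below fetch the payload at offset + 4.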
ElementRef Element_index(ElementRef ref, uint index) {
return ElementRef(ref.offset + index * Element_size);
}
Bbox Bbox_read(BboxRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
@ -236,3 +360,122 @@ PietStrokePolyLine PietItem_Poly_read(PietItemRef ref) {
return PietStrokePolyLine_read(PietStrokePolyLineRef(ref.offset + 4));
}
LineSeg LineSeg_read(LineSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
LineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
QuadSeg QuadSeg_read(QuadSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
uint raw4 = scene[ix + 4];
uint raw5 = scene[ix + 5];
QuadSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
return s;
}
CubicSeg CubicSeg_read(CubicSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
uint raw4 = scene[ix + 4];
uint raw5 = scene[ix + 5];
uint raw6 = scene[ix + 6];
uint raw7 = scene[ix + 7];
CubicSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
return s;
}
Fill Fill_read(FillRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
Fill s;
s.rgba_color = raw0;
return s;
}
Stroke Stroke_read(StrokeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
Stroke s;
s.rgba_color = raw0;
return s;
}
SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
SetLineWidth s;
s.width = uintBitsToFloat(raw0);
return s;
}
Transform Transform_read(TransformRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
uint raw4 = scene[ix + 4];
uint raw5 = scene[ix + 5];
Transform s;
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
return s;
}
uint Element_tag(ElementRef ref) {
return scene[ref.offset >> 2];
}
LineSeg Element_StrokeLine_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}
LineSeg Element_FillLine_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}
QuadSeg Element_Quad_read(ElementRef ref) {
return QuadSeg_read(QuadSegRef(ref.offset + 4));
}
CubicSeg Element_Cubic_read(ElementRef ref) {
return CubicSeg_read(CubicSegRef(ref.offset + 4));
}
Stroke Element_Stroke_read(ElementRef ref) {
return Stroke_read(StrokeRef(ref.offset + 4));
}
Fill Element_Fill_read(ElementRef ref) {
return Fill_read(FillRef(ref.offset + 4));
}
SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
return SetLineWidth_read(SetLineWidthRef(ref.offset + 4));
}
Transform Element_Transform_read(ElementRef ref) {
return Transform_read(TransformRef(ref.offset + 4));
}


@ -1,126 +0,0 @@
// Code auto-generated by piet-gpu-derive
struct TileHeaderRef {
uint offset;
};
struct ItemHeaderRef {
uint offset;
};
struct SegmentRef {
uint offset;
};
struct SegChunkRef {
uint offset;
};
struct TileHeader {
uint n;
ItemHeaderRef items;
};
#define TileHeader_size 8
TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
return TileHeaderRef(ref.offset + index * TileHeader_size);
}
struct ItemHeader {
SegChunkRef segments;
};
#define ItemHeader_size 4
ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
return ItemHeaderRef(ref.offset + index * ItemHeader_size);
}
struct Segment {
vec2 start;
vec2 end;
};
#define Segment_size 16
SegmentRef Segment_index(SegmentRef ref, uint index) {
return SegmentRef(ref.offset + index * Segment_size);
}
struct SegChunk {
uint n;
SegChunkRef next;
};
#define SegChunk_size 8
SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
return SegChunkRef(ref.offset + index * SegChunk_size);
}
TileHeader TileHeader_read(TileHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
TileHeader s;
s.n = raw0;
s.items = ItemHeaderRef(raw1);
return s;
}
void TileHeader_write(TileHeaderRef ref, TileHeader s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.n;
segment[ix + 1] = s.items.offset;
}
ItemHeader ItemHeader_read(ItemHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
ItemHeader s;
s.segments = SegChunkRef(raw0);
return s;
}
void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.segments.offset;
}
Segment Segment_read(SegmentRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
uint raw2 = segment[ix + 2];
uint raw3 = segment[ix + 3];
Segment s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void Segment_write(SegmentRef ref, Segment s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = floatBitsToUint(s.start.x);
segment[ix + 1] = floatBitsToUint(s.start.y);
segment[ix + 2] = floatBitsToUint(s.end.x);
segment[ix + 3] = floatBitsToUint(s.end.y);
}
SegChunk SegChunk_read(SegChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
SegChunk s;
s.n = raw0;
s.next = SegChunkRef(raw1);
return s;
}
void SegChunk_write(SegChunkRef ref, SegChunk s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.n;
segment[ix + 1] = s.next.offset;
}


@ -40,3 +40,25 @@
// Maximum number of segments in a SegChunk
#define SEG_CHUNK_N 32
#define SEG_CHUNK_ALLOC 512
// Stuff for new algorithm follows; some of the above should get
// deleted.
// These should probably be renamed and/or reworked. In the binning
// kernel, they represent the number of bins. Also, the workgroup size
// of that kernel is equal to the number of bins, but should probably
// be more flexible (it's 512 in the K&L paper).
#define N_TILE_X 16
#define N_TILE_Y 16
#define N_TILE (N_TILE_X * N_TILE_Y)
#define LG_N_TILE 8
#define N_SLICE (N_TILE / 32)
// Number of workgroups for binning kernel
#define N_WG 16
// This is the ratio of the number of elements handled by a binning workgroup
// to the number handled by a partition workgroup.
#define ELEMENT_BINNING_RATIO 2
#define BIN_INITIAL_ALLOC 64
#define BIN_ALLOC 256
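// With the values above: N_TILE = 16 * 16 = 256 bins, and N_SLICE = 256 / 32
// = 8, i.e. eight u32 words suffice for a one-bit-per-bin mask.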

piet-gpu/shader/state.h Normal file

@ -0,0 +1,59 @@
// Code auto-generated by piet-gpu-derive
struct StateRef {
uint offset;
};
struct State {
vec4 mat;
vec2 translate;
vec4 bbox;
float linewidth;
uint flags;
};
#define State_size 48
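// 48 bytes = 12 u32 words: 4 (mat) + 2 (translate) + 4 (bbox) + 1 (linewidth)
// + 1 (flags), matching the 12 raw words read and written below.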
StateRef State_index(StateRef ref, uint index) {
return StateRef(ref.offset + index * State_size);
}
State State_read(StateRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = state[ix + 0];
uint raw1 = state[ix + 1];
uint raw2 = state[ix + 2];
uint raw3 = state[ix + 3];
uint raw4 = state[ix + 4];
uint raw5 = state[ix + 5];
uint raw6 = state[ix + 6];
uint raw7 = state[ix + 7];
uint raw8 = state[ix + 8];
uint raw9 = state[ix + 9];
uint raw10 = state[ix + 10];
uint raw11 = state[ix + 11];
State s;
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
s.linewidth = uintBitsToFloat(raw10);
s.flags = raw11;
return s;
}
void State_write(StateRef ref, State s) {
uint ix = ref.offset >> 2;
state[ix + 0] = floatBitsToUint(s.mat.x);
state[ix + 1] = floatBitsToUint(s.mat.y);
state[ix + 2] = floatBitsToUint(s.mat.z);
state[ix + 3] = floatBitsToUint(s.mat.w);
state[ix + 4] = floatBitsToUint(s.translate.x);
state[ix + 5] = floatBitsToUint(s.translate.y);
state[ix + 6] = floatBitsToUint(s.bbox.x);
state[ix + 7] = floatBitsToUint(s.bbox.y);
state[ix + 8] = floatBitsToUint(s.bbox.z);
state[ix + 9] = floatBitsToUint(s.bbox.w);
state[ix + 10] = floatBitsToUint(s.linewidth);
state[ix + 11] = s.flags;
}


@ -1,5 +1,5 @@
mod render_ctx;
mod pico_svg;
mod render_ctx;
pub use render_ctx::PietGpuRenderContext;
@ -8,6 +8,8 @@ use rand::{Rng, RngCore};
use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
use piet::{Color, RenderContext};
use piet_gpu_types::encoder::Encode;
use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout, MemFlags};
use pico_svg::PicoSvg;
@ -28,7 +30,20 @@ const PTCL_INITIAL_ALLOC: usize = 1024;
const K2_PER_TILE_SIZE: usize = 8;
const N_CIRCLES: usize = 1;
const N_CIRCLES: usize = 0;
const N_WG: u32 = 16;
pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
let xml_str = std::fs::read_to_string(filename).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
let start = std::time::Instant::now();
svg.render(rc);
println!("flattening and encoding time: {:?}", start.elapsed());
}
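// Typical usage (sketch; the filename and scale are illustrative, not from
// this commit): `render_svg(&mut ctx, "tiger.svg", 8.0)` with `ctx` a
// `PietGpuRenderContext`, then hand the encoded scene to `Renderer`.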
pub fn render_scene(rc: &mut impl RenderContext) {
let mut rng = rand::thread_rng();
@ -42,12 +57,14 @@ pub fn render_scene(rc: &mut impl RenderContext) {
let circle = Circle::new(center, radius);
rc.fill(circle, &color);
}
/*
let mut path = BezPath::new();
path.move_to((100.0, 1150.0));
path.line_to((200.0, 1200.0));
path.line_to((150.0, 1250.0));
path.close_path();
rc.fill(path, &Color::rgb8(128, 0, 128));
*/
rc.stroke(
Line::new((100.0, 100.0), (200.0, 150.0)),
&Color::WHITE,
@ -59,7 +76,7 @@ pub fn render_scene(rc: &mut impl RenderContext) {
#[allow(unused)]
fn render_cardioid(rc: &mut impl RenderContext) {
let n = 91;
let n = 601;
let dth = std::f64::consts::PI * 2.0 / (n as f64);
let center = Point::new(1024.0, 768.0);
let r = 750.0;
@ -67,7 +84,7 @@ fn render_cardioid(rc: &mut impl RenderContext) {
for i in 1..n {
let p0 = center + Vec2::from_angle(i as f64 * dth) * r;
let p1 = center + Vec2::from_angle(((i * 2) % n) as f64 * dth) * r;
rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
//rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
path.move_to(p0);
path.line_to(p1);
//rc.stroke(Line::new(p0, p1), &Color::BLACK, 2.0);
@ -96,10 +113,10 @@ fn dump_scene(buf: &[u8]) {
}
#[allow(unused)]
fn dump_k1_data(k1_buf: &[u32]) {
pub fn dump_k1_data(k1_buf: &[u32]) {
for i in 0..k1_buf.len() {
if k1_buf[i] != 0 {
println!("{:4x}: {:8x}", i, k1_buf[i]);
println!("{:4x}: {:8x}", i * 4, k1_buf[i]);
}
}
}
@ -110,27 +127,30 @@ pub struct Renderer<D: Device> {
scene_buf: D::Buffer,
scene_dev: D::Buffer,
k1_alloc_buf_host: D::Buffer,
k1_alloc_buf_dev: D::Buffer,
k2s_alloc_buf_host: D::Buffer,
k2s_alloc_buf_dev: D::Buffer,
k2f_alloc_buf_host: D::Buffer,
k2f_alloc_buf_dev: D::Buffer,
k3_alloc_buf_host: D::Buffer,
k3_alloc_buf_dev: D::Buffer,
tilegroup_buf: D::Buffer,
ptcl_buf: D::Buffer,
pub state_buf: D::Buffer,
pub anno_buf: D::Buffer,
pub bin_buf: D::Buffer,
pub ptcl_buf: D::Buffer,
el_pipeline: D::Pipeline,
el_ds: D::DescriptorSet,
bin_pipeline: D::Pipeline,
bin_ds: D::DescriptorSet,
bin_alloc_buf_host: D::Buffer,
bin_alloc_buf_dev: D::Buffer,
coarse_pipeline: D::Pipeline,
coarse_ds: D::DescriptorSet,
coarse_alloc_buf_host: D::Buffer,
coarse_alloc_buf_dev: D::Buffer,
k1_pipeline: D::Pipeline,
k1_ds: D::DescriptorSet,
k2s_pipeline: D::Pipeline,
k2s_ds: D::DescriptorSet,
k2f_pipeline: D::Pipeline,
k2f_ds: D::DescriptorSet,
k3_pipeline: D::Pipeline,
k3_ds: D::DescriptorSet,
k4_pipeline: D::Pipeline,
k4_ds: D::DescriptorSet,
n_elements: usize,
}
impl<D: Device> Renderer<D> {
@ -138,6 +158,9 @@ impl<D: Device> Renderer<D> {
let host = MemFlags::host_coherent();
let dev = MemFlags::device_local();
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
println!("scene: {} elements", n_elements);
let scene_buf = device
.create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
.unwrap();
@ -146,174 +169,121 @@ impl<D: Device> Renderer<D> {
.unwrap();
device.write_buffer(&scene_buf, &scene)?;
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
let k1_alloc_buf_host = device.create_buffer(4, host)?;
let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
let k1_code = include_bytes!("../shader/kernel1.spv");
let k1_pipeline = device
.create_simple_compute_pipeline(k1_code, 3, 0)?;
let k1_ds = device
.create_descriptor_set(
&k1_pipeline,
&[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
&[],
)?;
let el_code = include_bytes!("../shader/elements.spv");
let el_pipeline = device.create_simple_compute_pipeline(el_code, 3, 0)?;
let el_ds = device.create_descriptor_set(
&el_pipeline,
&[&scene_dev, &state_buf, &anno_buf],
&[],
)?;
let k2s_alloc_buf_host = device.create_buffer(4, host)?;
let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
?;
let k2s_code = include_bytes!("../shader/kernel2s.spv");
let k2s_pipeline = device
.create_simple_compute_pipeline(k2s_code, 4, 0)
?;
let k2s_ds = device
.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
&[],
)
?;
let bin_alloc_buf_host = device.create_buffer(12, host)?;
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
let k2f_alloc_buf_host = device.create_buffer(4, host)?;
let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])
?;
let k2f_code = include_bytes!("../shader/kernel2f.spv");
let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
let k2f_ds = device
.create_descriptor_set(
&k2f_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&fill_seg_buf,
&k2f_alloc_buf_dev,
],
&[],
)
?;
// TODO: constants
let bin_alloc_start = ((n_elements + 255) & !255) * 8;
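// Rounds n_elements up to a multiple of 256 (the binning granularity); the
// factor of 8 is, as far as this allocation math shows, the per-element size
// of a bin record in bytes.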
device.write_buffer(
&bin_alloc_buf_host,
&[n_elements as u32, 0, bin_alloc_start as u32],
)?;
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
let bin_ds = device.create_descriptor_set(
&bin_pipeline,
&[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
&[],
)?;
let k3_alloc_buf_host = device.create_buffer(4, host)?;
let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device
.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
?;
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
let k3_ds = device
.create_descriptor_set(
&k3_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&fill_seg_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
&[],
)
?;
let coarse_alloc_buf_host = device.create_buffer(8, host)?;
let coarse_alloc_buf_dev = device.create_buffer(8, dev)?;
let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device.write_buffer(
&coarse_alloc_buf_host,
&[n_elements as u32, coarse_alloc_start as u32],
)?;
let coarse_code = include_bytes!("../shader/coarse.spv");
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
let coarse_ds = device.create_descriptor_set(
&coarse_pipeline,
&[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
&[],
)?;
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
let k4_ds = device
.create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev])
?;
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
Ok(Renderer {
scene_buf,
scene_dev,
image_dev,
k1_alloc_buf_host,
k1_alloc_buf_dev,
k2s_alloc_buf_host,
k2s_alloc_buf_dev,
k2f_alloc_buf_host,
k2f_alloc_buf_dev,
k3_alloc_buf_host,
k3_alloc_buf_dev,
tilegroup_buf,
ptcl_buf,
k1_pipeline,
k1_ds,
k2s_pipeline,
k2s_ds,
k2f_pipeline,
k2f_ds,
k3_pipeline,
k3_ds,
el_pipeline,
el_ds,
bin_pipeline,
bin_ds,
coarse_pipeline,
coarse_ds,
k4_pipeline,
k4_ds,
state_buf,
anno_buf,
bin_buf,
ptcl_buf,
bin_alloc_buf_host,
bin_alloc_buf_dev,
coarse_alloc_buf_host,
coarse_alloc_buf_dev,
n_elements,
})
}
pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
// Note: we could use one alloc buf and reuse it. But we'll stick with
// multiple ones for clarity.
cmd_buf.copy_buffer(&self.k1_alloc_buf_host, &self.k1_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k2s_alloc_buf_host, &self.k2s_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k2f_alloc_buf_host, &self.k2f_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k3_alloc_buf_host, &self.k3_alloc_buf_dev);
// Note: these clears aren't necessary, and are here to make inspection
// of the buffers cleaner. Can likely be removed.
cmd_buf.clear_buffer(&self.tilegroup_buf);
cmd_buf.clear_buffer(&self.ptcl_buf);
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
cmd_buf.clear_buffer(&self.state_buf);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(
&self.image_dev,
ImageLayout::Undefined,
ImageLayout::General,
);
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.dispatch(
&self.el_pipeline,
&self.el_ds,
(((self.n_elements + 127) / 128) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.bin_pipeline,
&self.bin_ds,
(((self.n_elements + 255) / 256) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.coarse_pipeline,
&self.coarse_ds,
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k4_pipeline,
&self.k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
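To see how these pieces are meant to be driven, here is a minimal host-side sketch. It is editorial, not part of this diff: `VkInstance::new`, `device`, `create_cmd_buf`, `create_query_pool`, `run_cmd_buf`, `reap_query_pool`, and the `Renderer::new` argument list are assumed from the piet-gpu-hal crate of this revision, and their exact signatures may differ.

let instance = VkInstance::new()?;
unsafe {
    let device = instance.device()?;
    let mut ctx = PietGpuRenderContext::new();
    // ... issue piet drawing calls on ctx ...
    let scene = ctx.get_scene_buf();
    let renderer = Renderer::new(&device, scene)?; // arg list assumed
    let mut cmd_buf = device.create_cmd_buf()?;
    let query_pool = device.create_query_pool(5)?; // timestamps 0..=4 above
    cmd_buf.begin();
    renderer.record(&mut cmd_buf, &query_pool);
    cmd_buf.finish();
    device.run_cmd_buf(&cmd_buf)?; // signature assumed
    let timestamps = device.reap_query_pool(&query_pool)?; // per-stage timings
}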

View file

@ -2,7 +2,7 @@
use std::str::FromStr;
use roxmltree::{Document, Node};
use piet::kurbo::{Affine, BezPath};
@ -28,27 +28,19 @@ pub struct FillItem {
path: BezPath,
}
struct Parser<'a> {
scale: f64,
items: &'a mut Vec<Item>,
}
impl PicoSvg {
pub fn load(xml_string: &str, scale: f64) -> Result<PicoSvg, Box<dyn std::error::Error>> {
let doc = Document::parse(xml_string)?;
let root = doc.root_element();
let mut items = Vec::new();
let mut parser = Parser::new(&mut items, scale);
for node in root.children() {
parser.rec_parse(node)?;
}
Ok(PicoSvg { items })
}
@ -58,6 +50,7 @@ impl PicoSvg {
match item {
Item::Fill(fill_item) => {
rc.fill(&fill_item.path, &fill_item.color);
//rc.stroke(&fill_item.path, &fill_item.color, 1.0);
}
Item::Stroke(stroke_item) => {
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
@ -67,6 +60,59 @@ impl PicoSvg {
}
}
impl<'a> Parser<'a> {
fn new(items: &'a mut Vec<Item>, scale: f64) -> Parser<'a> {
Parser { scale, items }
}
fn rec_parse(&mut self, node: Node) -> Result<(), Box<dyn std::error::Error>> {
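// Editorial note: a non-negative scale is plain uniform scaling. A negative
// value doubles as a flip convention: x is scaled by |scale|, y is scaled by
// the negative value (flipping the image) and then translated by a hard-coded
// 1536, presumably the pixel height of the target viewport.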
let transform = if self.scale >= 0.0 {
Affine::scale(self.scale)
} else {
Affine::new([-self.scale, 0.0, 0.0, self.scale, 0.0, 1536.0])
};
if node.is_element() {
match node.tag_name().name() {
"g" => {
for child in node.children() {
self.rec_parse(child)?;
}
}
"path" => {
let d = node.attribute("d").ok_or("missing 'd' attribute")?;
let bp = BezPath::from_svg(d)?;
let path = transform * bp;
// TODO: the default fill color is black, but tiger always specifies one,
// which is why this logic only fills when an explicit color is given.
if let Some(fill_color) = node.attribute("fill") {
if fill_color != "none" {
let color = parse_color(fill_color);
let color = modify_opacity(color, "fill-opacity", node);
self.items.push(Item::Fill(FillItem {
color,
path: path.clone(),
}));
}
}
if let Some(stroke_color) = node.attribute("stroke") {
if stroke_color != "none" {
let width = self.scale.abs()
* f64::from_str(
node.attribute("stroke-width").ok_or("missing width")?,
)?;
let color = parse_color(stroke_color);
let color = modify_opacity(color, "stroke-opacity", node);
self.items
.push(Item::Stroke(StrokeItem { width, color, path }));
}
}
}
_ => (),
}
}
Ok(())
}
}
fn parse_color(color: &str) -> Color {
if color.as_bytes()[0] == b'#' {
let mut hex = u32::from_str_radix(&color[1..], 16).unwrap();
@ -74,7 +120,27 @@ fn parse_color(color: &str) -> Color {
hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
}
Color::from_rgba32_u32((hex << 8) + 0xff)
} else if color.starts_with("rgb(") {
let mut iter = color[4..color.len() - 1].split(',');
let r = u8::from_str(iter.next().unwrap()).unwrap();
let g = u8::from_str(iter.next().unwrap()).unwrap();
let b = u8::from_str(iter.next().unwrap()).unwrap();
Color::rgb8(r, g, b)
} else {
Color::from_rgba32_u32(0xff00ff80)
}
}
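// Editorial examples (not in the original), assuming the parsing above:
//   parse_color("#fff")         -> opaque white (4-char shorthand expanded)
//   parse_color("#ff0000")      -> opaque red
//   parse_color("rgb(255,0,0)") -> opaque red; note: no spaces allowed, each
//                                  component must parse as a bare u8
//   parse_color("teal")         -> named colors are not handled; falls back
//                                  to the semi-transparent magenta 0xff00ff80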
fn modify_opacity(color: Color, attr_name: &str, node: Node) -> Color {
if let Some(opacity) = node.attribute(attr_name) {
let alpha = if opacity.ends_with("%") {
let pctg = opacity[..opacity.len() - 1].parse().unwrap_or(100.0);
pctg * 0.01
} else {
opacity.parse().unwrap_or(1.0)
};
color.with_alpha(alpha)
} else {
color
}
}
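A usage sketch for the parser above (editorial; the file name and the `render` call on a piet render context are assumptions based on the surrounding crate):

let xml = std::fs::read_to_string("Ghostscript_Tiger.svg")?; // path assumed
let svg = PicoSvg::load(&xml, 8.0)?;   // positive scale: uniform scaling
// PicoSvg::load(&xml, -8.0) would take the flip branch in rec_parse
svg.render(&mut render_context);       // any piet::RenderContext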

View file

@ -2,7 +2,11 @@ use std::borrow::Cow;
use piet_gpu_types::encoder::{Encode, Encoder, Ref};
use piet_gpu_types::scene;
use piet_gpu_types::scene::{
Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup,
};
use piet_gpu_types::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke};
use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
@ -27,10 +31,10 @@ pub struct PietGpuText;
pub struct PietGpuRenderContext {
encoder: Encoder,
elements: Vec<Element>,
// Will probably need direct access to hal Device to create images etc.
inner_text: PietGpuText,
stroke_width: f32,
}
#[derive(Clone)]
@ -43,47 +47,22 @@ const TOLERANCE: f64 = 0.25;
impl PietGpuRenderContext {
pub fn new() -> PietGpuRenderContext {
let encoder = Encoder::new();
let elements = Vec::new();
let inner_text = PietGpuText;
let stroke_width = 0.0;
PietGpuRenderContext {
encoder,
elements,
inner_text,
stroke_width,
}
}
pub fn get_scene_buf(&mut self) -> &[u8] {
self.elements.encode(&mut self.encoder);
self.encoder.buf()
}
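// Editorial note: the scene buffer is now a flat stream of Element records
// (line/quad/cubic segments interleaved with SetLineWidth, Stroke, and Fill
// commands) instead of the earlier tree of PietItems with bounding boxes;
// per-element state is understood to be recovered on the GPU by a parallel
// prefix scan in the element-processing kernel.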
}
impl RenderContext for PietGpuRenderContext {
@ -107,20 +86,19 @@ impl RenderContext for PietGpuRenderContext {
fn clear(&mut self, _color: Color) {}
fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
let width = width as f32;
if self.stroke_width != width {
self.elements
.push(Element::SetLineWidth(SetLineWidth { width }));
self.stroke_width = width;
}
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
let path = shape.to_bez_path(TOLERANCE);
self.encode_path(path, false);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let stroke = Stroke { rgba_color };
self.elements.push(Element::Stroke(stroke));
}
_ => (),
}
@ -136,35 +114,13 @@ impl RenderContext for PietGpuRenderContext {
}
fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
let path = shape.to_bez_path(TOLERANCE);
self.encode_path(path, true);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let fill = Fill { rgba_color };
self.elements.push(Element::Fill(fill));
}
_ => (),
}
@ -241,45 +197,110 @@ impl RenderContext for PietGpuRenderContext {
}
}
impl PietGpuRenderContext {
fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) {
if is_fill {
self.elements.push(Element::FillLine(seg));
} else {
self.elements.push(Element::StrokeLine(seg));
}
}
fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
let flatten = true;
if flatten {
let mut start_pt = None;
let mut last_pt = None;
piet::kurbo::flatten(path, TOLERANCE, |el| {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
start_pt = Some(scene_pt);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_f32_2(p);
let seg = LineSeg {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.encode_line_seg(seg, is_fill);
last_pt = Some(scene_pt);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
if last != start {
let seg = LineSeg {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
}
}
}
_ => (),
}
//println!("{:?}", el);
});
} else {
let mut start_pt = None;
let mut last_pt = None;
for el in path {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
start_pt = Some(scene_pt);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_f32_2(p);
let seg = LineSeg {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.encode_line_seg(seg, is_fill);
last_pt = Some(scene_pt);
}
PathEl::QuadTo(p1, p2) => {
let scene_p1 = to_f32_2(p1);
let scene_p2 = to_f32_2(p2);
let seg = QuadSeg {
p0: last_pt.unwrap(),
p1: scene_p1,
p2: scene_p2,
};
self.elements.push(Element::Quad(seg));
last_pt = Some(scene_p2);
}
PathEl::CurveTo(p1, p2, p3) => {
let scene_p1 = to_f32_2(p1);
let scene_p2 = to_f32_2(p2);
let scene_p3 = to_f32_2(p3);
let seg = CubicSeg {
p0: last_pt.unwrap(),
p1: scene_p1,
p2: scene_p2,
p3: scene_p3,
};
self.elements.push(Element::Cubic(seg));
last_pt = Some(scene_p3);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
if last != start {
let seg = LineSeg {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
}
}
}
}
//println!("{:?}", el);
}
}
}
}
impl Text for PietGpuText {
@ -360,13 +381,6 @@ impl IntoBrush<PietGpuRenderContext> for PietGpuBrush {
}
}
fn to_f32_2(point: Point) -> [f32; 2] {
[point.x as f32, point.y as f32]
}
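Finally, an editorial end-to-end sketch of the new encoding path (`solid_brush` comes from the piet RenderContext trait; the comments on what each call emits follow the encode_path logic above):

use piet::kurbo::{Circle, Line};
use piet::{Color, RenderContext};

let mut ctx = PietGpuRenderContext::new();
let brush = ctx.solid_brush(Color::rgb8(0x00, 0x80, 0xff));
// fill(): encode_path(.., true) emits FillLine segments, then a Fill element.
ctx.fill(Circle::new((100.0, 100.0), 50.0), &brush);
// stroke(): SetLineWidth (when the width changes), StrokeLine segments, then
// a Stroke element.
ctx.stroke(Line::new((0.0, 0.0), (100.0, 100.0)), &brush, 5.0);
let scene: &[u8] = ctx.get_scene_buf(); // bytes consumed by the element kernel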