Merge pull request #19 from linebender/sort_middle

Bring sort_middle branch to master
2025-01-09 20:31:29 +11:00 · 2020-06-11 16:16:10 -07:00 · 2020-06-11 16:16:10 -07:00 · 73df5534a1
parent 8d01aba237 feeaa31fd1
commit 73df5534a1
41 changed files with 2673 additions and 1310 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -26,6 +26,15 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "000444226fcff248f2bc4c7625be32c63caccfecc2723a2b9f78a7487a49c407"
 [[package]]
 name = "ansi_term"
 version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
 dependencies = [
 "winapi 0.3.8",
 ]
 [[package]]
 name = "approx"
 version = "0.3.2"
@ -59,6 +68,17 @@ dependencies = [
 "raw-window-handle",
 ]
 [[package]]
 name = "atty"
 version = "0.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
 dependencies = [
 "hermit-abi",
 "libc",
 "winapi 0.3.8",
 ]
 [[package]]
 name = "autocfg"
 version = "1.0.0"
@ -106,6 +126,21 @@ version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
 [[package]]
 name = "clap"
 version = "2.33.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129"
 dependencies = [
 "ansi_term",
 "atty",
 "bitflags",
 "strsim",
 "textwrap",
 "unicode-width",
 "vec_map",
 ]
 [[package]]
 name = "cloudabi"
 version = "0.0.3"
@ -259,6 +294,15 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f36b5f248235f45773d4944f555f83ea61fe07b18b561ccf99d7483d7381e54d"
 [[package]]
 name = "hermit-abi"
 version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71"
 dependencies = [
 "libc",
 ]
 [[package]]
 name = "inflate"
 version = "0.4.5"
@ -525,6 +569,7 @@ dependencies = [
 name = "piet-gpu"
 version = "0.1.0"
 dependencies = [
 "clap",
 "piet",
 "piet-gpu-hal",
 "piet-gpu-types",
@ -758,6 +803,12 @@ dependencies = [
 "byteorder",
 ]
 [[package]]
 name = "strsim"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
 [[package]]
 name = "syn"
 version = "1.0.17"
@ -769,6 +820,21 @@ dependencies = [
 "unicode-xid 0.2.0",
 ]
 [[package]]
 name = "textwrap"
 version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
 dependencies = [
 "unicode-width",
 ]
 [[package]]
 name = "unicode-width"
 version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
 [[package]]
 name = "unicode-xid"
 version = "0.1.0"
@ -781,6 +847,12 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
 [[package]]
 name = "vec_map"
 version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
 [[package]]
 name = "void"
 version = "1.0.2"
--- a/piet-gpu-types/src/annotated.rs
+++ b/piet-gpu-types/src/annotated.rs
@ -0,0 +1,53 @@
 use piet_gpu_derive::piet_gpu;
 // Record layouts for the `annotated` buffer, declared through the
 // `piet_gpu!` macro. The struct and variant names here match those in
 // piet-gpu/shader/annotated.h, which is marked as generated by
 // piet-gpu-derive.
 piet_gpu! {
    #[gpu_write]
    mod annotated {
        // Line segment of a fill: two endpoints, no stroke extent.
        struct AnnoFillLineSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            // A note: the layout of this struct is shared with
            // AnnoStrokeLineSeg. In that case, we actually write
            // [0.0, 0.0] as the stroke field, to minimize divergence.
        }
        // Line segment of a stroke: two endpoints plus the stroke extent.
        struct AnnoStrokeLineSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            // halfwidth in both x and y for binning
            stroke: [f32; 2],
        }
        // Quadratic segment with three control points.
        // NOTE(review): `stroke` presumably has the same meaning as in
        // AnnoStrokeLineSeg -- confirm.
        struct AnnoQuadSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            p2: [f32; 2],
            stroke: [f32; 2],
        }
        // Cubic segment with four control points.
        // NOTE(review): `stroke` presumably has the same meaning as in
        // AnnoStrokeLineSeg -- confirm.
        struct AnnoCubicSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            p2: [f32; 2],
            p3: [f32; 2],
            stroke: [f32; 2],
        }
        // Fill record: a packed color and a four-float bounding box.
        // NOTE(review): component order of `bbox` is not stated here -- confirm.
        struct AnnoFill {
            rgba_color: u32,
            bbox: [f32; 4],
        }
        // Stroke record: same fields as AnnoFill plus a scalar line width.
        struct AnnoStroke {
            rgba_color: u32,
            bbox: [f32; 4],
            // For the nonuniform scale case, this needs to be a 2x2 matrix.
            // That's expected to be uncommon, so we could special-case it.
            linewidth: f32,
        }
        // Tagged union over the records above. In the generated header the
        // tags follow declaration order: Nop = 0 through Fill = 6.
        enum Annotated {
            Nop,
            FillLine(AnnoFillLineSeg),
            StrokeLine(AnnoStrokeLineSeg),
            Quad(AnnoQuadSeg),
            Cubic(AnnoCubicSeg),
            Stroke(AnnoStroke),
            Fill(AnnoFill),
        }
    }
 }
--- a/piet-gpu-types/src/bins.rs
+++ b/piet-gpu-types/src/bins.rs
@ -0,0 +1,22 @@
 use piet_gpu_derive::piet_gpu;
 // The output of the binning stage, organized as a linked list of chunks.
 piet_gpu! {
    #[gpu_write]
    mod bins {
        // One entry of a bin: an element index plus that element's right edge.
        struct BinInstance {
            element_ix: u32,
            // Right edge of the bounding box of the associated fill
            // element; used in backdrop computation.
            right_edge: f32,
        }
        // Header of one chunk in a bin's linked list. `n` counts the
        // instances stored after the header and `next` refers to the
        // following chunk.
        // NOTE(review): how the end of the list is encoded in `next` is not
        // stated here -- confirm against the shader code.
        struct BinChunk {
            // First chunk can have n = 0, subsequent ones not.
            n: u32,
            next: Ref<BinChunk>,
            // Instances follow
        }
    }
 }
--- a/piet-gpu-types/src/fill_seg.rs
+++ b/piet-gpu-types/src/fill_seg.rs
@ -1,37 +0,0 @@
 use piet_gpu_derive::piet_gpu;
 // Structures representing segments for fill items.
 // There is some cut'n'paste here from stroke segments, which can be
 // traced to the fact that buffers in GLSL are basically global.
 // Maybe there's a way to address that, but in the meantime living
 // with the duplication is easiest.
 piet_gpu! {
    #[gpu_write]
    mod fill_seg {
        struct FillTileHeader {
            n: u32,
            items: Ref<FillItemHeader>,
        }
        struct FillItemHeader {
            backdrop: i32,
            segments: Ref<FillSegChunk>,
        }
        // TODO: strongly consider using f16. If so, these would be
        // relative to the tile. We're doing f32 for now to minimize
        // divergence from piet-metal originals.
        struct FillSegment {
            start: [f32; 2],
            end: [f32; 2],
        }
        struct FillSegChunk {
            n: u32,
            next: Ref<FillSegChunk>,
            // Segments follow (could represent this as a variable sized array).
        }
    }
 }
--- a/piet-gpu-types/src/lib.rs
+++ b/piet-gpu-types/src/lib.rs
@ -1,7 +1,10 @@
 // Structures used only internally probably don't need to be pub.
 pub mod annotated;
 pub mod bins;
 pub mod encoder;
 pub mod fill_seg;
 pub mod ptcl;
 pub mod scene;
-pub mod segment;
+pub mod state;
 pub mod test;
 pub mod tilegroup;
--- a/piet-gpu-types/src/main.rs
+++ b/piet-gpu-types/src/main.rs
@ -5,9 +5,10 @@ fn main() {
        .expect("provide a module name");
    match mod_name.as_str() {
        "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
        "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
        "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
        "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
        "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
        "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
        "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
        "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
        "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
        _ => println!("Oops, unknown module name"),
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@ -13,14 +13,13 @@ piet_gpu! {
            end: [f32; 2],
        }
        struct CmdStroke {
-            // Should be Ref<SegChunk> if we had cross-module references.
-            seg_ref: u32,
+            // Consider a specialization to one segment.
+            seg_ref: Ref<SegChunk>,
            half_width: f32,
            rgba_color: u32,
        }
        struct CmdFill {
-            // Should be Ref<FillSegChunk> if we had cross-module references.
-            seg_ref: u32,
+            seg_ref: Ref<SegChunk>,
            backdrop: i32,
            rgba_color: u32,
        }
@ -51,5 +50,24 @@ piet_gpu! {
            Jump(CmdJump),
            Bail,
        }
        // TODO: strongly consider using f16. If so, these would be
        // relative to the tile. We're doing f32 for now to minimize
        // divergence from piet-metal originals.
        // A line segment given by its two endpoints (five 32-bit words in all).
        struct Segment {
            start: [f32; 2],
            end: [f32; 2],
            // This is used for fills only, but we're including it in
            // the general structure for simplicity.
            y_edge: f32,
        }
        // One node of a linked list of segment runs: `n` is the number of
        // segments reachable through `segs`, `next` refers to the following
        // chunk.
        struct SegChunk {
            n: u32,
            next: Ref<SegChunk>,
            // Actually a reference to a variable-sized slice.
            segs: Ref<Segment>,
        }
    }
 }
--- a/piet-gpu-types/src/scene.rs
+++ b/piet-gpu-types/src/scene.rs
@ -4,6 +4,8 @@ pub use self::scene::{
    Bbox, PietCircle, PietFill, PietItem, PietStrokeLine, PietStrokePolyLine, Point, SimpleGroup,
 };
 pub use self::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform};
 piet_gpu! {
    #[rust_encode]
    mod scene {
@ -51,5 +53,53 @@ piet_gpu! {
            Fill(PietFill),
            Poly(PietStrokePolyLine),
        }
        // New approach follows (above to be deleted)
        // Line segment given by two points.
        struct LineSeg {
            p0: [f32; 2],
            p1: [f32; 2],
        }
        // Quadratic segment given by three control points.
        struct QuadSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            p2: [f32; 2],
        }
        // Cubic segment given by four control points.
        struct CubicSeg {
            p0: [f32; 2],
            p1: [f32; 2],
            p2: [f32; 2],
            p3: [f32; 2],
        }
        // Fill element; carries only a packed color.
        struct Fill {
            rgba_color: u32,
        }
        // Stroke element; carries only a packed color. The line width is a
        // separate element (SetLineWidth).
        struct Stroke {
            rgba_color: u32,
        }
        struct SetLineWidth {
            width: f32,
        }
        // Transform as four matrix entries plus a two-component translation.
        // NOTE(review): presumably a 2x2 linear part; the element order of
        // `mat` is not stated here -- confirm.
        struct Transform {
            mat: [f32; 4],
            translate: [f32; 2],
        }
        // Tagged union of everything that can appear in the scene stream.
        enum Element {
            Nop,
            // Another approach to encoding would be to use a single
            // variant but have a bool for fill/stroke. This could be
            // packed into the tag, so the on-the-wire representation
            // would be very similar to what's here.
            StrokeLine(LineSeg),
            FillLine(LineSeg),
            // Note: we'll need to handle the stroke/fill distinction
            // for these as well, when we do flattening on the GPU.
            Quad(QuadSeg),
            Cubic(CubicSeg),
            Stroke(Stroke),
            Fill(Fill),
            SetLineWidth(SetLineWidth),
            Transform(Transform),
        }
    }
 }
--- a/piet-gpu-types/src/segment.rs
+++ b/piet-gpu-types/src/segment.rs
@ -1,32 +0,0 @@
 use piet_gpu_derive::piet_gpu;
 // Structures representing segments for stroke/fill items.
 piet_gpu! {
    #[gpu_write]
    mod segment {
        struct TileHeader {
            n: u32,
            items: Ref<ItemHeader>,
        }
        // Note: this is only suitable for strokes, fills require backdrop.
        struct ItemHeader {
            segments: Ref<SegChunk>,
        }
        // TODO: strongly consider using f16. If so, these would be
        // relative to the tile. We're doing f32 for now to minimize
        // divergence from piet-metal originals.
        struct Segment {
            start: [f32; 2],
            end: [f32; 2],
        }
        struct SegChunk {
            n: u32,
            next: Ref<SegChunk>,
            // Segments follow (could represent this as a variable sized array).
        }
    }
 }
--- a/piet-gpu-types/src/state.rs
+++ b/piet-gpu-types/src/state.rs
@ -0,0 +1,14 @@
 use piet_gpu_derive::piet_gpu;
 piet_gpu! {
    #[gpu_write]
    mod state {
        // One state record: eleven f32 values followed by a u32, 48 bytes
        // in total when the fields are packed in declaration order.
        // `mat` and `translate` have the same shape as the fields of the
        // scene `Transform` element.
        // NOTE(review): the meaning of the individual `flags` bits is not
        // visible here -- confirm against the shader code.
        struct State {
            mat: [f32; 4],
            translate: [f32; 2],
            bbox: [f32; 4],
            linewidth: f32,
            flags: u32,
        }
    }
 }
--- a/piet-gpu/Cargo.toml
+++ b/piet-gpu/Cargo.toml
@ -26,3 +26,4 @@ png = "0.16.2"
 rand = "0.7.3"
 roxmltree = "0.11"
 winit = "0.22"
 clap = "2.33"
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@ -2,10 +2,12 @@ use std::fs::File;
 use std::io::BufWriter;
 use std::path::Path;
 use clap::{Arg, App};
 use piet_gpu_hal::vulkan::VkInstance;
 use piet_gpu_hal::{CmdBuf, Device, Error, MemFlags};
-use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
+use piet_gpu::{render_scene, render_svg, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
 #[allow(unused)]
 fn dump_scene(buf: &[u8]) {
@ -16,22 +18,179 @@ fn dump_scene(buf: &[u8]) {
    }
 }
 /// Print every `State` record found in `buf`, for diagnostic purposes.
 ///
 /// `buf` is read as a sequence of 48-byte little-endian records: eleven
 /// `f32` values (matrix, translation, bounding box, line width) followed by
 /// one `u32` flags word. Trailing bytes that do not form a whole record are
 /// ignored.
 #[allow(unused)]
 fn dump_state(buf: &[u8]) {
    // Size in bytes of one record (twelve four-byte words).
    const STATE_SIZE: usize = 48;
    for (i, record) in buf.chunks_exact(STATE_SIZE).enumerate() {
        // Copy out the k-th four-byte word of this record.
        let word = |k: usize| {
            let mut bytes = [0u8; 4];
            bytes.copy_from_slice(&record[k * 4..k * 4 + 4]);
            bytes
        };
        let floats = (0..11)
            .map(|k| f32::from_le_bytes(word(k)))
            .collect::<Vec<_>>();
        // `flags` is a full 32-bit word; decode all four bytes instead of
        // printing only the lowest one.
        let flags = u32::from_le_bytes(word(11));
        println!("{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}",
            i,
            floats[0], floats[1], floats[2], floats[3], floats[4], floats[5],
            floats[6], floats[7], floats[8], floats[9],
            floats[10], flags);
    }
 }
 /// Diagnostic dump of the binning stage output.
 ///
 /// For each of the 256 bins, the 16 chunk lists that start at byte offsets
 /// `(bin * 16 + i) * 64` are merged: the list whose head chunk has the
 /// smallest first payload word is printed and advanced to its `next` chunk,
 /// until every list is exhausted or has an empty head.
 ///
 /// A chunk is read as `[n, next, payload...]` in 32-bit words, and `n`
 /// consecutive payload words are printed.
 /// NOTE(review): `BinInstance` in bins.rs has two fields; confirm that one
 /// word per instance is what this trace is meant to show.
 #[allow(unused)]
 fn trace_merge(buf: &[u32]) {
    // Key reported for a list that has nothing to contribute.
    const DONE: u32 = !0;
    // Key of the chunk at byte offset `st`: its first payload word, or DONE
    // when the chunk's count word is zero.
    let head_key = |st: usize| if buf[st / 4] == 0 { DONE } else { buf[st / 4 + 2] };
    for bin in 0..256 {
        println!("bin {}:", bin);
        let mut starts: Vec<Option<usize>> =
            (0..16).map(|i| Some((bin * 16 + i) * 64)).collect();
        loop {
            let min_start = starts
                .iter()
                .map(|st| st.map_or(DONE, |st| head_key(st)))
                .min()
                .unwrap();
            if min_start == DONE {
                break;
            }
            // First list whose head is non-empty and carries the minimum key.
            let selected = starts
                .iter()
                .position(|st| {
                    st.map_or(false, |st| buf[st / 4] != 0 && buf[st / 4 + 2] == min_start)
                })
                .expect("a non-empty head with the minimum key must exist");
            let st = starts[selected].unwrap();
            let chunk = st / 4;
            println!("selected {}, start {:x}", selected, st);
            for j in 0..buf[chunk] as usize {
                println!("{:x}", buf[chunk + 2 + j]);
            }
            // A zero `next` word ends the list.
            let next = buf[chunk + 1];
            starts[selected] = if next == 0 { None } else { Some(next as usize) };
        }
    }
 }
 /// Print every segment chunk of the list that starts at byte offset
 /// `first_chunk`.
 ///
 /// A chunk is read as three words `[n, next, segs]`; `segs` is the byte
 /// offset of `n` five-word segments `(x0, y0, x1, y1, y_edge)`. The walk
 /// stops when `next` is zero.
 #[allow(unused)]
 fn trace_seg_chunks(buf: &[u32], first_chunk: usize) {
    let mut seg_chunk = first_chunk;
    loop {
        let n = buf[seg_chunk / 4] as usize;
        let segs = buf[seg_chunk / 4 + 2] as usize;
        println!("    chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
        for i in 0..n {
            let base = segs / 4 + i * 5;
            let x0 = f32::from_bits(buf[base]);
            let y0 = f32::from_bits(buf[base + 1]);
            let x1 = f32::from_bits(buf[base + 2]);
            let y1 = f32::from_bits(buf[base + 3]);
            let y_edge = f32::from_bits(buf[base + 4]);
            println!("      ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge);
        }
        seg_chunk = buf[seg_chunk / 4 + 1] as usize;
        if seg_chunk == 0 {
            break;
        }
    }
 }
 /// Interpret the output of the coarse raster stage, for diagnostic purposes.
 ///
 /// Walks a 128 x 96 grid of tiles. Each tile's command list starts at byte
 /// offset `tile_ix * 1024` and is a sequence of 20-byte commands whose first
 /// word is a tag: 0 ends the list, 3 is a fill, 4 is a stroke, and 8
 /// redirects to the byte offset stored in the following word. Any other tag
 /// is printed as a number and stepped over. For fills and strokes, every
 /// chunk of the attached segment list is printed.
 #[allow(unused)]
 fn trace_ptcl(buf: &[u32]) {
    for y in 0..96 {
        for x in 0..128 {
            let tile_ix = y * 128 + x;
            println!("tile {} @({}, {})", tile_ix, x, y);
            let mut tile_offset = tile_ix * 1024;
            loop {
                let cmd = tile_offset / 4;
                let tag = buf[cmd];
                match tag {
                    0 => break,
                    3 => {
                        // `backdrop` is a signed field; reinterpret the word
                        // so negative values print as such.
                        let backdrop = buf[cmd + 2] as i32;
                        let rgba_color = buf[cmd + 3];
                        println!("  {:x}: fill {:x} {}", tile_offset, rgba_color, backdrop);
                        trace_seg_chunks(buf, buf[cmd + 1] as usize);
                    }
                    4 => {
                        let half_width = f32::from_bits(buf[cmd + 2]);
                        let rgba_color = buf[cmd + 3];
                        println!("  {:x}: stroke {:x} {}", tile_offset, rgba_color, half_width);
                        trace_seg_chunks(buf, buf[cmd + 1] as usize);
                    }
                    _ => {
                        println!("{:x}: {}", tile_offset, tag);
                    }
                }
                tile_offset = if tag == 8 {
                    buf[cmd + 1] as usize
                } else {
                    tile_offset + 20
                };
            }
        }
    }
 }
 fn main() -> Result<(), Error> {
    let matches = App::new("piet-gpu test")
        .arg(Arg::with_name("INPUT")
            .index(1))
        .arg(Arg::with_name("flip")
            .short("f")
            .long("flip"))
        .arg(Arg::with_name("scale")
            .short("s")
            .long("scale")
            .takes_value(true))
        .get_matches();
    let (instance, _) = VkInstance::new(None)?;
    unsafe {
        let device = instance.device(None)?;
        let fence = device.create_fence(false)?;
        let mut cmd_buf = device.create_cmd_buf()?;
-        let query_pool = device.create_query_pool(6)?;
+        let query_pool = device.create_query_pool(5)?;
        let mut ctx = PietGpuRenderContext::new();
-        render_scene(&mut ctx);
+        if let Some(input) = matches.value_of("INPUT") {
            let mut scale = matches.value_of("scale")
                .map(|scale| scale.parse().unwrap())
                .unwrap_or(8.0);
            if matches.is_present("flip") {
                scale = -scale;
            }
            render_svg(&mut ctx, input, scale);
        } else {
            render_scene(&mut ctx);
        }
        let scene = ctx.get_scene_buf();
        //dump_scene(&scene);
        let renderer = Renderer::new(&device, scene)?;
-        let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
+        let image_buf =
            device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
        cmd_buf.begin();
        renderer.record(&mut cmd_buf, &query_pool);
@ -39,29 +198,17 @@ fn main() -> Result<(), Error> {
        cmd_buf.finish();
        device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
        device.wait_and_reset(&[fence])?;
-        let timestamps = device.reap_query_pool(&query_pool).unwrap();
-        println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
-        println!(
-            "Kernel 2s time: {:.3}ms",
-            (timestamps[1] - timestamps[0]) * 1e3
-        );
-        println!(
-            "Kernel 2f time: {:.3}ms",
-            (timestamps[2] - timestamps[1]) * 1e3
-        );
-        println!(
-            "Kernel 3 time: {:.3}ms",
-            (timestamps[3] - timestamps[2]) * 1e3
-        );
-        println!(
-            "Render time: {:.3}ms",
-            (timestamps[4] - timestamps[3]) * 1e3
-        );
+        let ts = device.reap_query_pool(&query_pool).unwrap();
+        println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
+        println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
+        println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
+        println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
        /*
-        let mut k1_data: Vec<u32> = Default::default();
+        let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&segment_buf, &mut k1_data).unwrap();
+        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
-        dump_k1_data(&k1_data);
+        piet_gpu::dump_k1_data(&data);
        //trace_ptcl(&data);
        */
        let mut img_data: Vec<u8> = Default::default();
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@ -1,7 +1,7 @@
 use piet_gpu_hal::vulkan::VkInstance;
 use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout};
-use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
+use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
 use winit::{
    event::{Event, WindowEvent},
@ -37,7 +37,7 @@ fn main() -> Result<(), Error> {
            .map(|_| device.create_cmd_buf())
            .collect::<Result<Vec<_>, Error>>()?;
        let query_pools = (0..NUM_FRAMES)
-            .map(|_| device.create_query_pool(6))
+            .map(|_| device.create_query_pool(5))
            .collect::<Result<Vec<_>, Error>>()?;
        let mut ctx = PietGpuRenderContext::new();
@ -69,12 +69,12 @@ fn main() -> Result<(), Error> {
                        device.wait_and_reset(&[frame_fences[frame_idx]]).unwrap();
                        let timestamps = device.reap_query_pool(query_pool).unwrap();
-                        window.set_title(&format!("k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms",
+                        window.set_title(&format!(
+                            "e: {:.3}ms, b: {:.3}ms, c: {:.3}ms, f: {:.3}ms",
                            timestamps[0] * 1e3,
                            (timestamps[1] - timestamps[0]) * 1e3,
                            (timestamps[2] - timestamps[1]) * 1e3,
                            (timestamps[3] - timestamps[2]) * 1e3,
-                            (timestamps[4] - timestamps[3]) * 1e3,
                        ));
                    }
@ -93,11 +93,7 @@ fn main() -> Result<(), Error> {
                        ImageLayout::BlitDst,
                    );
                    cmd_buf.blit_image(&renderer.image_dev, &swap_image);
-                    cmd_buf.image_barrier(
-                        &swap_image,
-                        ImageLayout::BlitDst,
-                        ImageLayout::Present,
-                    );
+                    cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
                    cmd_buf.finish();
                    device
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@ -0,0 +1,335 @@
 // Code auto-generated by piet-gpu-derive
 struct AnnoFillLineSegRef {
    uint offset;
 };
 struct AnnoStrokeLineSegRef {
    uint offset;
 };
 struct AnnoQuadSegRef {
    uint offset;
 };
 struct AnnoCubicSegRef {
    uint offset;
 };
 struct AnnoFillRef {
    uint offset;
 };
 struct AnnoStrokeRef {
    uint offset;
 };
 struct AnnotatedRef {
    uint offset;
 };
 struct AnnoFillLineSeg {
    vec2 p0;
    vec2 p1;
 };
 #define AnnoFillLineSeg_size 16
 AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
    return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size);
 }
 struct AnnoStrokeLineSeg {
    vec2 p0;
    vec2 p1;
    vec2 stroke;
 };
 #define AnnoStrokeLineSeg_size 24
 AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) {
    return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size);
 }
 struct AnnoQuadSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 stroke;
 };
 #define AnnoQuadSeg_size 32
 AnnoQuadSegRef AnnoQuadSeg_index(AnnoQuadSegRef ref, uint index) {
    return AnnoQuadSegRef(ref.offset + index * AnnoQuadSeg_size);
 }
 struct AnnoCubicSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
    vec2 stroke;
 };
 #define AnnoCubicSeg_size 40
 AnnoCubicSegRef AnnoCubicSeg_index(AnnoCubicSegRef ref, uint index) {
    return AnnoCubicSegRef(ref.offset + index * AnnoCubicSeg_size);
 }
 struct AnnoFill {
    uint rgba_color;
    vec4 bbox;
 };
 #define AnnoFill_size 20
 AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) {
    return AnnoFillRef(ref.offset + index * AnnoFill_size);
 }
 struct AnnoStroke {
    uint rgba_color;
    vec4 bbox;
    float linewidth;
 };
 #define AnnoStroke_size 24
 AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
    return AnnoStrokeRef(ref.offset + index * AnnoStroke_size);
 }
 #define Annotated_Nop 0
 #define Annotated_FillLine 1
 #define Annotated_StrokeLine 2
 #define Annotated_Quad 3
 #define Annotated_Cubic 4
 #define Annotated_Stroke 5
 #define Annotated_Fill 6
 #define Annotated_size 44
 AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
    return AnnotatedRef(ref.offset + index * Annotated_size);
 }
 AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = annotated[ix + 0];
    uint raw1 = annotated[ix + 1];
    uint raw2 = annotated[ix + 2];
    uint raw3 = annotated[ix + 3];
    AnnoFillLineSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
 }
 void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) {
    uint ix = ref.offset >> 2;
    annotated[ix + 0] = floatBitsToUint(s.p0.x);
    annotated[ix + 1] = floatBitsToUint(s.p0.y);
    annotated[ix + 2] = floatBitsToUint(s.p1.x);
    annotated[ix + 3] = floatBitsToUint(s.p1.y);
 }
 AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = annotated[ix + 0];
    uint raw1 = annotated[ix + 1];
    uint raw2 = annotated[ix + 2];
    uint raw3 = annotated[ix + 3];
    uint raw4 = annotated[ix + 4];
    uint raw5 = annotated[ix + 5];
    AnnoStrokeLineSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    return s;
 }
 void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) {
    uint ix = ref.offset >> 2;
    annotated[ix + 0] = floatBitsToUint(s.p0.x);
    annotated[ix + 1] = floatBitsToUint(s.p0.y);
    annotated[ix + 2] = floatBitsToUint(s.p1.x);
    annotated[ix + 3] = floatBitsToUint(s.p1.y);
    annotated[ix + 4] = floatBitsToUint(s.stroke.x);
    annotated[ix + 5] = floatBitsToUint(s.stroke.y);
 }
 AnnoQuadSeg AnnoQuadSeg_read(AnnoQuadSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = annotated[ix + 0];
    uint raw1 = annotated[ix + 1];
    uint raw2 = annotated[ix + 2];
    uint raw3 = annotated[ix + 3];
    uint raw4 = annotated[ix + 4];
    uint raw5 = annotated[ix + 5];
    uint raw6 = annotated[ix + 6];
    uint raw7 = annotated[ix + 7];
    AnnoQuadSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.stroke = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    return s;
 }
 void AnnoQuadSeg_write(AnnoQuadSegRef ref, AnnoQuadSeg s) {
    uint ix = ref.offset >> 2;
    annotated[ix + 0] = floatBitsToUint(s.p0.x);
    annotated[ix + 1] = floatBitsToUint(s.p0.y);
    annotated[ix + 2] = floatBitsToUint(s.p1.x);
    annotated[ix + 3] = floatBitsToUint(s.p1.y);
    annotated[ix + 4] = floatBitsToUint(s.p2.x);
    annotated[ix + 5] = floatBitsToUint(s.p2.y);
    annotated[ix + 6] = floatBitsToUint(s.stroke.x);
    annotated[ix + 7] = floatBitsToUint(s.stroke.y);
 }
 AnnoCubicSeg AnnoCubicSeg_read(AnnoCubicSegRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = annotated[ix + 0];
    uint raw1 = annotated[ix + 1];
    uint raw2 = annotated[ix + 2];
    uint raw3 = annotated[ix + 3];
    uint raw4 = annotated[ix + 4];
    uint raw5 = annotated[ix + 5];
    uint raw6 = annotated[ix + 6];
    uint raw7 = annotated[ix + 7];
    uint raw8 = annotated[ix + 8];
    uint raw9 = annotated[ix + 9];
    AnnoCubicSeg s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
    s.stroke = vec2(uintBitsToFloat(raw8), uintBitsToFloat(raw9));
    return s;
 }
 void AnnoCubicSeg_write(AnnoCubicSegRef ref, AnnoCubicSeg s) {
    uint ix = ref.offset >> 2;
    annotated[ix + 0] = floatBitsToUint(s.p0.x);
    annotated[ix + 1] = floatBitsToUint(s.p0.y);
    annotated[ix + 2] = floatBitsToUint(s.p1.x);
    annotated[ix + 3] = floatBitsToUint(s.p1.y);
    annotated[ix + 4] = floatBitsToUint(s.p2.x);
    annotated[ix + 5] = floatBitsToUint(s.p2.y);
    annotated[ix + 6] = floatBitsToUint(s.p3.x);
    annotated[ix + 7] = floatBitsToUint(s.p3.y);
    annotated[ix + 8] = floatBitsToUint(s.stroke.x);
    annotated[ix + 9] = floatBitsToUint(s.stroke.y);
 }
 AnnoFill AnnoFill_read(AnnoFillRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = annotated[ix + 0];
    uint raw1 = annotated[ix + 1];
    uint raw2 = annotated[ix + 2];
    uint raw3 = annotated[ix + 3];
    uint raw4 = annotated[ix + 4];
    AnnoFill s;
    s.rgba_color = raw0;
    s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
    return s;
 }
 void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
    uint ix = ref.offset >> 2;
    annotated[ix + 0] = s.rgba_color;
    annotated[ix + 1] = floatBitsToUint(s.bbox.x);
    annotated[ix + 2] = floatBitsToUint(s.bbox.y);
    annotated[ix + 3] = floatBitsToUint(s.bbox.z);
    annotated[ix + 4] = floatBitsToUint(s.bbox.w);
 }
 AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = annotated[ix + 0];
    uint raw1 = annotated[ix + 1];
    uint raw2 = annotated[ix + 2];
    uint raw3 = annotated[ix + 3];
    uint raw4 = annotated[ix + 4];
    uint raw5 = annotated[ix + 5];
    AnnoStroke s;
    s.rgba_color = raw0;
    s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
    s.linewidth = uintBitsToFloat(raw5);
    return s;
 }
 void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
    uint ix = ref.offset >> 2;
    annotated[ix + 0] = s.rgba_color;
    annotated[ix + 1] = floatBitsToUint(s.bbox.x);
    annotated[ix + 2] = floatBitsToUint(s.bbox.y);
    annotated[ix + 3] = floatBitsToUint(s.bbox.z);
    annotated[ix + 4] = floatBitsToUint(s.bbox.w);
    annotated[ix + 5] = floatBitsToUint(s.linewidth);
 }
 uint Annotated_tag(AnnotatedRef ref) {
    return annotated[ref.offset >> 2];
 }
 AnnoFillLineSeg Annotated_FillLine_read(AnnotatedRef ref) {
    return AnnoFillLineSeg_read(AnnoFillLineSegRef(ref.offset + 4));
 }
 AnnoStrokeLineSeg Annotated_StrokeLine_read(AnnotatedRef ref) {
    return AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef(ref.offset + 4));
 }
 AnnoQuadSeg Annotated_Quad_read(AnnotatedRef ref) {
    return AnnoQuadSeg_read(AnnoQuadSegRef(ref.offset + 4));
 }
 AnnoCubicSeg Annotated_Cubic_read(AnnotatedRef ref) {
    return AnnoCubicSeg_read(AnnoCubicSegRef(ref.offset + 4));
 }
 AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) {
    return AnnoStroke_read(AnnoStrokeRef(ref.offset + 4));
 }
 AnnoFill Annotated_Fill_read(AnnotatedRef ref) {
    return AnnoFill_read(AnnoFillRef(ref.offset + 4));
 }
 void Annotated_Nop_write(AnnotatedRef ref) {
    annotated[ref.offset >> 2] = Annotated_Nop;
 }
 void Annotated_FillLine_write(AnnotatedRef ref, AnnoFillLineSeg s) {
    annotated[ref.offset >> 2] = Annotated_FillLine;
    AnnoFillLineSeg_write(AnnoFillLineSegRef(ref.offset + 4), s);
 }
 void Annotated_StrokeLine_write(AnnotatedRef ref, AnnoStrokeLineSeg s) {
    annotated[ref.offset >> 2] = Annotated_StrokeLine;
    AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(ref.offset + 4), s);
 }
 void Annotated_Quad_write(AnnotatedRef ref, AnnoQuadSeg s) {
    annotated[ref.offset >> 2] = Annotated_Quad;
    AnnoQuadSeg_write(AnnoQuadSegRef(ref.offset + 4), s);
 }
 void Annotated_Cubic_write(AnnotatedRef ref, AnnoCubicSeg s) {
    annotated[ref.offset >> 2] = Annotated_Cubic;
    AnnoCubicSeg_write(AnnoCubicSegRef(ref.offset + 4), s);
 }
 void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) {
    annotated[ref.offset >> 2] = Annotated_Stroke;
    AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s);
 }
 void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) {
    annotated[ref.offset >> 2] = Annotated_Fill;
    AnnoFill_write(AnnoFillRef(ref.offset + 4), s);
 }
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@ -0,0 +1,193 @@
 // The binning stage of the pipeline.
 //
 // Each workgroup reads one partition of N_TILE annotated elements, works out
 // which bins each element's bounding box touches, and writes per-bin lists of
 // BinInstance records into the bins buffer.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 #include "setup.h"
 // One invocation per element of the partition (and, later, one per bin).
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 // Input: annotated elements, accessed through the helpers in annotated.h.
 layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint[] annotated;
 };
 // This is for scanning forward for right_edge data.
 layout(set = 0, binding = 1) buffer StateBuf {
    uint[] state;
 };
 layout(set = 0, binding = 2) buffer AllocBuf {
    // Number of valid annotated elements; indices at or beyond it are treated as Nop.
    uint n_elements;
    // Will be incremented atomically to claim tiles
    uint tile_ix;
    // Bump-allocator cursor (byte offset into bins), advanced with atomicAdd.
    uint alloc;
 };
 // Output: per-(partition, bin) headers followed by allocated BinInstance chunks.
 layout(set = 0, binding = 3) buffer BinsBuf {
    uint[] bins;
 };
 #include "annotated.h"
 #include "state.h"
 #include "bins.h"
 // scale factors useful for converting coordinates to bins
 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
 #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
 // Scale factor converting a y coordinate to tile rows (not bin rows).
 #define TSY (1.0 / float(TILE_HEIGHT_PX))
 // Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
 #define INFINITY (1.0 / 0.0)
 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
 // bitmaps[slice][bin]: bit k of slice s is set when element 32 * s + k of
 // this partition covers the bin.
 shared uint bitmaps[N_SLICE][N_TILE];
 // count[slice][bin]: running total of set bits in bitmaps for slices 0..slice.
 shared uint count[N_SLICE][N_TILE];
 // Byte offset of each bin's allocated output chunk (only written for non-empty bins).
 shared uint sh_chunk_start[N_TILE];
 // Scratch used to propagate right_edge values between invocations.
 shared float sh_right_edge[N_TILE];
 // Byte stride of one per-partition record in the state buffer.
 #define StateBuf_stride (8 + 2 * State_size)
 // Word index into state[] of the right_edge value recorded for a partition.
 uint state_right_edge_index(uint partition_ix) {
    // The stride is given in bytes; state[] is addressed in 4-byte words.
    uint words_per_partition = StateBuf_stride / 4;
    return partition_ix * words_per_partition + 2;
 }
 // Entry point of the binning stage.
 //
 // Phases, separated by workgroup barriers:
 //   1. Each invocation loads one annotated element of this workgroup's
 //      partition and computes its bounding box in bin coordinates.
 //   2. The fill right edge is propagated backward from fill elements to the
 //      preceding fill line segments (with a look-forward into the state
 //      buffer when the partition ends in a fill line).
 //   3. Each invocation sets its bit in the bitmap of every bin it covers.
 //   4. Each invocation, now acting for one bin, counts the covering elements
 //      and allocates output space for them.
 //   5. Each invocation, acting for its element again, writes a BinInstance
 //      into every bin it covers.
 void main() {
    uint my_n_elements = n_elements;
    uint my_partition = gl_WorkGroupID.x;
    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
    barrier();
    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
    // Invocations past the end of the input behave as Nop elements.
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(ref);
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    // Infinity means "right edge not known yet"; see the propagation loop below.
    float my_right_edge = INFINITY;
    bool crosses_edge = false;
    switch (tag) {
    case Annotated_FillLine:
    case Annotated_StrokeLine:
        AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
        x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
        y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
        x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
        y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
        // A fill line whose endpoints fall in different tile rows crosses a
        // tile's horizontal edge; its bin range is widened to the right below.
        crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY);
        break;
    case Annotated_Fill:
    case Annotated_Stroke:
        // Note: we take advantage of the fact that fills and strokes
        // have compatible layout.
        AnnoFill fill = Annotated_Fill_read(ref);
        x0 = int(floor(fill.bbox.x * SX));
        y0 = int(floor(fill.bbox.y * SY));
        x1 = int(ceil(fill.bbox.z * SX));
        y1 = int(ceil(fill.bbox.w * SY));
        // It probably makes more sense to track x1, to avoid having to redo
        // the rounding to tile coords.
        my_right_edge = fill.bbox.z;
        break;
    }
    // If the last element in this partition is a fill edge, then we need to do a
    // look-forward to find the right edge of its corresponding fill. That data is
    // recorded in aggregates computed in the element processing pass.
    if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
        uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
        // This is sequential but the expectation is that the amount of
        // look-forward is small (performance may degrade in the case
        // of massively complex paths).
        // NOTE(review): the scan has no upper bound on aggregate_ix; it relies
        // on a finite right_edge eventually appearing in the state buffer.
        do {
            my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
            aggregate_ix++;
        } while (isinf(my_right_edge));
    }
    // Now propagate right_edge backward, from fill to segment.
    // Each round doubles the look-ahead distance (1, 2, 4, ...), so after
    // LG_N_TILE rounds every invocation has seen the nearest finite value
    // at a higher index within the workgroup.
    for (uint i = 0; i < LG_N_TILE; i++) {
        // Note: we could try to cut down on write bandwidth here if the value hasn't
        // changed, but not sure it's worth the complexity to track.
        sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
        barrier();
        if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
            my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
        }
        barrier();
    }
    if (crosses_edge) {
        x1 = int(ceil(my_right_edge * SX));
    }
    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    x0 = clamp(x0, 0, N_TILE_X);
    x1 = clamp(x1, x0, N_TILE_X);
    y0 = clamp(y0, 0, N_TILE_Y);
    y1 = clamp(y1, y0, N_TILE_Y);
    // An empty x range must not enter the loops below, which assume x0 < x1.
    if (x0 == x1) y1 = y0;
    int x = x0, y = y0;
    // Each element owns one bit: bit (local id & 31) of slice (local id / 32).
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
    barrier();
    // Allocate output segments.
    // From here until the next barrier the invocation acts on behalf of bin
    // number gl_LocalInvocationID.x rather than its element.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
    uint chunk_start = 0;
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        chunk_start = atomicAdd(alloc, element_count * BinInstance_size);
        sh_chunk_start[gl_LocalInvocationID.x] = chunk_start;
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    // Header for (partition, bin): word 0 is the element count, word 1 the
    // byte offset of the chunk holding the instances.
    uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    bins[out_ix] = element_count;
    bins[out_ix + 1] = chunk_start;
    barrier();
    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * N_TILE_X + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            // Position within the bin's list: set bits below ours in this
            // slice plus everything counted in the preceding slices.
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
            BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
 }
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
--- a/piet-gpu/shader/bins.h
+++ b/piet-gpu/shader/bins.h
@ -0,0 +1,64 @@
 // Code auto-generated by piet-gpu-derive
 // Typed reference to a BinInstance: a byte offset into the bins buffer.
 struct BinInstanceRef {
    uint offset;
 };
 // Typed reference to a BinChunk: a byte offset into the bins buffer.
 struct BinChunkRef {
    uint offset;
 };
 // One entry of a bin's element list.
 struct BinInstance {
    // Index of the element this entry refers to.
    uint element_ix;
    // Right edge value associated with the element, stored as raw float bits.
    float right_edge;
 };
 // Serialized size of a BinInstance in bytes (two 32-bit words).
 #define BinInstance_size 8
 // Reference to the `index`-th BinInstance of an array starting at `ref`.
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
    uint byte_delta = index * BinInstance_size;
    return BinInstanceRef(ref.offset + byte_delta);
 }
 // Chunk header: an element count plus a reference to a following chunk.
 struct BinChunk {
    uint n;
    BinChunkRef next;
 };
 // Serialized size of a BinChunk in bytes (two 32-bit words).
 #define BinChunk_size 8
 // Reference to the `index`-th BinChunk of an array starting at `ref`.
 BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
    uint byte_delta = index * BinChunk_size;
    return BinChunkRef(ref.offset + byte_delta);
 }
 // Deserialize the BinInstance stored at `ref` in the bins buffer.
 BinInstance BinInstance_read(BinInstanceRef ref) {
    // Byte offset -> word index into bins[].
    uint base = ref.offset >> 2;
    uint element_word = bins[base + 0];
    uint edge_word = bins[base + 1];
    // right_edge is stored as the raw bit pattern of a float.
    return BinInstance(element_word, uintBitsToFloat(edge_word));
 }
 // Serialize `s` into the bins buffer at `ref`.
 void BinInstance_write(BinInstanceRef ref, BinInstance s) {
    // Byte offset -> word index into bins[].
    uint base = ref.offset >> 2;
    uint edge_word = floatBitsToUint(s.right_edge);
    bins[base + 0] = s.element_ix;
    bins[base + 1] = edge_word;
 }
 // Deserialize the BinChunk header stored at `ref` in the bins buffer.
 BinChunk BinChunk_read(BinChunkRef ref) {
    // Byte offset -> word index into bins[].
    uint base = ref.offset >> 2;
    uint count_word = bins[base + 0];
    uint next_word = bins[base + 1];
    return BinChunk(count_word, BinChunkRef(next_word));
 }
 // Serialize the BinChunk header `s` into the bins buffer at `ref`.
 void BinChunk_write(BinChunkRef ref, BinChunk s) {
    // Byte offset -> word index into bins[].
    uint base = ref.offset >> 2;
    uint next_word = s.next.offset;
    bins[base + 0] = s.n;
    bins[base + 1] = next_word;
 }
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -9,12 +9,11 @@ rule glsl
 build image.spv: glsl image.comp | scene.h
 build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
-build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
+build elements.spv: glsl elements.comp | scene.h state.h annotated.h
-build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
+build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
-build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h
+build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
-build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
+build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@ -0,0 +1,526 @@
 // The coarse rasterizer stage of the pipeline.
 //
 // Each workgroup handles one bin: it merges the per-partition element lists
 // produced by binning, computes per-tile coverage, and emits per-tile command
 // lists and segment data into the ptcl buffer.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 #include "setup.h"
 // One invocation per tile of the bin.
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 // Input: annotated elements, accessed through the helpers in annotated.h.
 layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint[] annotated;
 };
 // Input: per-(partition, bin) headers and BinInstance lists written by binning.
 layout(set = 0, binding = 1) buffer BinsBuf {
    uint[] bins;
 };
 layout(set = 0, binding = 2) buffer AllocBuf {
    uint n_elements;
    // Bump-allocator cursor (byte offset into ptcl), advanced with atomicAdd.
    uint alloc;
 };
 // Output: per-tile command lists, segment chunks and segments.
 layout(set = 0, binding = 3) buffer PtclBuf {
    uint[] ptcl;
 };
 #include "annotated.h"
 #include "bins.h"
 #include "ptcl.h"
 // Number of binning partitions whose headers are read in one batch.
 #define LG_N_PART_READ 8
 #define N_PART_READ (1 << LG_N_PART_READ)
 // Current batch of up to N_TILE merged elements and their right edges.
 shared uint sh_elements[N_TILE];
 shared float sh_right_edge[N_TILE];
 // Number of elements in the partition; prefix sum.
 shared uint sh_part_count[N_PART_READ];
 // Byte offset of each partition's BinInstance list for this bin.
 shared uint sh_part_elements[N_PART_READ];
 // Per-tile bitmaps over the current batch: coverage and backdrop contribution.
 shared uint sh_bitmaps[N_SLICE][N_TILE];
 shared uint sh_backdrop[N_SLICE][N_TILE];
 // Per-element bits: backdrop sign (1 is +1, 0 is -1) and "is a line segment".
 shared uint sh_bd_sign[N_SLICE];
 shared uint sh_is_segment[N_SLICE];
 // Shared state for parallel segment output stage
 // Count of total number of segments in each tile, then
 // inclusive prefix sum of same.
 shared uint sh_seg_count[N_TILE];
 // Base byte offset of the segments allocated for the current batch.
 shared uint sh_seg_alloc;
 // scale factors useful for converting coordinates to tiles
 #define SX (1.0 / float(TILE_WIDTH_PX))
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 // Make sure there is room for another command at cmd_ref.
 //
 // When cmd_ref has moved past cmd_limit, a fresh PTCL_INITIAL_ALLOC-byte block
 // is claimed from the shared allocator, a jump to it is written at the old
 // position, and both cmd_ref and cmd_limit are redirected to the new block.
 // Perhaps cmd_limit should be a global? This is a style question.
 void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset <= cmd_limit) {
        // Still inside the current block; nothing to do.
        return;
    }
    uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
    Cmd_Jump_write(cmd_ref, CmdJump(new_cmd));
    cmd_ref = CmdRef(new_cmd);
    // Keep two command slots in reserve at the end of the new block.
    cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
 }
 // Segment chunk headers are carved out of slabs of this many chunks, so the
 // shared allocator is only touched once per slab.
 #define CHUNK_ALLOC_SLAB 16
 // Per-invocation slab state: chunks left in the slab and offset of the next one.
 uint alloc_chunk_remaining;
 uint alloc_chunk_offset;
 // Hand out the next SegChunk slot, claiming a new slab when the current one
 // is used up.
 SegChunkRef alloc_seg_chunk() {
    bool slab_exhausted = alloc_chunk_remaining == 0;
    if (slab_exhausted) {
        alloc_chunk_remaining = CHUNK_ALLOC_SLAB;
        alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size);
    }
    SegChunkRef chunk = SegChunkRef(alloc_chunk_offset);
    alloc_chunk_remaining -= 1;
    alloc_chunk_offset += SegChunk_size;
    return chunk;
 }
 // Compute the backdrop delta encoded by a pair of bitmaps.
 //
 // Only bits set in bd_bitmap take part. A participating bit contributes +1
 // when the matching bit of bd_sign is 1 and -1 when it is 0.
 int count_backdrop(uint bd_bitmap, uint bd_sign) {
    int positive = bitCount(bd_bitmap & bd_sign);
    int negative = bitCount(bd_bitmap & ~bd_sign);
    return positive - negative;
 }
 // Entry point of the coarse rasterizer.
 //
 // One workgroup processes one bin and one invocation owns one tile of it.
 // The outer loop consumes the bin's elements in batches of up to N_TILE:
 //   1. Merge: partition headers are read N_PART_READ at a time, prefix-summed,
 //      and the batch's BinInstances are gathered into shared memory.
 //   2. Coverage: each invocation takes one element of the batch and marks, in
 //      per-tile bitmaps, the tiles it covers and the tiles receiving backdrop.
 //   3. Segment output: line segments are counted per tile, space is allocated
 //      for the whole batch, and segments are written in parallel.
 //   4. Command output: each invocation walks its tile's bitmap sequentially
 //      and emits fill/solid/stroke commands, linking segment chunks.
 void main() {
    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint partition_ix = 0;
    uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
    // Top left coordinates of this bin.
    vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
    uint th_ix = gl_LocalInvocationID.x;
    uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
    uint this_tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
    // Each tile starts with a fixed-size command block at a position derived
    // from its index; alloc_cmd chains further blocks on demand.
    CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    // Allocation and management of segment output
    SegChunkRef first_seg_chunk = SegChunkRef(0);
    SegChunkRef last_chunk_ref = SegChunkRef(0);
    uint last_chunk_n = 0;
    SegmentRef last_chunk_segs = SegmentRef(0);
    alloc_chunk_remaining = 0;
    // I'm sure we can figure out how to do this with at least one fewer register...
    // Items up to rd_ix have been read from sh_elements
    uint rd_ix = 0;
    // Items up to wr_ix have been written into sh_elements
    uint wr_ix = 0;
    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
    uint part_start_ix = 0;
    uint ready_ix = 0;
    if (th_ix < N_SLICE) {
        sh_bd_sign[th_ix] = 0;
    }
    // Backdrop accumulated for this tile since the last fill element.
    int backdrop = 0;
    while (true) {
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
            sh_backdrop[i][th_ix] = 0;
        }
        if (th_ix < N_SLICE) {
            sh_is_segment[th_ix] = 0;
        }
        // parallel read of input partitions
        do {
            if (ready_ix == wr_ix && partition_ix < n_partitions) {
                part_start_ix = ready_ix;
                uint count = 0;
                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
                    // Header layout written by binning: count, then chunk offset.
                    uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                    count = bins[in_ix];
                    sh_part_elements[th_ix] = bins[in_ix + 1];
                }
                // prefix sum of counts
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    if (th_ix < N_PART_READ) {
                        sh_part_count[th_ix] = count;
                    }
                    barrier();
                    if (th_ix < N_PART_READ) {
                        if (th_ix >= (1 << i)) {
                            count += sh_part_count[th_ix - (1 << i)];
                        }
                    }
                    barrier();
                }
                if (th_ix < N_PART_READ) {
                    sh_part_count[th_ix] = part_start_ix + count;
                }
                barrier();
                ready_ix = sh_part_count[N_PART_READ - 1];
                partition_ix += N_PART_READ;
            }
            // use binary search to find element to read
            uint ix = rd_ix + th_ix;
            if (ix >= wr_ix && ix < ready_ix) {
                uint part_ix = 0;
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
                    if (ix >= sh_part_count[probe - 1]) {
                        part_ix = probe;
                    }
                }
                // Convert the global position into an index within the partition's list.
                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
                BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
                BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
                sh_elements[th_ix] = inst.element_ix;
                sh_right_edge[th_ix] = inst.right_edge;
            }
            barrier();
            wr_ix = min(rd_ix + N_TILE, ready_ix);
        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
        // We've done the merge and filled the buffer.
        // Read one element, compute coverage.
        uint tag = Annotated_Nop;
        AnnotatedRef ref;
        float right_edge = 0.0;
        if (th_ix + rd_ix < wr_ix) {
            uint element_ix = sh_elements[th_ix];
            right_edge = sh_right_edge[th_ix];
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);
        }
        // Setup for coverage algorithm.
        // For tile row y the covered span is centred on a + b * y with
        // half-width c, all in tile units relative to this bin.
        float a, b, c;
        // Bounding box of element in pixel coordinates.
        float xmin, xmax, ymin, ymax;
        uint my_slice = th_ix / 32;
        uint my_mask = 1 << (th_ix & 31);
        switch (tag) {
        case Annotated_FillLine:
        case Annotated_StrokeLine:
            AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
            xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
            xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
            ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
            ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
            float dx = line.p1.x - line.p0.x;
            float dy = line.p1.y - line.p0.y;
            if (tag == Annotated_FillLine) {
                // Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
                if (dy < 0) {
                    atomicOr(sh_bd_sign[my_slice], my_mask);
                } else {
                    atomicAnd(sh_bd_sign[my_slice], ~my_mask);
                }
            }
            atomicOr(sh_is_segment[my_slice], my_mask);
            // Set up for per-scanline coverage formula, below.
            float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
            c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
            b = invslope; // Note: assumes square tiles, otherwise scale.
            a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
            break;
        case Annotated_Fill:
        case Annotated_Stroke:
            // Note: we take advantage of the fact that fills and strokes
            // have compatible layout.
            AnnoFill fill = Annotated_Fill_read(ref);
            xmin = fill.bbox.x;
            xmax = fill.bbox.z;
            ymin = fill.bbox.y;
            ymax = fill.bbox.w;
            // Just let the clamping to xmin and xmax determine the bounds.
            a = 0.0;
            b = 0.0;
            c = 1e9;
            break;
        default:
            // No element for this invocation: use an empty but fully defined
            // box so the clipping arithmetic below never reads undefined values.
            xmin = 0;
            xmax = 0;
            ymin = 0;
            ymax = 0;
            a = 0.0;
            b = 0.0;
            c = 0.0;
            break;
        }
        // Draw the coverage area into the bitmasks. This uses an algorithm
        // that computes the coverage of a span for given scanline.
        // Compute bounding box in tiles and clip to this bin.
        int x0 = int(floor((xmin - xy0.x) * SX));
        int x1 = int(ceil((xmax - xy0.x) * SX));
        int xr = int(ceil((right_edge - xy0.x) * SX));
        int y0 = int(floor((ymin - xy0.y) * SY));
        int y1 = int(ceil((ymax - xy0.y) * SY));
        x0 = clamp(x0, 0, N_TILE_X);
        x1 = clamp(x1, x0, N_TILE_X);
        xr = clamp(xr, 0, N_TILE_X);
        y0 = clamp(y0, 0, N_TILE_Y);
        y1 = clamp(y1, y0, N_TILE_Y);
        float t = a + b * float(y0);
        for (uint y = y0; y < y1; y++) {
            uint xx0 = clamp(int(floor(t - c)), x0, x1);
            uint xx1 = clamp(int(ceil(t + c)), x0, x1);
            for (uint x = xx0; x < xx1; x++) {
                atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
            }
            if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
                // Assign backdrop to all tiles to the right of the ray crossing the
                // top edge of this tile, up to the right edge of the fill bbox.
                float xray = t - 0.5 * b;
                xx0 = max(int(ceil(xray)), 0);
                for (uint x = xx0; x < xr; x++) {
                    atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
                }
            }
            t += b;
        }
        barrier();
        // We've computed coverage and other info for each element in the input, now for
        // the output stage. We'll do segments first using a more parallel algorithm.
        uint seg_count = 0;
        for (uint i = 0; i < N_SLICE; i++) {
            seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
        }
        sh_seg_count[th_ix] = seg_count;
        // Prefix sum of sh_seg_count
        for (uint i = 0; i < LG_N_TILE; i++) {
            barrier();
            if (th_ix >= (1 << i)) {
                seg_count += sh_seg_count[th_ix - (1 << i)];
            }
            barrier();
            sh_seg_count[th_ix] = seg_count;
        }
        // The last invocation holds the batch total and allocates for everyone.
        if (th_ix == N_TILE - 1) {
            sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size);
        }
        barrier();
        uint total_seg_count = sh_seg_count[N_TILE - 1];
        uint seg_alloc = sh_seg_alloc;
        // Output buffer is allocated as segments for each tile laid end-to-end.
        for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) {
            // Find the work item; this thread is now not bound to an element or tile.
            // First find the tile (by binary search)
            uint tile_ix = 0;
            for (uint i = 0; i < LG_N_TILE; i++) {
                uint probe = tile_ix + ((N_TILE / 2) >> i);
                if (ix >= sh_seg_count[probe - 1]) {
                    tile_ix = probe;
                }
            }
            // Now, sh_seg_count[tile_ix - 1] <= ix < sh_seg_count[tile_ix].
            // (considering sh_seg_count[-1] == 0)
            // Index of segment within tile's segments
            uint seq_ix = ix;
            // Maybe consider a sentinel value to avoid the conditional?
            if (tile_ix > 0) {
                seq_ix -= sh_seg_count[tile_ix - 1];
            }
            // Find the segment. This is done by linear scan through the bitmaps of the
            // tile, accelerated by bit counting. Binary search might help, maybe not.
            uint slice_ix = 0;
            uint seq_bits;
            while (true) {
                seq_bits = sh_bitmaps[slice_ix][tile_ix] & sh_is_segment[slice_ix];
                uint this_count = bitCount(seq_bits);
                if (this_count > seq_ix) {
                    break;
                }
                seq_ix -= this_count;
                slice_ix++;
            }
            // Now find position of nth bit set (n = seq_ix) in seq_bits; binary search
            uint bit_ix = 0;
            for (int i = 0; i < 5; i++) {
                uint probe = bit_ix + (16 >> i);
                if (seq_ix >= bitCount(seq_bits & ((1 << probe) - 1))) {
                    bit_ix = probe;
                }
            }
            uint rd_el_ix = slice_ix * 32 + bit_ix;
            uint element_ix = sh_elements[rd_el_ix];
            ref = AnnotatedRef(element_ix * Annotated_size);
            AnnoFillLineSeg line = Annotated_FillLine_read(ref);
            float y_edge = 0.0;
            // This is basically the same logic as piet-metal, but should be made numerically robust.
            if (Annotated_tag(ref) == Annotated_FillLine) {
                vec2 tile_xy = xy0 + vec2((tile_ix % N_TILE_X) * TILE_WIDTH_PX, (tile_ix / N_TILE_X) * TILE_HEIGHT_PX);
                // y at which the line meets the tile's left edge.
                y_edge = mix(line.p0.y, line.p1.y, (tile_xy.x - line.p0.x) / (line.p1.x - line.p0.x));
                if (min(line.p0.x, line.p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) {
                    // Clip the segment to start (or end) on the left edge.
                    if (line.p0.x > line.p1.x) {
                        line.p1 = vec2(tile_xy.x, y_edge);
                    } else {
                        line.p0 = vec2(tile_xy.x, y_edge);
                    }
                } else {
                    // Sentinel: the segment does not cross this tile's left edge.
                    y_edge = 1e9;
                }
            }
            Segment seg = Segment(line.p0, line.p1, y_edge);
            Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
        }
        // Output non-segment elements for this tile. The thread does a sequential walk
        // through the non-segment elements, and for segments, count and backdrop are
        // aggregated using bit counting.
        uint slice_ix = 0;
        uint bitmap = sh_bitmaps[0][th_ix];
        uint bd_bitmap = sh_backdrop[0][th_ix];
        uint bd_sign = sh_bd_sign[0];
        uint is_segment = sh_is_segment[0];
        // First segment of this tile within the batch's segment allocation.
        uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
        seg_count = 0;
        while (true) {
            uint nonseg_bitmap = bitmap & ~is_segment;
            if (nonseg_bitmap == 0) {
                // Only segments remain in this slice: fold them in and advance.
                backdrop += count_backdrop(bd_bitmap, bd_sign);
                seg_count += bitCount(bitmap & is_segment);
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                bd_bitmap = sh_backdrop[slice_ix][th_ix];
                bd_sign = sh_bd_sign[slice_ix];
                is_segment = sh_is_segment[slice_ix];
                nonseg_bitmap = bitmap & ~is_segment;
                if (nonseg_bitmap == 0) {
                    continue;
                }
            }
            uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
            uint element_ix = sh_elements[element_ref_ix];
            // Bits up to and including the lsb
            uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
            backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
            seg_count += bitCount(bitmap & bd_mask & is_segment);
            // Clear bits that have been consumed.
            bd_bitmap &= ~bd_mask;
            bitmap &= ~bd_mask;
            // At this point, we read the element again from global memory.
            // If that turns out to be expensive, maybe we can pack it into
            // shared memory (or perhaps just the tag).
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);
            switch (tag) {
            case Annotated_Fill:
                if (last_chunk_n > 0 || seg_count > 0) {
                    SegChunkRef chunk_ref = SegChunkRef(0);
                    if (seg_count > 0) {
                        chunk_ref = alloc_seg_chunk();
                        SegChunk chunk;
                        chunk.n = seg_count;
                        chunk.next = SegChunkRef(0);
                        uint seg_offset = seg_alloc + seg_start * Segment_size;
                        chunk.segs = SegmentRef(seg_offset);
                        SegChunk_write(chunk_ref, chunk);
                    }
                    // Flush the chunk deferred from an earlier batch, now that
                    // its successor (possibly none) is known.
                    if (last_chunk_n > 0) {
                        SegChunk chunk;
                        chunk.n = last_chunk_n;
                        chunk.next = chunk_ref;
                        chunk.segs = last_chunk_segs;
                        SegChunk_write(last_chunk_ref, chunk);
                    } else {
                        first_seg_chunk = chunk_ref;
                    }
                    AnnoFill fill = Annotated_Fill_read(ref);
                    CmdFill cmd_fill;
                    cmd_fill.seg_ref = first_seg_chunk;
                    cmd_fill.backdrop = backdrop;
                    cmd_fill.rgba_color = fill.rgba_color;
                    alloc_cmd(cmd_ref, cmd_limit);
                    Cmd_Fill_write(cmd_ref, cmd_fill);
                    cmd_ref.offset += Cmd_size;
                    last_chunk_n = 0;
                } else if (backdrop != 0) {
                    // No segments touch this tile but the backdrop is non-zero:
                    // the tile is covered with a solid colour.
                    AnnoFill fill = Annotated_Fill_read(ref);
                    alloc_cmd(cmd_ref, cmd_limit);
                    Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
                    cmd_ref.offset += Cmd_size;
                }
                seg_start += seg_count;
                seg_count = 0;
                backdrop = 0;
                break;
            case Annotated_Stroke:
                // TODO: reduce divergence & code duplication? Much of the
                // fill and stroke processing is in common.
                if (last_chunk_n > 0 || seg_count > 0) {
                    SegChunkRef chunk_ref = SegChunkRef(0);
                    if (seg_count > 0) {
                        chunk_ref = alloc_seg_chunk();
                        SegChunk chunk;
                        chunk.n = seg_count;
                        chunk.next = SegChunkRef(0);
                        uint seg_offset = seg_alloc + seg_start * Segment_size;
                        chunk.segs = SegmentRef(seg_offset);
                        SegChunk_write(chunk_ref, chunk);
                    }
                    if (last_chunk_n > 0) {
                        SegChunk chunk;
                        chunk.n = last_chunk_n;
                        chunk.next = chunk_ref;
                        chunk.segs = last_chunk_segs;
                        SegChunk_write(last_chunk_ref, chunk);
                    } else {
                        first_seg_chunk = chunk_ref;
                    }
                    AnnoStroke stroke = Annotated_Stroke_read(ref);
                    CmdStroke cmd_stroke;
                    cmd_stroke.seg_ref = first_seg_chunk;
                    cmd_stroke.half_width = 0.5 * stroke.linewidth;
                    cmd_stroke.rgba_color = stroke.rgba_color;
                    alloc_cmd(cmd_ref, cmd_limit);
                    Cmd_Stroke_write(cmd_ref, cmd_stroke);
                    cmd_ref.offset += Cmd_size;
                    last_chunk_n = 0;
                }
                seg_start += seg_count;
                seg_count = 0;
                break;
            default:
                // This shouldn't happen, but just in case.
                seg_start++;
                break;
            }
        }
        // Segments left over at the end of the batch belong to a path whose
        // fill/stroke element has not been seen yet; remember them as a
        // pending chunk to be written once its successor is known.
        if (seg_count > 0) {
            SegChunkRef chunk_ref = alloc_seg_chunk();
            if (last_chunk_n > 0) {
                SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
            } else {
                first_seg_chunk = chunk_ref;
            }
            // TODO: free two registers by writing count and segments ref now,
            // as opposed to deferring SegChunk write until all fields are known.
            last_chunk_ref = chunk_ref;
            last_chunk_n = seg_count;
            uint seg_offset = seg_alloc + seg_start * Segment_size;
            last_chunk_segs = SegmentRef(seg_offset);
        }
        barrier();
        rd_ix += N_TILE;
        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
    }
    Cmd_End_write(cmd_ref);
 }
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@ -0,0 +1,328 @@
 // The element processing stage, first in the pipeline.
 //
 // This stage is primarily about applying transforms and computing bounding
 // boxes. It is organized as a scan over the input elements, producing
 // annotated output elements.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 // Number of consecutive elements each thread processes sequentially.
 #define N_ROWS 4
 // Threads per workgroup (used as local_size_x below).
 #define WG_SIZE 32
 // Number of steps in the workgroup scan; must equal log2(WG_SIZE).
 #define LG_WG_SIZE 5
 // Elements handled by one workgroup, i.e. one partition of the scan.
 #define PARTITION_SIZE (WG_SIZE * N_ROWS)
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 // Input scene elements (presumably read through the accessors in scene.h).
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 // Scan bookkeeping shared between workgroups: word 0 is the atomic partition
 // counter, followed by one record per partition (see StateBuf_stride).
 //
 // It would be better to use the Vulkan memory model than
 // "volatile" but shooting for compatibility here rather
 // than doing things right.
 layout(set = 0, binding = 1) volatile buffer StateBuf {
    uint[] state;
 };
 // The annotated results are stored here.
 layout(set = 0, binding = 2) buffer AnnotatedBuf {
    uint[] annotated;
 };
 #include "scene.h"
 #include "state.h"
 #include "annotated.h"
 // Size in bytes of one per-partition record in the state buffer: a flag word
 // and one extra word (8 bytes), then the aggregate State and the prefix State.
 #define StateBuf_stride (8 + 2 * State_size)
 // Reference to the aggregate State slot of partition `partition_ix`. Slots
 // begin 12 bytes into the state buffer and are StateBuf_stride bytes apart.
 StateRef state_aggregate_ref(uint partition_ix) {
    uint record_start = partition_ix * StateBuf_stride;
    return StateRef(record_start + 12);
 }
 // Reference to the prefix State slot of partition `partition_ix`, which sits
 // State_size bytes after that partition's aggregate slot.
 StateRef state_prefix_ref(uint partition_ix) {
    uint aggregate_offset = 12 + partition_ix * StateBuf_stride;
    return StateRef(aggregate_offset + State_size);
 }
 // Index (in uints, not bytes) of the flag word of partition `partition_ix`
 // within the `state` array. Word 0 is skipped; it holds the partition counter.
 uint state_flag_index(uint partition_ix) {
    uint words_per_partition = StateBuf_stride / 4;
    return partition_ix * words_per_partition + 1;
 }
 // Per-partition status values stored in the flag words of the state buffer.
 // These correspond to X, A, P respectively in the prefix sum paper.
 // NOTE(review): FLAG_NOT_READY is never written by this shader; presumably
 // the state buffer is cleared before dispatch - confirm on the host side.
 #define FLAG_NOT_READY 0
 #define FLAG_AGGREGATE_READY 1
 #define FLAG_PREFIX_READY 2
 // Bits of State.flags. These are unrelated to the status values above even
 // though they share the FLAG_ prefix and overlap numerically.
 // combine_state relies on FLAG_RESET_BBOX >> 1 == FLAG_SET_BBOX.
 #define FLAG_SET_LINEWIDTH 1
 #define FLAG_SET_BBOX 2
 #define FLAG_RESET_BBOX 4
 // Combines two scan states, where `a` covers the elements that precede those
 // covered by `b`. The result carries a's transform composed with b's, b's
 // bounding box mapped through a's transform (merged with, or replaced by,
 // a's box depending on the flags), and the most recently set line width.
 //
 // This is almost like a monoid (the interaction between transformation and
 // bounding boxes is approximate)
 State combine_state(State a, State b) {
    State c;
    // Map b's bounding box through a's affine transform. Taking the min/max of
    // each product separately gives an axis-aligned box around the result.
    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        // b's box is empty and a does not end in a reset: keep a's box as is.
        c.bbox = a.bbox;
    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
    {
        // Neither side restarts the box and a's box is non-empty: take the
        // union of a's box and the transformed b box.
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
    // Otherwise the transformed b box computed above stands on its own.

    // 2x2 matrix product a.mat * b.mat, with (x, y) and (z, w) as the columns.
    // It would be more concise to cast to matrix types; ah well.
    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
    // b's translation mapped through a's transform.
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    // The later state wins if it set a line width; otherwise inherit a's.
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
    // Keep a's SET_* bits and all of b's bits. a's RESET_BBOX bit is not
    // propagated as such; the shift turns it into SET_BBOX on the result.
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
    return c;
 }
 // Maps one scene element to the State it contributes to the scan.
 //
 // Path segments contribute the bounding box of their control points, fill and
 // stroke markers request a bounding-box reset, and line-width / transform
 // elements contribute their payload. Any other tag yields the default state
 // (identity transform, zero bbox, no flags). `is_fill` is always overwritten
 // and ends up true only for Element_Fill.
 State map_element(ElementRef ref, inout bool is_fill) {
    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref);
    State result;
    result.bbox = vec4(0.0);
    result.mat = vec4(1.0, 0.0, 0.0, 1.0);
    result.translate = vec2(0.0);
    result.linewidth = 1.0; // TODO should be 0.0
    result.flags = 0;
    is_fill = (tag == Element_Fill);
    if (tag == Element_FillLine || tag == Element_StrokeLine) {
        // Both line variants are decoded with the FillLine reader.
        LineSeg line = Element_FillLine_read(ref);
        result.bbox = vec4(min(line.p0, line.p1), max(line.p0, line.p1));
    } else if (tag == Element_Quad) {
        QuadSeg quad = Element_Quad_read(ref);
        vec2 lo = min(min(quad.p0, quad.p1), quad.p2);
        vec2 hi = max(max(quad.p0, quad.p1), quad.p2);
        result.bbox = vec4(lo, hi);
    } else if (tag == Element_Cubic) {
        CubicSeg cubic = Element_Cubic_read(ref);
        vec2 lo = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
        vec2 hi = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
        result.bbox = vec4(lo, hi);
    } else if (tag == Element_Fill || tag == Element_Stroke) {
        // End of a path: the bounding box restarts after this element.
        result.flags = FLAG_RESET_BBOX;
    } else if (tag == Element_SetLineWidth) {
        SetLineWidth lw = Element_SetLineWidth_read(ref);
        result.linewidth = lw.width;
        result.flags = FLAG_SET_LINEWIDTH;
    } else if (tag == Element_Transform) {
        Transform t = Element_Transform_read(ref);
        result.mat = t.mat;
        result.translate = t.translate;
    }
    return result;
 }
 // Half-extents, per axis, of the bounding box of a circle of diameter
 // st.linewidth after it is mapped through st.mat into an ellipse.
 vec2 get_linewidth(State st) {
    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
    float half_width = 0.5 * st.linewidth;
    vec2 axis_scale = vec2(length(st.mat.xz), length(st.mat.yw));
    return half_width * axis_scale;
 }
 // Workgroup-shared copies of each thread's State, stored as one array per
 // field and indexed by gl_LocalInvocationID.x; main() uses them for the scan
 // across the workgroup.
 //
 // We should be able to use an array of structs but the NV shader compiler
 // doesn't seem to like it :/
 //shared State sh_state[WG_SIZE];
 shared vec4 sh_mat[WG_SIZE];
 shared vec2 sh_translate[WG_SIZE];
 shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 // Smallest element index holding a fill in this partition (~0 when there is
 // none); reduced across threads with atomicMin.
 shared uint sh_min_fill;
 // Partition index this workgroup claimed from the counter in state[0].
 shared uint sh_tile_ix;
 // Exclusive prefix of this partition, published by the thread that performs
 // the look-back so that every thread can read it.
 shared State sh_prefix;
 // Entry point of the element stage. Each workgroup claims one partition of
 // PARTITION_SIZE elements, scans their States (sequentially per thread, then
 // across the workgroup in shared memory, then across partitions by look-back
 // through the state buffer), and writes one annotated element per input
 // element.
 void main() {
    State th_state[N_ROWS];
    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_tile_ix = atomicAdd(state[0], 1);
        sh_min_fill = ~0;
    }
    barrier();
    uint tile_ix = sh_tile_ix;
    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);
    bool is_fill;
    uint my_min_fill = ~0;
    // Inclusive scan over this thread's N_ROWS elements, remembering the index
    // of the first fill element it encounters.
    th_state[0] = map_element(ref, is_fill);
    if (is_fill) my_min_fill = ix;
    for (uint i = 1; i < N_ROWS; i++) {
        // discussion question: would it be faster to load using more coherent patterns
        // into thread memory? This is kinda strided.
        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
        if (is_fill && my_min_fill == ~0) {
            my_min_fill = ix + i;
        }
    }
    atomicMin(sh_min_fill, my_min_fill);
    // Inclusive scan of the per-thread aggregates across the workgroup: at
    // step i each thread combines with the value (1 << i) slots to its left.
    State agg = th_state[N_ROWS - 1];
    sh_mat[gl_LocalInvocationID.x] = agg.mat;
    sh_translate[gl_LocalInvocationID.x] = agg.translate;
    sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
    sh_width[gl_LocalInvocationID.x] = agg.linewidth;
    sh_flags[gl_LocalInvocationID.x] = agg.flags;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1 << i)) {
            State other;
            uint ix = gl_LocalInvocationID.x - (1 << i);
            other.mat = sh_mat[ix];
            other.translate = sh_translate[ix];
            other.bbox = sh_bbox[ix];
            other.linewidth = sh_width[ix];
            other.flags = sh_flags[ix];
            agg = combine_state(other, agg);
        }
        // All reads of this step must finish before anyone overwrites a slot.
        barrier();
        sh_mat[gl_LocalInvocationID.x] = agg.mat;
        sh_translate[gl_LocalInvocationID.x] = agg.translate;
        sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
        sh_width[gl_LocalInvocationID.x] = agg.linewidth;
        sh_flags[gl_LocalInvocationID.x] = agg.flags;
    }
    // Exclusive prefix of the whole partition; stays at this default state for
    // partition 0.
    State exclusive;
    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; //TODO should be 0.0
    exclusive.flags = 0;
    // Publish aggregate for this partition. Only the last thread does this,
    // since after the scan its `agg` covers the entire partition.
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        // Note: with memory model, we'd want to generate the atomic store version of this.
        State_write(state_aggregate_ref(tile_ix), agg);
        uint flag = FLAG_AGGREGATE_READY;
        memoryBarrierBuffer();
        if (tile_ix == 0) {
            // Partition 0 has no predecessors: its aggregate is its prefix.
            State_write(state_prefix_ref(tile_ix), agg);
            flag = FLAG_PREFIX_READY;
        }
        state[state_flag_index(tile_ix)] = flag;
        if (tile_ix != 0) {
            // step 4 of paper: decoupled lookback
            uint look_back_ix = tile_ix - 1;
            while (true) {
                flag = state[state_flag_index(look_back_ix)];
                if (flag == FLAG_PREFIX_READY) {
                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
                    exclusive = combine_state(their_prefix, exclusive);
                    break;
                } else if (flag == FLAG_AGGREGATE_READY) {
                    State their_agg = State_read(state_aggregate_ref(look_back_ix));
                    exclusive = combine_state(their_agg, exclusive);
                    look_back_ix--;
                }
                // else spin
            }
            // step 5 of paper: compute inclusive prefix
            State inclusive_prefix = combine_state(exclusive, agg);
            sh_prefix = exclusive;
            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
            memoryBarrierBuffer();
            flag = FLAG_PREFIX_READY;
            state[state_flag_index(tile_ix)] = flag;
        }
    }
    barrier();
    my_min_fill = sh_min_fill;
    if (tile_ix != 0) {
        exclusive = sh_prefix;
    }
    // `row` is the exclusive prefix for this thread: the partition's exclusive
    // prefix combined with the inclusive scan value of the thread to the left.
    State row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
        uint ix = gl_LocalInvocationID.x - 1;
        State other;
        other.mat = sh_mat[ix];
        other.translate = sh_translate[ix];
        other.bbox = sh_bbox[ix];
        other.linewidth = sh_width[ix];
        other.flags = sh_flags[ix];
        row = combine_state(row, other);
    }
    // The word after the partition's flag receives the bit pattern of bbox.z
    // at the partition's first fill element, or +infinity if it has no fill.
    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
    }
    for (uint i = 0; i < N_ROWS; i++) {
        State st = combine_state(row, th_state[i]);
        if (my_min_fill == ix + i) {
            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
        }
        // We write the state now for development purposes, but the
        // actual goal is to write transformed and annotated elements.
        //State_write(StateRef((ix + i) * State_size), st);
        // Here we read again from the original scene. There may be
        // gains to be had from stashing in shared memory or possibly
        // registers (though register pressure is an issue).
        ElementRef this_ref = Element_index(ref, i);
        AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size);
        uint tag = Element_tag(this_ref);
        switch (tag) {
        case Element_FillLine:
        case Element_StrokeLine:
            LineSeg line = Element_StrokeLine_read(this_ref);
            AnnoStrokeLineSeg anno_line;
            anno_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
            anno_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
            if (tag == Element_StrokeLine) {
                anno_line.stroke = get_linewidth(st);
            } else {
                anno_line.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine;
            annotated[out_ref.offset >> 2] = out_tag;
            AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line);
            break;
        case Element_Stroke:
            Stroke stroke = Element_Stroke_read(this_ref);
            AnnoStroke anno_stroke;
            anno_stroke.rgba_color = stroke.rgba_color;
            vec2 lw = get_linewidth(st);
            anno_stroke.bbox = st.bbox + vec4(-lw, lw);
            // Scale the line width by the square root of the area scale factor.
            // abs() is required: the determinant is negative for transforms
            // that include a reflection, and sqrt() of a negative value is
            // undefined in GLSL.
            anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            Annotated_Stroke_write(out_ref, anno_stroke);
            break;
        case Element_Fill:
            Fill fill = Element_Fill_read(this_ref);
            AnnoFill anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            anno_fill.bbox = st.bbox;
            Annotated_Fill_write(out_ref, anno_fill);
            break;
        default:
            Annotated_Nop_write(out_ref);
            break;
        }
    }
 }
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
--- a/piet-gpu/shader/fill_seg.h
+++ b/piet-gpu/shader/fill_seg.h
@ -1,130 +0,0 @@
 // Code auto-generated by piet-gpu-derive
 // Typed references into the fill_seg buffer. `offset` is a byte offset; the
 // read/write helpers below convert it to a uint index with `>> 2`.
 struct FillTileHeaderRef {
    uint offset;
 };
 struct FillItemHeaderRef {
    uint offset;
 };
 struct FillSegmentRef {
    uint offset;
 };
 struct FillSegChunkRef {
    uint offset;
 };
 // Header of one tile: an item count and a reference to its item headers.
 struct FillTileHeader {
    uint n;
    FillItemHeaderRef items;
 };
 // Encoded size of a FillTileHeader in bytes.
 #define FillTileHeader_size 8
 // Reference to the `index`-th FillTileHeader of an array starting at `ref`.
 FillTileHeaderRef FillTileHeader_index(FillTileHeaderRef ref, uint index) {
    uint byte_delta = index * FillTileHeader_size;
    return FillTileHeaderRef(ref.offset + byte_delta);
 }
 // Header of one fill item within a tile: a signed backdrop value and a
 // reference to the item's first segment chunk.
 struct FillItemHeader {
    int backdrop;
    FillSegChunkRef segments;
 };
 // Encoded size of a FillItemHeader in bytes.
 #define FillItemHeader_size 8
 // Reference to the `index`-th FillItemHeader of an array starting at `ref`.
 FillItemHeaderRef FillItemHeader_index(FillItemHeaderRef ref, uint index) {
    uint byte_delta = index * FillItemHeader_size;
    return FillItemHeaderRef(ref.offset + byte_delta);
 }
 // One line segment, stored as its two end points.
 struct FillSegment {
    vec2 start;
    vec2 end;
 };
 // Encoded size of a FillSegment in bytes.
 #define FillSegment_size 16
 // Reference to the `index`-th FillSegment of an array starting at `ref`.
 FillSegmentRef FillSegment_index(FillSegmentRef ref, uint index) {
    uint byte_delta = index * FillSegment_size;
    return FillSegmentRef(ref.offset + byte_delta);
 }
 // Header of a chunk of segments: the number of segments in the chunk and a
 // reference to the next chunk of the same item.
 struct FillSegChunk {
    uint n;
    FillSegChunkRef next;
 };
 // Encoded size of a FillSegChunk header in bytes.
 #define FillSegChunk_size 8
 // Reference to the `index`-th FillSegChunk of an array starting at `ref`.
 FillSegChunkRef FillSegChunk_index(FillSegChunkRef ref, uint index) {
    uint byte_delta = index * FillSegChunk_size;
    return FillSegChunkRef(ref.offset + byte_delta);
 }
 // Decodes the FillTileHeader stored at `ref` (two consecutive words).
 FillTileHeader FillTileHeader_read(FillTileHeaderRef ref) {
    uint base = ref.offset >> 2;
    uint count_word = fill_seg[base];
    uint items_word = fill_seg[base + 1];
    return FillTileHeader(count_word, FillItemHeaderRef(items_word));
 }
 // Encodes `s` at `ref`: the count word followed by the items offset.
 void FillTileHeader_write(FillTileHeaderRef ref, FillTileHeader s) {
    uint base = ref.offset >> 2;
    fill_seg[base] = s.n;
    fill_seg[base + 1] = s.items.offset;
 }
 // Decodes the FillItemHeader stored at `ref`; the first word is
 // reinterpreted as a signed backdrop.
 FillItemHeader FillItemHeader_read(FillItemHeaderRef ref) {
    uint base = ref.offset >> 2;
    uint backdrop_word = fill_seg[base];
    uint segments_word = fill_seg[base + 1];
    return FillItemHeader(int(backdrop_word), FillSegChunkRef(segments_word));
 }
 // Encodes `s` at `ref`: the backdrop (as uint) followed by the segments offset.
 void FillItemHeader_write(FillItemHeaderRef ref, FillItemHeader s) {
    uint base = ref.offset >> 2;
    fill_seg[base] = uint(s.backdrop);
    fill_seg[base + 1] = s.segments.offset;
 }
 // Decodes the FillSegment stored at `ref`: four words holding the float bit
 // patterns of start.x, start.y, end.x, end.y.
 FillSegment FillSegment_read(FillSegmentRef ref) {
    uint base = ref.offset >> 2;
    uvec2 start_bits = uvec2(fill_seg[base], fill_seg[base + 1]);
    uvec2 end_bits = uvec2(fill_seg[base + 2], fill_seg[base + 3]);
    return FillSegment(uintBitsToFloat(start_bits), uintBitsToFloat(end_bits));
 }
 // Encodes `s` at `ref` as the float bit patterns of start.x, start.y,
 // end.x, end.y, in that order.
 void FillSegment_write(FillSegmentRef ref, FillSegment s) {
    uint base = ref.offset >> 2;
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    fill_seg[base] = start_bits.x;
    fill_seg[base + 1] = start_bits.y;
    fill_seg[base + 2] = end_bits.x;
    fill_seg[base + 3] = end_bits.y;
 }
 // Decodes the FillSegChunk header stored at `ref` (two consecutive words).
 FillSegChunk FillSegChunk_read(FillSegChunkRef ref) {
    uint base = ref.offset >> 2;
    uint count_word = fill_seg[base];
    uint next_word = fill_seg[base + 1];
    return FillSegChunk(count_word, FillSegChunkRef(next_word));
 }
 // Encodes `s` at `ref`: the segment count followed by the next-chunk offset.
 void FillSegChunk_write(FillSegChunkRef ref, FillSegChunk s) {
    uint base = ref.offset >> 2;
    fill_seg[base] = s.n;
    fill_seg[base + 1] = s.next.offset;
 }
--- a/piet-gpu/shader/kernel1.comp
+++ b/piet-gpu/shader/kernel1.comp
@ -1,161 +0,0 @@
 // This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
 // and outputs "instances" (references to item + translation) for each item
 // that intersects the tilegroup.
 //
 // This implementation is simplistic and leaves a lot of performance on the
 // table. A fancier implementation would use threadgroup shared memory or
 // subgroups (or possibly both) to parallelize the reading of the input and
 // the computation of tilegroup intersection.
 //
 // In addition, there are some features currently missing, such as support
 // for clipping.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 // It's possible we should lay this out with x and do our own math.
 layout(local_size_x = 1, local_size_y = 32) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 layout(set = 0, binding = 2) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "setup.h"
 // Capacity of the explicit traversal stack declared in main().
 #define MAX_STACK 8
 // A suspended position in the group traversal.
 struct StackElement {
    // The group item being traversed.
    PietItemRef group;
    // Index of the next child of `group` to visit.
    uint index;
    // Translation accumulated from the root down to `group`.
    vec2 offset;
 };
 // Entry point: one invocation per tilegroup. Walks the scene's group
 // hierarchy with an explicit stack and, for every non-group item whose
 // bounding box overlaps the tilegroup, appends an Instance to the tilegroup's
 // instance list and, depending on the item tag, to its stroke or fill list.
 void main() {
    StackElement stack[MAX_STACK];
    uint stack_ix = 0;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
    // Leave room for two more records (the next write plus a Jump or End).
    uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
    // State for stroke references.
    // The word at stroke_start receives the total count; chunks follow it.
    TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
    ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
    InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
    uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size;
    uint stroke_chunk_n = 0;
    uint stroke_n = 0;
    // State for fill references. All this is a bit cut'n'paste, but making a
    // proper abstraction isn't easy.
    TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START);
    ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4);
    InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size);
    uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size;
    uint fill_chunk_n = 0;
    uint fill_n = 0;
    // Top-left corner of this tilegroup in pixels.
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
    PietItemRef root = PietItemRef(0);
    SimpleGroup group = PietItem_Group_read(root);
    StackElement tos = StackElement(root, 0, group.offset.xy);
    while (true) {
        if (tos.index < group.n_items) {
            // Test the child's translated bounding box against the tilegroup.
            Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
            vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
            bool is_group = false;
            // Only assigned (and only read) when `hit` is true.
            uint tag;
            if (hit) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                tag = PietItem_tag(item_ref);
                is_group = tag == PietItem_Group;
            }
            if (hit && !is_group) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                Instance ins = Instance(item_ref.offset, tos.offset);
                if (tg_ref.offset > tg_limit) {
                    // Allocation exceeded; do atomic bump alloc.
                    uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
                    Jump jump = Jump(TileGroupRef(new_tg));
                    TileGroup_Jump_write(tg_ref, jump);
                    tg_ref = TileGroupRef(new_tg);
                    tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
                }
                TileGroup_Instance_write(tg_ref, ins);
                tg_ref.offset += TileGroup_size;
                if (tag == PietItem_Poly) {
                    // Poly items go to the stroke list.
                    if (stroke_ref.offset > stroke_limit) {
                        // Current chunk is full: close it with a link to a
                        // freshly allocated chunk and continue there.
                        uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
                        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
                        stroke_chunk_start = ChunkRef(new_stroke);
                        stroke_ref = InstanceRef(new_stroke + Chunk_size);
                        stroke_n += stroke_chunk_n;
                        stroke_chunk_n = 0;
                        stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
                    }
                    Instance_write(stroke_ref, ins);
                    stroke_chunk_n++;
                    stroke_ref.offset += Instance_size;
                } else if (tag == PietItem_Fill) {
                    if (fill_ref.offset > fill_limit) {
                        // Same chunk hand-over as for strokes above.
                        uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC);
                        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill)));
                        fill_chunk_start = ChunkRef(new_fill);
                        fill_ref = InstanceRef(new_fill + Chunk_size);
                        fill_n += fill_chunk_n;
                        fill_chunk_n = 0;
                        fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size;
                    }
                    Instance_write(fill_ref, ins);
                    fill_chunk_n++;
                    fill_ref.offset += Instance_size;
                }
            }
            if (is_group) {
                // Descend into the child group. The parent is pushed only if
                // it still has children left to visit.
                // NOTE(review): the push is not checked against MAX_STACK, so
                // deeper nesting would write past the end of `stack` - confirm
                // the scene encoder bounds the nesting depth.
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                tos.index++;
                if (tos.index < group.n_items) {
                    stack[stack_ix++] = tos;
                }
                group = PietItem_Group_read(item_ref);
                tos = StackElement(item_ref, 0, tos.offset + group.offset.xy);
            } else {
                tos.index++;
            }
        } else {
            // processed all items in this group; pop the stack
            if (stack_ix == 0) {
                break;
            }
            tos = stack[--stack_ix];
            group = PietItem_Group_read(tos.group);
        }
    }
    TileGroup_End_write(tg_ref);
    // Close the last stroke chunk (next = 0 terminates the chain) and store
    // the total stroke count in the word at stroke_start.
    stroke_n += stroke_chunk_n;
    if (stroke_n > 0) {
        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
    }
    tilegroup[stroke_start.offset >> 2] = stroke_n;
    // Likewise for fills.
    fill_n += fill_chunk_n;
    if (fill_n > 0) {
        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0)));
    }
    tilegroup[fill_start.offset >> 2] = fill_n;
 }
--- a/piet-gpu/shader/kernel1.spv
+++ b/piet-gpu/shader/kernel1.spv
--- a/piet-gpu/shader/kernel2f.comp
+++ b/piet-gpu/shader/kernel2f.comp
@ -1,167 +0,0 @@
 // This is "kernel 2" (fill) in a 4-kernel pipeline. It processes the fill
 // (polyline) items in the scene and generates a list of segments for each, for
 // each tile.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 layout(local_size_x = 32) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 layout(set = 0, binding = 2) buffer FillSegBuf {
    uint[] fill_seg;
 };
 layout(set = 0, binding = 3) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "fill_seg.h"
 #include "setup.h"
 // Ensure that there is space to encode a segment.
 void alloc_chunk(inout uint chunk_n_segs, inout FillSegChunkRef seg_chunk_ref,
    inout FillSegChunkRef first_seg_chunk, inout uint seg_limit)
 {
    if (chunk_n_segs == 0) {
        if (seg_chunk_ref.offset + 40 > seg_limit) {
            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - FillSegment_size;
        }
        first_seg_chunk = seg_chunk_ref;
    } else if (seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs > seg_limit) {
        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - FillSegment_size;
        FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(new_chunk_ref)));
        seg_chunk_ref.offset = new_chunk_ref;
        chunk_n_segs = 0;
    }
 }
 // Entry point: one invocation per tile. For every fill item referenced by the
 // tile's tilegroup, clips the item's polyline against the tile, writes the
 // resulting segments as a chain of FillSegChunks, and records the chain and
 // the accumulated backdrop in the item's FillItemHeader.
 void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    // Top-left corner of this tile in pixels.
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef fill_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_FILL_START);
    uint fill_n = tilegroup[fill_start.offset >> 2];
    FillTileHeaderRef tile_header_ref = FillTileHeaderRef(tile_ix * FillTileHeader_size);
    if (fill_n > 0) {
        ChunkRef chunk_ref = ChunkRef(fill_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        FillItemHeaderRef item_header = FillItemHeaderRef(atomicAdd(alloc, fill_n * FillItemHeader_size));
        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, item_header));
        // Write cursor into the segment allocation. It is shared by all items
        // of this tile, so it must live outside the loop: each item continues
        // where the previous one stopped.
        FillSegChunkRef seg_chunk_ref = FillSegChunkRef(0);
        uint seg_limit = 0;
        // Iterate through items; chunk.chunk_n holds the count remaining in
        // the current instance chunk.
        while (true) {
            if (chunk.chunk_n == 0) {
                chunk_ref = chunk.next;
                if (chunk_ref.offset == 0) {
                    break;
                }
                chunk = Chunk_read(chunk_ref);
                fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(fill_ref);
            PietFill fill = PietItem_Fill_read(PietItemRef(ins.item_ref));
            // Process the fill polyline item.
            uint max_n_segs = fill.n_points - 1;
            uint chunk_n_segs = 0;
            int backdrop = 0;
            // Note: seg_chunk_ref is deliberately NOT redeclared here. A local
            // declaration would shadow the cursor above with an uninitialized
            // value for every item and discard the advance made at the end of
            // the previous item.
            FillSegChunkRef first_seg_chunk = FillSegChunkRef(0);
            vec2 start = Point_read(fill.points).xy;
            for (uint j = 0; j < max_n_segs; j++) {
                fill.points.offset += Point_size;
                vec2 end = Point_read(fill.points).xy;
                // Process one segment.
                // TODO: I think this would go more smoothly (and be easier to
                // make numerically robust) if it were based on clipping the line
                // to the tile box. See:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
                vec2 xymin = min(start, end);
                vec2 xymax = max(start, end);
                // Implicit line equation a * x + b * y + c = 0 through start and end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                vec2 xy1 = xy0 + vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
                float ytop = max(xy0.y, xymin.y);
                float ybot = min(xy1.y, xymax.y);
                // Side of the line on which each corner of the clipped box lies.
                float s00 = sign(b * ytop + a * xy0.x + c);
                float s01 = sign(b * ytop + a * xy1.x + c);
                float s10 = sign(b * ybot + a * xy0.x + c);
                float s11 = sign(b * ybot + a * xy1.x + c);
                float sTopLeft = sign(b * xy0.y + a * xy0.x + c);
                if (sTopLeft == sign(a) && xymin.y <= xy0.y && xymax.y > xy0.y) {
                    backdrop -= int(s00);
                }
                // This is adapted from piet-metal but could be improved.
                if (max(xymin.x, xy0.x) < min(xymax.x, xy1.x)
                    && ytop < ybot
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    // avoid overwriting `end` so that it can be used as start
                    vec2 enc_end = end;
                    if (xymin.x < xy0.x) {
                        // The segment crosses the tile's left edge; b is
                        // non-zero here because the x extent is non-empty.
                        float yEdge = mix(start.y, end.y, (start.x - xy0.x) / b);
                        if (yEdge >= xy0.y && yEdge < xy1.y) {
                            // This is encoded the same as a general fill segment, but could be
                            // special-cased, either here or in rendering. (It was special-cased
                            // in piet-metal).
                            FillSegment edge_seg;
                            if (b > 0.0) {
                                enc_end = vec2(xy0.x, yEdge);
                                edge_seg.start = enc_end;
                                edge_seg.end = vec2(xy0.x, xy1.y);
                            } else {
                                start = vec2(xy0.x, yEdge);
                                edge_seg.start = vec2(xy0.x, xy1.y);
                                edge_seg.end = start;
                            }
                            alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                            FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), edge_seg);
                            chunk_n_segs++;
                        }
                    }
                    alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                    FillSegment seg = FillSegment(start, enc_end);
                    FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), seg);
                    chunk_n_segs++;
                }
                start = end;
            }
            FillItemHeader_write(item_header, FillItemHeader(backdrop, first_seg_chunk));
            if (chunk_n_segs != 0) {
                // Terminate the item's chain and move the cursor past the
                // data just written so the next item starts after it.
                FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(0)));
                seg_chunk_ref.offset += FillSegChunk_size + FillSegment_size * chunk_n_segs;
            }
            fill_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += FillItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, FillItemHeaderRef(0)));
    }
 }
--- a/piet-gpu/shader/kernel2f.spv
+++ b/piet-gpu/shader/kernel2f.spv
--- a/piet-gpu/shader/kernel2s.comp
+++ b/piet-gpu/shader/kernel2s.comp
@ -1,137 +0,0 @@
 // This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
 // (polyline) items in the scene and generates a list of segments for each, for
 // each tile.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 layout(local_size_x = 32) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 layout(set = 0, binding = 2) buffer SegmentBuf {
    uint[] segment;
 };
 layout(set = 0, binding = 3) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "segment.h"
 #include "setup.h"
 // Kernel 2 (strokes) entry point, one invocation per tile.
 //
 // Reads the stroke instance list stored for this tile's tilegroup and, for each
 // polyline instance, writes the segments that may touch the tile (padded by half
 // the stroke width) into a chain of SegChunks in the `segment` buffer. One
 // ItemHeader per instance records the first chunk of that chain (0 if no segment
 // hit), and the TileHeader records the item count and the first ItemHeader.
 void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    // Top-left corner of this tile, in pixels.
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
    // The first word at stroke_start is the number of stroke items.
    uint stroke_n = tilegroup[stroke_start.offset >> 2];
    TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
    if (stroke_n > 0) {
        // The first chunk of instances follows the count word directly.
        ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        // Reserve one ItemHeader per stroke item up front.
        ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
        // Segment allocation cursor and limit. These are deliberately declared
        // outside the item loop so that space left over in the current
        // allocation is reused by the next item. They must not be re-declared
        // inside the loop: a shadowing declaration would leave the cursor
        // uninitialized for every item.
        SegChunkRef seg_chunk_ref = SegChunkRef(0);
        uint seg_limit = 0;
        // Iterate through items chunk by chunk; the loop ends when the chunk
        // list is exhausted (next == 0).
        while (true) {
            if (chunk.chunk_n == 0) {
                chunk_ref = chunk.next;
                if (chunk_ref.offset == 0) {
                    break;
                }
                chunk = Chunk_read(chunk_ref);
                stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(stroke_ref);
            PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
            // Process the stroke polyline item.
            // NOTE(review): assumes poly.n_points >= 1; zero would wrap the
            // unsigned subtraction below - confirm the encoder guarantees this.
            uint max_n_segs = poly.n_points - 1;
            uint chunk_n_segs = 0;
            vec2 start = Point_read(poly.points).xy;
            for (uint j = 0; j < max_n_segs; j++) {
                poly.points.offset += Point_size;
                vec2 end = Point_read(poly.points).xy;
                // Process one segment.
                // This logic just tests for collision. What we probably want to do
                // is a clipping algorithm like Liang-Barsky, and then store coords
                // relative to the tile in f16. See also:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
                // Also note that when we go to the fancy version, we want to compute
                // the (horizontal projection of) the bounding box of the intersection
                // once per tilegroup, so we can assign work to individual tiles.
                // Implicit line equation a*x + b*y + c = 0 through start and end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                float half_width = 0.5 * poly.width;
                // Tile boundaries padded by half-width.
                float xmin = xy0.x - half_width;
                float ymin = xy0.y - half_width;
                float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
                float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
                // Side of the line on which each padded tile corner lies.
                float s00 = sign(b * ymin + a * xmin + c);
                float s01 = sign(b * ymin + a * xmax + c);
                float s10 = sign(b * ymax + a * xmin + c);
                float s11 = sign(b * ymax + a * xmax + c);
                // If bounding boxes intersect and not all four corners are on the same side, hit.
                // Also note: this is designed to be false on NAN input.
                if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
                    && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    // Allocate a chunk if needed.
                    if (chunk_n_segs == 0) {
                        // First segment of this item: start a new chunk, taking a
                        // fresh allocation when fewer than 40 bytes remain.
                        // (40 presumably = SegChunk_size + 2 * Segment_size - TODO confirm.)
                        if (seg_chunk_ref.offset + 40 > seg_limit) {
                            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
                            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size;
                        }
                        ItemHeader_write(item_header, ItemHeader(seg_chunk_ref));
                    } else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) {
                        // Current allocation is full: close this chunk with a link
                        // to a newly allocated one and continue there.
                        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
                        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size;
                        SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref)));
                        seg_chunk_ref.offset = new_chunk_ref;
                        chunk_n_segs = 0;
                    }
                    Segment seg = Segment(start, end);
                    Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
                    chunk_n_segs++;
                }
                start = end;
            }
            if (chunk_n_segs == 0) {
                // No segment of this item touched the tile.
                ItemHeader_write(item_header, ItemHeader(SegChunkRef(0)));
            } else {
                // Terminate the chain and advance the cursor past the written
                // data so the next item can reuse the rest of the allocation.
                SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
                seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
            }
            stroke_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += ItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
    }
 }
--- a/piet-gpu/shader/kernel2s.spv
+++ b/piet-gpu/shader/kernel2s.spv
--- a/piet-gpu/shader/kernel3.comp
+++ b/piet-gpu/shader/kernel3.comp
@ -1,135 +0,0 @@
 // This is "kernel 3" in a 4-kernel pipeline. It walks the active items
 // for the tilegroup and produces a per-tile command list for each tile.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 layout(local_size_x = 32, local_size_y = 1) in;
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };
 // TODO: this should have a `readonly` qualifier, but then inclusion
 // of ptcl.h would fail because of the writers.
 layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
 };
 // Used readonly
 layout(set = 0, binding = 2) buffer SegmentBuf {
    uint[] segment;
 };
 // Used readonly
 layout(set = 0, binding = 3) buffer FillSegmentBuf {
    uint[] fill_seg;
 };
 layout(set = 0, binding = 4) buffer PtclBuf {
    uint[] ptcl;
 };
 layout(set = 0, binding = 5) buffer AllocBuf {
    uint alloc;
 };
 #include "scene.h"
 #include "tilegroup.h"
 #include "segment.h"
 #include "fill_seg.h"
 #include "ptcl.h"
 #include "setup.h"
 // Make sure there is room to write a command at `cmd_ref`.
 //
 // When the cursor has moved past `cmd_limit`, a new block of
 // PTCL_INITIAL_ALLOC bytes is taken from the shared `alloc` counter, a jump
 // to it is written at the current position, and both the cursor and the
 // limit are moved to the new block. Otherwise nothing changes.
 void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset <= cmd_limit) {
        return;
    }
    uint fresh_base = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
    // The jump goes at the old cursor position, so write it before moving on.
    Cmd_Jump_write(cmd_ref, CmdJump(fresh_base));
    cmd_limit = fresh_base + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    cmd_ref = CmdRef(fresh_base);
 }
 // Kernel 3 entry point, one invocation per tile.
 //
 // Walks the instance list of the tile's tilegroup and appends one command per
 // relevant item to this tile's command list in `ptcl`, finishing with an End
 // command. Stroke and fill items consume the per-tile item headers (read via
 // TileHeader / FillTileHeader) in the same order as they appear in the
 // tilegroup list.
 void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    // Top-left corner of this tile, in pixels.
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
    // Each tile starts with its own PTCL_INITIAL_ALLOC-byte command region; the
    // limit leaves two command slots free (see alloc_cmd for the overflow path).
    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    // Cursors into the per-item headers for strokes and fills of this tile.
    TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
    FillTileHeader fill_th = FillTileHeader_read(FillTileHeaderRef(tile_ix * FillTileHeader_size));
    while (true) {
        uint tg_tag = TileGroup_tag(tg_ref);
        if (tg_tag == TileGroup_End) {
            break;
        }
        if (tg_tag == TileGroup_Jump) {
            // Follow the link to the continuation of the list.
            tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
            continue;
        }
        // Assume tg_tag is `Instance`, though there will be more cases.
        Instance ins = TileGroup_Instance_read(tg_ref);
        PietItemRef item_ref = PietItemRef(ins.item_ref);
        uint item_tag = PietItem_tag(item_ref);
        switch (item_tag) {
        case PietItem_Circle:
            PietCircle circle = PietItem_Circle_read(item_ref);
            vec2 center = ins.offset + circle.center.xy;
            float r = circle.radius;
            // Emit the circle only if its bounding box overlaps this tile.
            if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
                && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
            {
                CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Circle_write(cmd_ref, cmd);
                cmd_ref.offset += Cmd_size;
            }
            break;
        case PietItem_Poly:
            // Always consume the item header, even when it has no segments, so
            // the cursor stays in step with the instance list.
            ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
            stroke_th.items.offset += ItemHeader_size;
            if (stroke_item.segments.offset != 0) {
                PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
                CmdStroke cmd = CmdStroke(
                    stroke_item.segments.offset,
                    0.5 * poly.width,
                    poly.rgba_color
                );
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Stroke_write(cmd_ref, cmd);
                cmd_ref.offset += Cmd_size;
            }
            break;
        case PietItem_Fill:
            FillItemHeader fill_item = FillItemHeader_read(fill_th.items);
            fill_th.items.offset += FillItemHeader_size;
            // Three cases: segments present -> Fill command; no segments but a
            // nonzero backdrop -> the tile is solid, emit a Solid command; neither
            // -> the fill does not touch this tile and nothing is emitted.
            if (fill_item.segments.offset != 0) {
                PietFill fill = PietItem_Fill_read(item_ref);
                CmdFill cmd = CmdFill(
                    fill_item.segments.offset,
                    fill_item.backdrop,
                    fill.rgba_color
                );
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Fill_write(cmd_ref, cmd);
                cmd_ref.offset += Cmd_size;
            } else if (fill_item.backdrop != 0) {
                // TODO: truncate existing cmd list if alpha is opaque
                PietFill fill = PietItem_Fill_read(item_ref);
                alloc_cmd(cmd_ref, cmd_limit);
                Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
                cmd_ref.offset += Cmd_size;
            }
            break;
        }
        tg_ref.offset += TileGroup_size;
    }
    // Terminate the command list for this tile.
    Cmd_End_write(cmd_ref);
 }
--- a/piet-gpu/shader/kernel3.spv
+++ b/piet-gpu/shader/kernel3.spv
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -6,29 +6,20 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 #extension GL_KHR_shader_subgroup_basic : enable
-layout(local_size_x = 16, local_size_y = 16) in;
+#define CHUNK 8
 #define CHUNK_DY (16 / CHUNK)
 layout(local_size_x = 16, local_size_y = 2) in;
 // Same concern that this should be readonly as in kernel 3.
 layout(set = 0, binding = 0) buffer PtclBuf {
    uint[] ptcl;
 };
-// Used readonly
+layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
 layout(set = 0, binding = 1) buffer SegmentBuf {
    uint[] segment;
 };
 // Used readonly
 layout(set = 0, binding = 2) buffer FillSegBuf {
    uint[] fill_seg;
 };
 layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
 #include "ptcl.h"
 #include "segment.h"
 #include "fill_seg.h"
 #include "setup.h"
@ -36,10 +27,14 @@ void main() {
    uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
-    uvec2 xy_uint = gl_GlobalInvocationID.xy;
+    uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
    vec2 xy = vec2(xy_uint);
    vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
-    vec3 rgb = uv.xyy;
+    //vec3 rgb = uv.xyy;
    vec3 rgb[CHUNK];
    for (uint i = 0; i < CHUNK; i++) {
        rgb[i] = vec3(0.5);
    }
    while (true) {
        uint tag = Cmd_tag(cmd_ref);
@ -49,65 +44,85 @@ void main() {
        switch (tag) {
        case Cmd_Circle:
            CmdCircle circle = Cmd_Circle_read(cmd_ref);
            float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
            float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
            vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
-            // TODO: sRGB
+            for (uint i = 0; i < CHUNK; i++) {
-            rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
+                float dy = float(i * CHUNK_DY);
                float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy);
                float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
                // TODO: sRGB
                rgb[i] = mix(rgb[i], fg_rgba.rgb, alpha * fg_rgba.a);
            }
            break;
        case Cmd_Stroke:
            CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
-            float df = 1e9;
+            float df[CHUNK];
-            SegChunkRef seg_chunk_ref = SegChunkRef(stroke.seg_ref);
+            for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
            SegChunkRef seg_chunk_ref = stroke.seg_ref;
            do {
                SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
                SegmentRef segs = seg_chunk.segs;
                for (int i = 0; i < seg_chunk.n; i++) {
-                    Segment seg = Segment_read(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * i));
+                    Segment seg = Segment_read(Segment_index(segs, i));
                    vec2 line_vec = seg.end - seg.start;
-                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
+                    for (uint k = 0; k < CHUNK; k++) {
-                    float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
+                        vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
-                    df = min(df, length(line_vec * t - dpos));
+                        dpos.y += float(k * CHUNK_DY);
                        float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
                        df[k] = min(df[k], length(line_vec * t - dpos));
                    }
                }
                seg_chunk_ref = seg_chunk.next;
            } while (seg_chunk_ref.offset != 0);
            fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
-            alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0);
+            for (uint k = 0; k < CHUNK; k++) {
-            rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
+                float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
                rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
            }
            break;
        case Cmd_Fill:
            CmdFill fill = Cmd_Fill_read(cmd_ref);
            // Probably better to store as float, but conversion is no doubt cheap.
-            float area = float(fill.backdrop);
+            float area[CHUNK];
-            FillSegChunkRef fill_seg_chunk_ref = FillSegChunkRef(fill.seg_ref);
+            for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
            SegChunkRef fill_seg_chunk_ref = fill.seg_ref;
            do {
-                FillSegChunk seg_chunk = FillSegChunk_read(fill_seg_chunk_ref);
+                SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref);
                SegmentRef segs = seg_chunk.segs;
                for (int i = 0; i < seg_chunk.n; i++) {
-                    FillSegment seg = FillSegment_read(FillSegmentRef(fill_seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * i));
+                    Segment seg = Segment_read(Segment_index(segs, i));
-                    vec2 start = seg.start - xy;
+                    for (uint k = 0; k < CHUNK; k++) {
-                    vec2 end = seg.end - xy;
+                        vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
-                    vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
+                        vec2 start = seg.start - my_xy;
-                    if (window.x != window.y) {
+                        vec2 end = seg.end - my_xy;
-                        vec2 t = (window - start.y) / (end.y - start.y);
+                        vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
-                        vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
+                        if (window.x != window.y) {
-                        float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
+                            vec2 t = (window - start.y) / (end.y - start.y);
-                        float xmax = max(xs.x, xs.y);
+                            vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
-                        float b = min(xmax, 1.0);
+                            float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
-                        float c = max(b, 0.0);
+                            float xmax = max(xs.x, xs.y);
-                        float d = max(xmin, 0.0);
+                            float b = min(xmax, 1.0);
-                        float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
+                            float c = max(b, 0.0);
-                        area += a * (window.x - window.y);
+                            float d = max(xmin, 0.0);
                            float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
                            area[k] += a * (window.x - window.y);
                        }
                        area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
                    }
                }
                fill_seg_chunk_ref = seg_chunk.next;
            } while (fill_seg_chunk_ref.offset != 0);
            fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
-            alpha = min(abs(area), 1.0);
+            for (uint k = 0; k < CHUNK; k++) {
-            rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
+                float alpha = min(abs(area[k]), 1.0);
                rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
            }
            break;
        case Cmd_Solid:
            CmdSolid solid = Cmd_Solid_read(cmd_ref);
            fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
-            rgb = mix(rgb, fg_rgba.rgb, fg_rgba.a);
+            for (uint k = 0; k < CHUNK; k++) {
                rgb[k] = mix(rgb[k], fg_rgba.rgb, fg_rgba.a);
            }
            break;
        case Cmd_Jump:
            cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
@ -116,5 +131,8 @@ void main() {
        cmd_ref.offset += Cmd_size;
    }
-    imageStore(image, ivec2(xy_uint), vec4(rgb, 1.0));
+    // TODO: sRGB
    for (uint i = 0; i < CHUNK; i++) {
        imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(rgb[i], 1.0));
    }
 }
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@ -36,6 +36,14 @@ struct CmdRef {
    uint offset;
 };
 // Typed reference to a Segment: an offset into the `ptcl` buffer
 // (the accessors index the buffer with `offset >> 2`).
 struct SegmentRef {
    uint offset;
 };
 // Typed reference to a SegChunk: an offset into the `ptcl` buffer.
 struct SegChunkRef {
    uint offset;
 };
 struct CmdCircle {
    vec2 center;
    float radius;
@ -60,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
 }
 struct CmdStroke {
-    uint seg_ref;
+    SegChunkRef seg_ref;
    float half_width;
    uint rgba_color;
 };
@ -72,7 +80,7 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
 }
 struct CmdFill {
-    uint seg_ref;
+    SegChunkRef seg_ref;
    int backdrop;
    uint rgba_color;
 };
@ -141,6 +149,30 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
    return CmdRef(ref.offset + index * Cmd_size);
 }
 // One line segment plus its `y_edge` value; five 32-bit words when serialized.
 struct Segment {
    vec2 start;
    vec2 end;
    float y_edge;
 };
 #define Segment_size 20
 // Reference to the `index`-th Segment of an array starting at `ref`.
 SegmentRef Segment_index(SegmentRef ref, uint index) {
    uint stride_bytes = index * Segment_size;
    return SegmentRef(ref.offset + stride_bytes);
 }
 // Header of a chunk of segments: segment count, link to the next chunk and
 // a reference to the chunk's segment array.
 struct SegChunk {
    uint n;
    SegChunkRef next;
    SegmentRef segs;
 };
 #define SegChunk_size 12
 // Reference to the `index`-th SegChunk of an array starting at `ref`.
 SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
    uint stride_bytes = index * SegChunk_size;
    return SegChunkRef(ref.offset + stride_bytes);
 }
 CmdCircle CmdCircle_read(CmdCircleRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = ptcl[ix + 0];
@ -188,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
    uint raw1 = ptcl[ix + 1];
    uint raw2 = ptcl[ix + 2];
    CmdStroke s;
-    s.seg_ref = raw0;
+    s.seg_ref = SegChunkRef(raw0);
    s.half_width = uintBitsToFloat(raw1);
    s.rgba_color = raw2;
    return s;
@ -196,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
 void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.seg_ref;
+    ptcl[ix + 0] = s.seg_ref.offset;
    ptcl[ix + 1] = floatBitsToUint(s.half_width);
    ptcl[ix + 2] = s.rgba_color;
 }
@ -207,7 +239,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
    uint raw1 = ptcl[ix + 1];
    uint raw2 = ptcl[ix + 2];
    CmdFill s;
-    s.seg_ref = raw0;
+    s.seg_ref = SegChunkRef(raw0);
    s.backdrop = int(raw1);
    s.rgba_color = raw2;
    return s;
@ -215,7 +247,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
 void CmdFill_write(CmdFillRef ref, CmdFill s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.seg_ref;
+    ptcl[ix + 0] = s.seg_ref.offset;
    ptcl[ix + 1] = uint(s.backdrop);
    ptcl[ix + 2] = s.rgba_color;
 }
@ -362,3 +394,45 @@ void Cmd_Bail_write(CmdRef ref) {
    ptcl[ref.offset >> 2] = Cmd_Bail;
 }
 // Decode the Segment stored at `ref` in the `ptcl` word array.
 Segment Segment_read(SegmentRef ref) {
    uint base = ref.offset >> 2;
    uvec2 start_bits = uvec2(ptcl[base + 0], ptcl[base + 1]);
    uvec2 end_bits = uvec2(ptcl[base + 2], ptcl[base + 3]);
    uint y_edge_bits = ptcl[base + 4];
    return Segment(
        uintBitsToFloat(start_bits),
        uintBitsToFloat(end_bits),
        uintBitsToFloat(y_edge_bits)
    );
 }
 // Encode Segment `s` into the `ptcl` word array at `ref`.
 void Segment_write(SegmentRef ref, Segment s) {
    uint base = ref.offset >> 2;
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    ptcl[base + 0] = start_bits.x;
    ptcl[base + 1] = start_bits.y;
    ptcl[base + 2] = end_bits.x;
    ptcl[base + 3] = end_bits.y;
    ptcl[base + 4] = floatBitsToUint(s.y_edge);
 }
 // Decode the SegChunk header stored at `ref` in the `ptcl` word array.
 SegChunk SegChunk_read(SegChunkRef ref) {
    uint base = ref.offset >> 2;
    uint count = ptcl[base + 0];
    SegChunkRef next_chunk = SegChunkRef(ptcl[base + 1]);
    SegmentRef first_seg = SegmentRef(ptcl[base + 2]);
    return SegChunk(count, next_chunk, first_seg);
 }
 // Encode SegChunk header `s` into the `ptcl` word array at `ref`.
 void SegChunk_write(SegChunkRef ref, SegChunk s) {
    uint base = ref.offset >> 2;
    uvec3 words = uvec3(s.n, s.next.offset, s.segs.offset);
    ptcl[base + 0] = words.x;
    ptcl[base + 1] = words.y;
    ptcl[base + 2] = words.z;
 }
--- a/piet-gpu/shader/scene.h
+++ b/piet-gpu/shader/scene.h
@ -32,6 +32,38 @@ struct PietItemRef {
    uint offset;
 };
 // Typed references into the `scene` buffer. Each wraps a single offset; the
 // matching *_read function indexes `scene` with `offset >> 2`.

 // Reference to a LineSeg.
 struct LineSegRef {
    uint offset;
 };
 // Reference to a QuadSeg.
 struct QuadSegRef {
    uint offset;
 };
 // Reference to a CubicSeg.
 struct CubicSegRef {
    uint offset;
 };
 // Reference to a Fill.
 struct FillRef {
    uint offset;
 };
 // Reference to a Stroke.
 struct StrokeRef {
    uint offset;
 };
 // Reference to a SetLineWidth.
 struct SetLineWidthRef {
    uint offset;
 };
 // Reference to a Transform.
 struct TransformRef {
    uint offset;
 };
 // Reference to a tagged Element (tag word followed by the payload).
 struct ElementRef {
    uint offset;
 };
 struct Bbox {
    ivec4 bbox;
 };
@ -128,6 +160,98 @@ PietItemRef PietItem_index(PietItemRef ref, uint index) {
    return PietItemRef(ref.offset + index * PietItem_size);
 }
 // Line segment with two endpoints; four 32-bit words when serialized.
 struct LineSeg {
    vec2 p0;
    vec2 p1;
 };
 #define LineSeg_size 16
 // Reference to the `index`-th LineSeg of an array starting at `ref`.
 LineSegRef LineSeg_index(LineSegRef ref, uint index) {
    uint stride_bytes = index * LineSeg_size;
    return LineSegRef(ref.offset + stride_bytes);
 }
 // Segment described by three points; six 32-bit words when serialized.
 struct QuadSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
 };
 #define QuadSeg_size 24
 // Reference to the `index`-th QuadSeg of an array starting at `ref`.
 QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
    uint stride_bytes = index * QuadSeg_size;
    return QuadSegRef(ref.offset + stride_bytes);
 }
 // Segment described by four points; eight 32-bit words when serialized.
 struct CubicSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
 };
 #define CubicSeg_size 32
 // Reference to the `index`-th CubicSeg of an array starting at `ref`.
 CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
    uint stride_bytes = index * CubicSeg_size;
    return CubicSegRef(ref.offset + stride_bytes);
 }
 // Fill element payload: a single packed color word.
 struct Fill {
    uint rgba_color;
 };
 #define Fill_size 4
 // Reference to the `index`-th Fill of an array starting at `ref`.
 FillRef Fill_index(FillRef ref, uint index) {
    uint stride_bytes = index * Fill_size;
    return FillRef(ref.offset + stride_bytes);
 }
 // Stroke element payload: a single packed color word.
 struct Stroke {
    uint rgba_color;
 };
 #define Stroke_size 4
 // Reference to the `index`-th Stroke of an array starting at `ref`.
 StrokeRef Stroke_index(StrokeRef ref, uint index) {
    uint stride_bytes = index * Stroke_size;
    return StrokeRef(ref.offset + stride_bytes);
 }
 // SetLineWidth element payload: the new line width.
 struct SetLineWidth {
    float width;
 };
 #define SetLineWidth_size 4
 // Reference to the `index`-th SetLineWidth of an array starting at `ref`.
 SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
    uint stride_bytes = index * SetLineWidth_size;
    return SetLineWidthRef(ref.offset + stride_bytes);
 }
 // Transform element payload: four matrix coefficients and a translation;
 // six 32-bit words when serialized.
 struct Transform {
    vec4 mat;
    vec2 translate;
 };
 #define Transform_size 24
 // Reference to the `index`-th Transform of an array starting at `ref`.
 TransformRef Transform_index(TransformRef ref, uint index) {
    uint stride_bytes = index * Transform_size;
    return TransformRef(ref.offset + stride_bytes);
 }
 // Tag values stored in the first word of an Element.
 #define Element_Nop 0
 #define Element_StrokeLine 1
 #define Element_FillLine 2
 #define Element_Quad 3
 #define Element_Cubic 4
 #define Element_Stroke 5
 #define Element_Fill 6
 #define Element_SetLineWidth 7
 #define Element_Transform 8
 #define Element_size 36
 // Reference to the `index`-th Element of an array starting at `ref`.
 ElementRef Element_index(ElementRef ref, uint index) {
    uint stride_bytes = index * Element_size;
    return ElementRef(ref.offset + stride_bytes);
 }
 Bbox Bbox_read(BboxRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
@ -236,3 +360,122 @@ PietStrokePolyLine PietItem_Poly_read(PietItemRef ref) {
    return PietStrokePolyLine_read(PietStrokePolyLineRef(ref.offset + 4));
 }
 // Decode the LineSeg stored at `ref` in the `scene` word array.
 LineSeg LineSeg_read(LineSegRef ref) {
    uint base = ref.offset >> 2;
    uvec2 p0_bits = uvec2(scene[base + 0], scene[base + 1]);
    uvec2 p1_bits = uvec2(scene[base + 2], scene[base + 3]);
    return LineSeg(uintBitsToFloat(p0_bits), uintBitsToFloat(p1_bits));
 }
 // Decode the QuadSeg stored at `ref` in the `scene` word array.
 QuadSeg QuadSeg_read(QuadSegRef ref) {
    uint base = ref.offset >> 2;
    uvec2 p0_bits = uvec2(scene[base + 0], scene[base + 1]);
    uvec2 p1_bits = uvec2(scene[base + 2], scene[base + 3]);
    uvec2 p2_bits = uvec2(scene[base + 4], scene[base + 5]);
    return QuadSeg(
        uintBitsToFloat(p0_bits),
        uintBitsToFloat(p1_bits),
        uintBitsToFloat(p2_bits)
    );
 }
 // Decode the CubicSeg stored at `ref` in the `scene` word array.
 CubicSeg CubicSeg_read(CubicSegRef ref) {
    uint base = ref.offset >> 2;
    uvec2 p0_bits = uvec2(scene[base + 0], scene[base + 1]);
    uvec2 p1_bits = uvec2(scene[base + 2], scene[base + 3]);
    uvec2 p2_bits = uvec2(scene[base + 4], scene[base + 5]);
    uvec2 p3_bits = uvec2(scene[base + 6], scene[base + 7]);
    return CubicSeg(
        uintBitsToFloat(p0_bits),
        uintBitsToFloat(p1_bits),
        uintBitsToFloat(p2_bits),
        uintBitsToFloat(p3_bits)
    );
 }
 // Decode the Fill stored at `ref` in the `scene` word array.
 Fill Fill_read(FillRef ref) {
    uint color_word = scene[(ref.offset >> 2) + 0];
    return Fill(color_word);
 }
 // Decode the Stroke stored at `ref` in the `scene` word array.
 Stroke Stroke_read(StrokeRef ref) {
    uint color_word = scene[(ref.offset >> 2) + 0];
    return Stroke(color_word);
 }
 // Decode the SetLineWidth stored at `ref` in the `scene` word array.
 SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
    uint width_bits = scene[(ref.offset >> 2) + 0];
    return SetLineWidth(uintBitsToFloat(width_bits));
 }
 // Decode the Transform stored at `ref` in the `scene` word array.
 Transform Transform_read(TransformRef ref) {
    uint base = ref.offset >> 2;
    uvec4 mat_bits = uvec4(scene[base + 0], scene[base + 1], scene[base + 2], scene[base + 3]);
    uvec2 translate_bits = uvec2(scene[base + 4], scene[base + 5]);
    return Transform(uintBitsToFloat(mat_bits), uintBitsToFloat(translate_bits));
 }
 // Tag word of the Element at `ref` (one of the Element_* constants).
 uint Element_tag(ElementRef ref) {
    uint tag_ix = ref.offset >> 2;
    return scene[tag_ix];
 }
 // Read the LineSeg payload of a StrokeLine element (stored after the tag word).
 LineSeg Element_StrokeLine_read(ElementRef ref) {
    LineSegRef payload = LineSegRef(ref.offset + 4);
    return LineSeg_read(payload);
 }
 // Read the LineSeg payload of a FillLine element (stored after the tag word).
 LineSeg Element_FillLine_read(ElementRef ref) {
    LineSegRef payload = LineSegRef(ref.offset + 4);
    return LineSeg_read(payload);
 }
 // Read the QuadSeg payload of a Quad element (stored after the tag word).
 QuadSeg Element_Quad_read(ElementRef ref) {
    QuadSegRef payload = QuadSegRef(ref.offset + 4);
    return QuadSeg_read(payload);
 }
 // Read the CubicSeg payload of a Cubic element (stored after the tag word).
 CubicSeg Element_Cubic_read(ElementRef ref) {
    CubicSegRef payload = CubicSegRef(ref.offset + 4);
    return CubicSeg_read(payload);
 }
 // Read the Stroke payload of a Stroke element (stored after the tag word).
 Stroke Element_Stroke_read(ElementRef ref) {
    StrokeRef payload = StrokeRef(ref.offset + 4);
    return Stroke_read(payload);
 }
 // Read the Fill payload of a Fill element (stored after the tag word).
 Fill Element_Fill_read(ElementRef ref) {
    FillRef payload = FillRef(ref.offset + 4);
    return Fill_read(payload);
 }
 // Read the SetLineWidth payload of a SetLineWidth element (stored after the tag word).
 SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
    SetLineWidthRef payload = SetLineWidthRef(ref.offset + 4);
    return SetLineWidth_read(payload);
 }
 // Read the Transform payload of a Transform element (stored after the tag word).
 Transform Element_Transform_read(ElementRef ref) {
    TransformRef payload = TransformRef(ref.offset + 4);
    return Transform_read(payload);
 }
--- a/piet-gpu/shader/segment.h
+++ b/piet-gpu/shader/segment.h
@ -1,126 +0,0 @@
 // Code auto-generated by piet-gpu-derive
 // Typed references into the `segment` buffer. Each wraps a single offset; the
 // matching *_read / *_write functions index `segment` with `offset >> 2`.

 // Reference to a TileHeader.
 struct TileHeaderRef {
    uint offset;
 };
 // Reference to an ItemHeader.
 struct ItemHeaderRef {
    uint offset;
 };
 // Reference to a Segment.
 struct SegmentRef {
    uint offset;
 };
 // Reference to a SegChunk.
 struct SegChunkRef {
    uint offset;
 };
 // Per-tile header: item count and a reference to the first ItemHeader.
 struct TileHeader {
    uint n;
    ItemHeaderRef items;
 };
 #define TileHeader_size 8
 // Reference to the `index`-th TileHeader of an array starting at `ref`.
 TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
    uint stride_bytes = index * TileHeader_size;
    return TileHeaderRef(ref.offset + stride_bytes);
 }
 // Per-item header: reference to the item's first segment chunk.
 struct ItemHeader {
    SegChunkRef segments;
 };
 #define ItemHeader_size 4
 // Reference to the `index`-th ItemHeader of an array starting at `ref`.
 ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
    uint stride_bytes = index * ItemHeader_size;
    return ItemHeaderRef(ref.offset + stride_bytes);
 }
 // One line segment (start and end point); four 32-bit words when serialized.
 struct Segment {
    vec2 start;
    vec2 end;
 };
 #define Segment_size 16
 // Reference to the `index`-th Segment of an array starting at `ref`.
 SegmentRef Segment_index(SegmentRef ref, uint index) {
    uint stride_bytes = index * Segment_size;
    return SegmentRef(ref.offset + stride_bytes);
 }
 // Header of a chunk of segments: segment count and link to the next chunk.
 struct SegChunk {
    uint n;
    SegChunkRef next;
 };
 #define SegChunk_size 8
 // Reference to the `index`-th SegChunk of an array starting at `ref`.
 SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
    uint stride_bytes = index * SegChunk_size;
    return SegChunkRef(ref.offset + stride_bytes);
 }
 // Decode the TileHeader stored at `ref` in the `segment` word array.
 TileHeader TileHeader_read(TileHeaderRef ref) {
    uint base = ref.offset >> 2;
    uint count = segment[base + 0];
    ItemHeaderRef first_item = ItemHeaderRef(segment[base + 1]);
    return TileHeader(count, first_item);
 }
 // Encode TileHeader `s` into the `segment` word array at `ref`.
 void TileHeader_write(TileHeaderRef ref, TileHeader s) {
    uint base = ref.offset >> 2;
    uvec2 words = uvec2(s.n, s.items.offset);
    segment[base + 0] = words.x;
    segment[base + 1] = words.y;
 }
 // Decode the ItemHeader stored at `ref` in the `segment` word array.
 ItemHeader ItemHeader_read(ItemHeaderRef ref) {
    uint chunk_offset = segment[(ref.offset >> 2) + 0];
    return ItemHeader(SegChunkRef(chunk_offset));
 }
 // Encode ItemHeader `s` into the `segment` word array at `ref`.
 void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
    uint base = ref.offset >> 2;
    uint chunk_offset = s.segments.offset;
    segment[base + 0] = chunk_offset;
 }
 // Decode the Segment stored at `ref` in the `segment` word array.
 Segment Segment_read(SegmentRef ref) {
    uint base = ref.offset >> 2;
    uvec2 start_bits = uvec2(segment[base + 0], segment[base + 1]);
    uvec2 end_bits = uvec2(segment[base + 2], segment[base + 3]);
    return Segment(uintBitsToFloat(start_bits), uintBitsToFloat(end_bits));
 }
 // Encode Segment `s` into the `segment` word array at `ref`.
 void Segment_write(SegmentRef ref, Segment s) {
    uint base = ref.offset >> 2;
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    segment[base + 0] = start_bits.x;
    segment[base + 1] = start_bits.y;
    segment[base + 2] = end_bits.x;
    segment[base + 3] = end_bits.y;
 }
 // Decode the SegChunk header stored at `ref` in the `segment` word array.
 SegChunk SegChunk_read(SegChunkRef ref) {
    uint base = ref.offset >> 2;
    uint count = segment[base + 0];
    SegChunkRef next_chunk = SegChunkRef(segment[base + 1]);
    return SegChunk(count, next_chunk);
 }
 // Encode SegChunk header `s` into the `segment` word array at `ref`.
 void SegChunk_write(SegChunkRef ref, SegChunk s) {
    uint base = ref.offset >> 2;
    uvec2 words = uvec2(s.n, s.next.offset);
    segment[base + 0] = words.x;
    segment[base + 1] = words.y;
 }
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@ -39,4 +39,26 @@
 // Maximum number of segments in a SegChunk
 #define SEG_CHUNK_N 32
-#define SEG_CHUNK_ALLOC 512
+#define SEG_CHUNK_ALLOC 512
 // Stuff for new algorithm follows; some of the above should get
 // deleted.
 // These should probably be renamed and/or reworked. In the binning
 // kernel, they represent the number of bins. Also, the workgroup size
 // of that kernel is equal to the number of bins, but should probably
 // be more flexible (it's 512 in the K&L paper).
 #define N_TILE_X 16
 #define N_TILE_Y 16
 #define N_TILE (N_TILE_X * N_TILE_Y)
 #define LG_N_TILE 8
 #define N_SLICE (N_TILE / 32)
 // Number of workgroups for binning kernel
 #define N_WG 16
 // This is the ratio of the number of elements in a binning workgroup
 // over the number of elements in a partition workgroup.
 #define ELEMENT_BINNING_RATIO 2
 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
--- a/piet-gpu/shader/state.h
+++ b/piet-gpu/shader/state.h
@ -0,0 +1,59 @@
 // Code auto-generated by piet-gpu-derive
 // Typed reference to a State record: a byte offset into the `state`
 // buffer (the accessors convert it to a word index with `>> 2`).
 struct StateRef {
    uint offset;
 };
 // One State record, serialized as 12 consecutive 32-bit words
 // (State_size = 48 bytes) in the order the fields are declared.
 struct State {
    // Presumably the 2x2 linear part of an affine transform -- TODO confirm.
    vec4 mat;
    // Presumably the translation part of the same transform -- TODO confirm.
    vec2 translate;
    // Bounding box stored as four floats (component order not visible here).
    vec4 bbox;
    // Line width as a float.
    float linewidth;
    // Flag bits stored as a raw uint; meanings are defined elsewhere.
    uint flags;
 };
 #define State_size 48
 // Returns a reference to the `index`-th State after `ref`, treating
 // `ref` as the start of an array of State_size-byte records.
 StateRef State_index(StateRef ref, uint index) {
    uint byte_offset = ref.offset + index * State_size;
    return StateRef(byte_offset);
 }
 // Loads a State from the `state` buffer at byte offset `ref.offset`.
 // Words 0-3 are mat, 4-5 translate, 6-9 bbox, 10 linewidth (all raw
 // float bits) and word 11 is flags (kept as uint).
 State State_read(StateRef ref) {
    uint base = ref.offset >> 2;
    uvec4 mat_bits = uvec4(state[base + 0], state[base + 1], state[base + 2], state[base + 3]);
    uvec2 translate_bits = uvec2(state[base + 4], state[base + 5]);
    uvec4 bbox_bits = uvec4(state[base + 6], state[base + 7], state[base + 8], state[base + 9]);
    State result;
    result.mat = uintBitsToFloat(mat_bits);
    result.translate = uintBitsToFloat(translate_bits);
    result.bbox = uintBitsToFloat(bbox_bits);
    result.linewidth = uintBitsToFloat(state[base + 10]);
    result.flags = state[base + 11];
    return result;
 }
 // Stores a State into the `state` buffer at byte offset `ref.offset`,
 // using the same 12-word layout that State_read expects.
 void State_write(StateRef ref, State s) {
    uint base = ref.offset >> 2;
    uvec4 mat_bits = floatBitsToUint(s.mat);
    uvec2 translate_bits = floatBitsToUint(s.translate);
    uvec4 bbox_bits = floatBitsToUint(s.bbox);
    state[base + 0] = mat_bits.x;
    state[base + 1] = mat_bits.y;
    state[base + 2] = mat_bits.z;
    state[base + 3] = mat_bits.w;
    state[base + 4] = translate_bits.x;
    state[base + 5] = translate_bits.y;
    state[base + 6] = bbox_bits.x;
    state[base + 7] = bbox_bits.y;
    state[base + 8] = bbox_bits.z;
    state[base + 9] = bbox_bits.w;
    state[base + 10] = floatBitsToUint(s.linewidth);
    state[base + 11] = s.flags;
 }
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -1,5 +1,5 @@
 mod render_ctx;
 mod pico_svg;
 mod render_ctx;
 pub use render_ctx::PietGpuRenderContext;
@ -8,6 +8,8 @@ use rand::{Rng, RngCore};
 use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
 use piet::{Color, RenderContext};
 use piet_gpu_types::encoder::Encode;
 use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout, MemFlags};
 use pico_svg::PicoSvg;
@ -28,7 +30,20 @@ const PTCL_INITIAL_ALLOC: usize = 1024;
 const K2_PER_TILE_SIZE: usize = 8;
-const N_CIRCLES: usize = 1;
+const N_CIRCLES: usize = 0;
 const N_WG: u32 = 16;
 /// Reads the SVG file at `filename`, parses it with [`PicoSvg`] at the given
 /// `scale`, and renders the result into `rc`.
 ///
 /// The duration of the parsing phase and of the render (flattening and
 /// encoding) phase is printed to stdout.
 ///
 /// # Panics
 ///
 /// Panics if the file cannot be read or if the SVG fails to parse.
 pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
    let xml_str = std::fs::read_to_string(filename).unwrap();
    let parse_timer = std::time::Instant::now();
    let svg = PicoSvg::load(&xml_str, scale).unwrap();
    let parse_elapsed = parse_timer.elapsed();
    println!("parsing time: {:?}", parse_elapsed);
    let encode_timer = std::time::Instant::now();
    svg.render(rc);
    let encode_elapsed = encode_timer.elapsed();
    println!("flattening and encoding time: {:?}", encode_elapsed);
 }
 pub fn render_scene(rc: &mut impl RenderContext) {
    let mut rng = rand::thread_rng();
@ -42,12 +57,14 @@ pub fn render_scene(rc: &mut impl RenderContext) {
        let circle = Circle::new(center, radius);
        rc.fill(circle, &color);
    }
    /*
    let mut path = BezPath::new();
    path.move_to((100.0, 1150.0));
    path.line_to((200.0, 1200.0));
    path.line_to((150.0, 1250.0));
    path.close_path();
    rc.fill(path, &Color::rgb8(128, 0, 128));
    */
    rc.stroke(
        Line::new((100.0, 100.0), (200.0, 150.0)),
        &Color::WHITE,
@ -59,7 +76,7 @@ pub fn render_scene(rc: &mut impl RenderContext) {
 #[allow(unused)]
 fn render_cardioid(rc: &mut impl RenderContext) {
-    let n = 91;
+    let n = 601;
    let dth = std::f64::consts::PI * 2.0 / (n as f64);
    let center = Point::new(1024.0, 768.0);
    let r = 750.0;
@ -67,7 +84,7 @@ fn render_cardioid(rc: &mut impl RenderContext) {
    for i in 1..n {
        let p0 = center + Vec2::from_angle(i as f64 * dth) * r;
        let p1 = center + Vec2::from_angle(((i * 2) % n) as f64 * dth) * r;
-        rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
+        //rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
        path.move_to(p0);
        path.line_to(p1);
        //rc.stroke(Line::new(p0, p1), &Color::BLACK, 2.0);
@ -96,10 +113,10 @@ fn dump_scene(buf: &[u8]) {
 }
 #[allow(unused)]
-fn dump_k1_data(k1_buf: &[u32]) {
+pub fn dump_k1_data(k1_buf: &[u32]) {
    for i in 0..k1_buf.len() {
        if k1_buf[i] != 0 {
-            println!("{:4x}: {:8x}", i, k1_buf[i]);
+            println!("{:4x}: {:8x}", i * 4, k1_buf[i]);
        }
    }
 }
@ -110,27 +127,30 @@ pub struct Renderer<D: Device> {
    scene_buf: D::Buffer,
    scene_dev: D::Buffer,
-    k1_alloc_buf_host: D::Buffer,
+    pub state_buf: D::Buffer,
-    k1_alloc_buf_dev: D::Buffer,
+    pub anno_buf: D::Buffer,
-    k2s_alloc_buf_host: D::Buffer,
+    pub bin_buf: D::Buffer,
-    k2s_alloc_buf_dev: D::Buffer,
+    pub ptcl_buf: D::Buffer,
-    k2f_alloc_buf_host: D::Buffer,
+
-    k2f_alloc_buf_dev: D::Buffer,
+    el_pipeline: D::Pipeline,
-    k3_alloc_buf_host: D::Buffer,
+    el_ds: D::DescriptorSet,
-    k3_alloc_buf_dev: D::Buffer,
+
-    tilegroup_buf: D::Buffer,
+    bin_pipeline: D::Pipeline,
-    ptcl_buf: D::Buffer,
+    bin_ds: D::DescriptorSet,
    bin_alloc_buf_host: D::Buffer,
    bin_alloc_buf_dev: D::Buffer,
    coarse_pipeline: D::Pipeline,
    coarse_ds: D::DescriptorSet,
    coarse_alloc_buf_host: D::Buffer,
    coarse_alloc_buf_dev: D::Buffer,
    k1_pipeline: D::Pipeline,
    k1_ds: D::DescriptorSet,
    k2s_pipeline: D::Pipeline,
    k2s_ds: D::DescriptorSet,
    k2f_pipeline: D::Pipeline,
    k2f_ds: D::DescriptorSet,
    k3_pipeline: D::Pipeline,
    k3_ds: D::DescriptorSet,
    k4_pipeline: D::Pipeline,
    k4_ds: D::DescriptorSet,
    n_elements: usize,
 }
 impl<D: Device> Renderer<D> {
@ -138,6 +158,9 @@ impl<D: Device> Renderer<D> {
        let host = MemFlags::host_coherent();
        let dev = MemFlags::device_local();
        let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
        println!("scene: {} elements", n_elements);
        let scene_buf = device
            .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
            .unwrap();
@ -146,174 +169,121 @@ impl<D: Device> Renderer<D> {
            .unwrap();
        device.write_buffer(&scene_buf, &scene)?;
-        let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
+        let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
        let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
        let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
-        let k1_alloc_buf_host = device.create_buffer(4, host)?;
+        let el_code = include_bytes!("../shader/elements.spv");
-        let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
+        let el_pipeline = device.create_simple_compute_pipeline(el_code, 3, 0)?;
-        let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
+        let el_ds = device.create_descriptor_set(
-        device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
+            &el_pipeline,
-        let k1_code = include_bytes!("../shader/kernel1.spv");
+            &[&scene_dev, &state_buf, &anno_buf],
-        let k1_pipeline = device
+            &[],
-            .create_simple_compute_pipeline(k1_code, 3, 0)?;
+        )?;
        let k1_ds = device
            .create_descriptor_set(
                &k1_pipeline,
                &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
                &[],
            )?;
-        let k2s_alloc_buf_host = device.create_buffer(4, host)?;
+        let bin_alloc_buf_host = device.create_buffer(12, host)?;
-        let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
+        let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
        let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
        device
            .write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
            ?;
        let k2s_code = include_bytes!("../shader/kernel2s.spv");
        let k2s_pipeline = device
            .create_simple_compute_pipeline(k2s_code, 4, 0)
            ?;
        let k2s_ds = device
            .create_descriptor_set(
                &k2s_pipeline,
                &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
                &[],
            )
            ?;
-        let k2f_alloc_buf_host = device.create_buffer(4, host)?;
+        // TODO: constants
-        let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
+        let bin_alloc_start = ((n_elements + 255) & !255) * 8;
-        let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
+        device.write_buffer(
-        device
+            &bin_alloc_buf_host,
-            .write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])
+            &[n_elements as u32, 0, bin_alloc_start as u32],
-            ?;
+        )?;
-        let k2f_code = include_bytes!("../shader/kernel2f.spv");
+        let bin_code = include_bytes!("../shader/binning.spv");
-        let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
-        let k2f_ds = device
+        let bin_ds = device.create_descriptor_set(
-            .create_descriptor_set(
+            &bin_pipeline,
-                &k2f_pipeline,
+            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
-                &[
+            &[],
-                    &scene_dev,
+        )?;
                    &tilegroup_buf,
                    &fill_seg_buf,
                    &k2f_alloc_buf_dev,
                ],
                &[],
            )
            ?;
-        let k3_alloc_buf_host = device.create_buffer(4, host)?;
+        let coarse_alloc_buf_host = device.create_buffer(8, host)?;
-        let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
+        let coarse_alloc_buf_dev = device.create_buffer(8, dev)?;
-        let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
+
-        device
+        let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
-            .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
+        device.write_buffer(
-            ?;
+            &coarse_alloc_buf_host,
-        let k3_code = include_bytes!("../shader/kernel3.spv");
+            &[n_elements as u32, coarse_alloc_start as u32],
-        let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
+        )?;
-        let k3_ds = device
+        let coarse_code = include_bytes!("../shader/coarse.spv");
-            .create_descriptor_set(
+        let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
-                &k3_pipeline,
+        let coarse_ds = device.create_descriptor_set(
-                &[
+            &coarse_pipeline,
-                    &scene_dev,
+            &[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
-                    &tilegroup_buf,
+            &[],
-                    &segment_buf,
+        )?;
                    &fill_seg_buf,
                    &ptcl_buf,
                    &k3_alloc_buf_dev,
                ],
                &[],
            )
            ?;
        let k4_code = include_bytes!("../shader/kernel4.spv");
-        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
+        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
-        let k4_ds = device
+        let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
            .create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev])
            ?;
        Ok(Renderer {
            scene_buf,
            scene_dev,
            image_dev,
-            k1_alloc_buf_host,
+            el_pipeline,
-            k1_alloc_buf_dev,
+            el_ds,
-            k2s_alloc_buf_host,
+            bin_pipeline,
-            k2s_alloc_buf_dev,
+            bin_ds,
-            k2f_alloc_buf_host,
+            coarse_pipeline,
-            k2f_alloc_buf_dev,
+            coarse_ds,
            k3_alloc_buf_host,
            k3_alloc_buf_dev,
            tilegroup_buf,
            ptcl_buf,
            k1_pipeline,
            k1_ds,
            k2s_pipeline,
            k2s_ds,
            k2f_pipeline,
            k2f_ds,
            k3_pipeline,
            k3_ds,
            k4_pipeline,
            k4_ds,
            state_buf,
            anno_buf,
            bin_buf,
            ptcl_buf,
            bin_alloc_buf_host,
            bin_alloc_buf_dev,
            coarse_alloc_buf_host,
            coarse_alloc_buf_dev,
            n_elements,
        })
    }
    pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
        cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
-        // Note: we could use one alloc buf and reuse it. But we'll stick with
+        cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
-        // multiple ones for clarity.
+        cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
-        cmd_buf.copy_buffer(&self.k1_alloc_buf_host, &self.k1_alloc_buf_dev);
+        cmd_buf.clear_buffer(&self.state_buf);
        cmd_buf.copy_buffer(&self.k2s_alloc_buf_host, &self.k2s_alloc_buf_dev);
        cmd_buf.copy_buffer(&self.k2f_alloc_buf_host, &self.k2f_alloc_buf_dev);
        cmd_buf.copy_buffer(&self.k3_alloc_buf_host, &self.k3_alloc_buf_dev);
        // Note: these clears aren't necessary, and are here to make inspection
        // of the buffers cleaner. Can likely be removed.
        cmd_buf.clear_buffer(&self.tilegroup_buf);
        cmd_buf.clear_buffer(&self.ptcl_buf);
        cmd_buf.memory_barrier();
-        cmd_buf.image_barrier(&self.image_dev, ImageLayout::Undefined, ImageLayout::General);
+        cmd_buf.image_barrier(
            &self.image_dev,
            ImageLayout::Undefined,
            ImageLayout::General,
        );
        cmd_buf.reset_query_pool(&query_pool);
        cmd_buf.write_timestamp(&query_pool, 0);
        cmd_buf.dispatch(
-            &self.k1_pipeline,
+            &self.el_pipeline,
-            &self.k1_ds,
+            &self.el_ds,
-            ((WIDTH / 512) as u32, (HEIGHT / 512) as u32, 1),
+            (((self.n_elements + 127) / 128) as u32, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 1);
        cmd_buf.memory_barrier();
        cmd_buf.dispatch(
-            &self.k2s_pipeline,
+            &self.bin_pipeline,
-            &self.k2s_ds,
+            &self.bin_ds,
-            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
+            (((self.n_elements + 255) / 256) as u32, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 2);
        // Note: this barrier is not necessary (k2f does not depend on
        // k2s output), but I'm keeping it here to increase transparency
        // of performance.
        cmd_buf.memory_barrier();
        cmd_buf.dispatch(
-            &self.k2f_pipeline,
+            &self.coarse_pipeline,
-            &self.k2f_ds,
+            &self.coarse_ds,
-            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 2),
+            (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 3);
        cmd_buf.memory_barrier();
        cmd_buf.dispatch(
            &self.k3_pipeline,
            &self.k3_ds,
            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 3),
        );
        cmd_buf.write_timestamp(&query_pool, 4);
        cmd_buf.memory_barrier();
        cmd_buf.dispatch(
            &self.k4_pipeline,
            &self.k4_ds,
            ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
        );
-        cmd_buf.write_timestamp(&query_pool, 5);
+        cmd_buf.write_timestamp(&query_pool, 4);
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
    }
--- a/piet-gpu/src/pico_svg.rs
+++ b/piet-gpu/src/pico_svg.rs
@ -2,7 +2,7 @@
 use std::str::FromStr;
-use roxmltree::Document;
+use roxmltree::{Document, Node};
 use piet::kurbo::{Affine, BezPath};
@ -28,27 +28,19 @@ pub struct FillItem {
    path: BezPath,
 }
 /// Recursive SVG walker that appends the fill and stroke items it finds
 /// to a caller-owned vector.
 struct Parser<'a> {
    /// Scale factor applied to path geometry and stroke widths; a negative
    /// value selects a flipped transform in `rec_parse`.
    scale: f64,
    /// Output vector that parsed items are pushed onto.
    items: &'a mut Vec<Item>,
 }
 impl PicoSvg {
    pub fn load(xml_string: &str, scale: f64) -> Result<PicoSvg, Box<dyn std::error::Error>> {
        let doc = Document::parse(xml_string)?;
        let root = doc.root_element();
        let g = root.first_element_child().ok_or("no root element")?;
        let mut items = Vec::new();
-        for el in g.children() {
+        let mut parser = Parser::new(&mut items, scale);
-            if el.is_element() {
+        for node in root.children() {
-                let d = el.attribute("d").ok_or("missing 'd' attribute")?;
+            parser.rec_parse(node)?;
                let bp = BezPath::from_svg(d)?;
                let path = Affine::scale(scale) * bp;
                if let Some(fill_color) = el.attribute("fill") {
                    let color = parse_color(fill_color);
                    items.push(Item::Fill(FillItem { color, path: path.clone() }));
                }
                if let Some(stroke_color) = el.attribute("stroke") {
                    let width = f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?;
                    let color = parse_color(stroke_color);
                    items.push(Item::Stroke(StrokeItem { width, color, path }));
                }
            }
        }
        Ok(PicoSvg { items })
    }
@ -58,6 +50,7 @@ impl PicoSvg {
            match item {
                Item::Fill(fill_item) => {
                    rc.fill(&fill_item.path, &fill_item.color);
                    //rc.stroke(&fill_item.path, &fill_item.color, 1.0);
                }
                Item::Stroke(stroke_item) => {
                    rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
@ -67,6 +60,59 @@ impl PicoSvg {
    }
 }
 impl<'a> Parser<'a> {
    /// Creates a parser that pushes parsed items onto `items`, scaling all
    /// geometry by `scale`.
    fn new(items: &'a mut Vec<Item>, scale: f64) -> Parser<'a> {
        Parser { scale, items }
    }
    /// Recursively walks `node`, descending into `<g>` groups and turning
    /// each `<path>` into fill and/or stroke items.
    ///
    /// Non-element nodes and elements other than `g` and `path` are ignored.
    ///
    /// # Errors
    ///
    /// Returns an error if a `<path>` has no `d` attribute, if its path data
    /// cannot be parsed, or if its `stroke-width` is not a valid number.
    fn rec_parse(&mut self, node: Node) -> Result<(), Box<dyn std::error::Error>> {
        if !node.is_element() {
            return Ok(());
        }
        match node.tag_name().name() {
            "g" => {
                for child in node.children() {
                    self.rec_parse(child)?;
                }
            }
            "path" => {
                // A non-negative scale is a plain uniform scale. A negative
                // scale keeps x scaled by |scale| but flips y and shifts it by
                // 1536 (presumably the canvas height -- TODO confirm).
                let transform = if self.scale >= 0.0 {
                    Affine::scale(self.scale)
                } else {
                    Affine::new([-self.scale, 0.0, 0.0, self.scale, 0.0, 1536.0])
                };
                let d = node.attribute("d").ok_or("missing 'd' attribute")?;
                let bp = BezPath::from_svg(d)?;
                let path = transform * bp;
                // TODO: per the SVG spec the default fill is black, but a path
                // is only filled here when it has an explicit `fill` attribute
                // (the tiger example relies on this behavior).
                if let Some(fill_color) = node.attribute("fill") {
                    if fill_color != "none" {
                        let color = parse_color(fill_color);
                        let color = modify_opacity(color, "fill-opacity", node);
                        self.items.push(Item::Fill(FillItem {
                            color,
                            path: path.clone(),
                        }));
                    }
                }
                if let Some(stroke_color) = node.attribute("stroke") {
                    if stroke_color != "none" {
                        // SVG defines the initial `stroke-width` as 1, so a
                        // missing attribute falls back to that instead of
                        // aborting the whole document.
                        let raw_width = match node.attribute("stroke-width") {
                            Some(text) => f64::from_str(text)?,
                            None => 1.0,
                        };
                        let width = self.scale.abs() * raw_width;
                        let color = parse_color(stroke_color);
                        let color = modify_opacity(color, "stroke-opacity", node);
                        self.items
                            .push(Item::Stroke(StrokeItem { width, color, path }));
                    }
                }
            }
            _ => (),
        }
        Ok(())
    }
 }
 fn parse_color(color: &str) -> Color {
    if color.as_bytes()[0] == b'#' {
        let mut hex = u32::from_str_radix(&color[1..], 16).unwrap();
@ -74,7 +120,27 @@ fn parse_color(color: &str) -> Color {
            hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
        }
        Color::from_rgba32_u32((hex << 8) + 0xff)
    } else if color.starts_with("rgb(") {
        let mut iter = color[4..color.len() - 1].split(',');
        let r = u8::from_str(iter.next().unwrap()).unwrap();
        let g = u8::from_str(iter.next().unwrap()).unwrap();
        let b = u8::from_str(iter.next().unwrap()).unwrap();
        Color::rgb8(r, g, b)
    } else {
        Color::from_rgba32_u32(0xff00ff80)
    }
 }
 /// Returns `color` with its alpha replaced by the opacity stored in the
 /// `attr_name` attribute of `node`, or `color` unchanged when the attribute
 /// is absent.
 ///
 /// A value ending in `%` is read as a percentage (unparsable text counts as
 /// 100%); any other value is read as a plain number (unparsable text counts
 /// as 1.0).
 fn modify_opacity(color: Color, attr_name: &str, node: Node) -> Color {
    let opacity = match node.attribute(attr_name) {
        Some(value) => value,
        None => return color,
    };
    let alpha = if opacity.ends_with("%") {
        let digits = &opacity[..opacity.len() - 1];
        let pctg = digits.parse().unwrap_or(100.0);
        pctg * 0.01
    } else {
        opacity.parse().unwrap_or(1.0)
    };
    color.with_alpha(alpha)
 }
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@ -2,7 +2,11 @@ use std::borrow::Cow;
 use piet_gpu_types::encoder::{Encode, Encoder, Ref};
 use piet_gpu_types::scene;
-use piet_gpu_types::scene::{Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup};
+use piet_gpu_types::scene::{
    Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup,
 };
 use piet_gpu_types::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke};
 use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
@ -27,10 +31,10 @@ pub struct PietGpuText;
 pub struct PietGpuRenderContext {
    encoder: Encoder,
-    bboxes: Vec<Bbox>,
+    elements: Vec<Element>,
    items: Vec<PietItem>,
    // Will probably need direct access to the hal Device to create images, etc.
    inner_text: PietGpuText,
    stroke_width: f32,
 }
 #[derive(Clone)]
@ -43,47 +47,22 @@ const TOLERANCE: f64 = 0.25;
 impl PietGpuRenderContext {
    pub fn new() -> PietGpuRenderContext {
-        let mut encoder = Encoder::new();
+        let encoder = Encoder::new();
-        let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32);
+        let elements = Vec::new();
        let bboxes = Vec::new();
        let items = Vec::new();
        let inner_text = PietGpuText;
        let stroke_width = 0.0;
        PietGpuRenderContext {
            encoder,
-            bboxes,
+            elements,
            items,
            inner_text,
            stroke_width,
        }
    }
    pub fn get_scene_buf(&mut self) -> &[u8] {
-        let n_items = self.bboxes.len() as u32;
+        self.elements.encode(&mut self.encoder);
        let bboxes = self.bboxes.encode(&mut self.encoder).transmute();
        let items = self.items.encode(&mut self.encoder).transmute();
        let offset = scene::Point { xy: [0.0, 0.0] };
        let simple_group = SimpleGroup {
            n_items,
            bboxes,
            items,
            offset,
        };
        let root_item = PietItem::Group(simple_group);
        root_item.encode_to(&mut self.encoder.buf_mut()[0..PietItem::fixed_size()]);
        self.encoder.buf()
    }
    fn push_item(&mut self, item: PietItem, bbox: Rect) {
        let scene_bbox = Bbox {
            bbox: [
                bbox.x0.floor() as i16,
                bbox.y0.floor() as i16,
                bbox.x1.ceil() as i16,
                bbox.y1.ceil() as i16,
            ],
        };
        self.items.push(item);
        self.bboxes.push(scene_bbox);
    }
 }
 impl RenderContext for PietGpuRenderContext {
@ -107,20 +86,19 @@ impl RenderContext for PietGpuRenderContext {
    fn clear(&mut self, _color: Color) {}
    fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
-        let bbox = shape.bounding_box();
+        let width = width as f32;
-        let brush = brush.make_brush(self, || bbox).into_owned();
+        if self.stroke_width != width {
            self.elements
                .push(Element::SetLineWidth(SetLineWidth { width }));
            self.stroke_width = width;
        }
        let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
        let path = shape.to_bez_path(TOLERANCE);
-        let (n_points, points) = flatten_shape(&mut self.encoder, path);
+        self.encode_path(path, false);
        match brush {
            PietGpuBrush::Solid(rgba_color) => {
-                let poly_line = PietStrokePolyLine {
+                let stroke = Stroke { rgba_color };
-                    rgba_color,
+                self.elements.push(Element::Stroke(stroke));
                    width: width as f32,
                    n_points,
                    points,
                };
                let bbox = bbox.inset(-0.5 * width);
                self.push_item(PietItem::Poly(poly_line), bbox);
            }
            _ => (),
        }
@ -136,35 +114,13 @@ impl RenderContext for PietGpuRenderContext {
    }
    fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
        let bbox = shape.bounding_box();
        let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
        if let Some(circle) = shape.as_circle() {
            match brush {
                PietGpuBrush::Solid(rgba_color) => {
                    let piet_circle = PietCircle {
                        rgba_color,
                        center: to_scene_point(circle.center),
                        radius: circle.radius as f32,
                    };
                    let bbox = circle.bounding_box();
                    self.push_item(PietItem::Circle(piet_circle), bbox);
                }
                _ => {}
            }
            return;
        }
        let path = shape.to_bez_path(TOLERANCE);
-        let (n_points, points) = flatten_shape(&mut self.encoder, path);
+        self.encode_path(path, true);
        match brush {
            PietGpuBrush::Solid(rgba_color) => {
-                let fill = PietFill {
+                let fill = Fill { rgba_color };
-                    flags: 0,
+                self.elements.push(Element::Fill(fill));
                    rgba_color,
                    n_points,
                    points,
                };
                self.push_item(PietItem::Fill(fill), bbox);
            }
            _ => (),
        }
@ -241,45 +197,110 @@ impl RenderContext for PietGpuRenderContext {
    }
 }
-fn flatten_shape(
+impl PietGpuRenderContext {
-    encoder: &mut Encoder,
+    fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) {
-    path: impl Iterator<Item = PathEl>,
+        if is_fill {
-) -> (u32, Ref<scene::Point>) {
+            self.elements.push(Element::FillLine(seg));
-    let mut points = Vec::new();
+        } else {
-    let mut start_pt = None;
+            self.elements.push(Element::StrokeLine(seg));
-    let mut last_pt = None;
+        }
-    piet::kurbo::flatten(path, TOLERANCE, |el| {
+    }
-        match el {
+
-            PathEl::MoveTo(p) => {
+    fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
-                let scene_pt = to_scene_point(p);
+        let flatten = true;
-                start_pt = Some(clone_scene_pt(&scene_pt));
+        if flatten {
-                if !points.is_empty() {
+            let mut start_pt = None;
-                    points.push(scene::Point {
+            let mut last_pt = None;
-                        xy: [std::f32::NAN, std::f32::NAN],
+            piet::kurbo::flatten(path, TOLERANCE, |el| {
-                    });
+                match el {
                    PathEl::MoveTo(p) => {
                        let scene_pt = to_f32_2(p);
                        start_pt = Some(scene_pt);
                        last_pt = Some(scene_pt);
                    }
                    PathEl::LineTo(p) => {
                        let scene_pt = to_f32_2(p);
                        let seg = LineSeg {
                            p0: last_pt.unwrap(),
                            p1: scene_pt,
                        };
                        self.encode_line_seg(seg, is_fill);
                        last_pt = Some(scene_pt);
                    }
                    PathEl::ClosePath => {
                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
                            if last != start {
                                let seg = LineSeg {
                                    p0: last,
                                    p1: start,
                                };
                                self.encode_line_seg(seg, is_fill);
                            }
                        }
                    }
                    _ => (),
                }
-                last_pt = Some(clone_scene_pt(&scene_pt));
+                //println!("{:?}", el);
-                points.push(scene_pt);
+            });
-            }
+        } else {
-            PathEl::LineTo(p) => {
+            let mut start_pt = None;
-                let scene_pt = to_scene_point(p);
+            let mut last_pt = None;
-                last_pt = Some(clone_scene_pt(&scene_pt));
+            for el in path {
-                points.push(scene_pt);
+                match el {
-            }
+                    PathEl::MoveTo(p) => {
-            PathEl::ClosePath => {
+                        let scene_pt = to_f32_2(p);
-                if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
+                        start_pt = Some(scene_pt);
-                    if start.xy != last.xy {
+                        last_pt = Some(scene_pt);
-                        points.push(start);
+                    }
                    PathEl::LineTo(p) => {
                        let scene_pt = to_f32_2(p);
                        let seg = LineSeg {
                            p0: last_pt.unwrap(),
                            p1: scene_pt,
                        };
                        self.encode_line_seg(seg, is_fill);
                        last_pt = Some(scene_pt);
                    }
                    PathEl::QuadTo(p1, p2) => {
                        let scene_p1 = to_f32_2(p1);
                        let scene_p2 = to_f32_2(p2);
                        let seg = QuadSeg {
                            p0: last_pt.unwrap(),
                            p1: scene_p1,
                            p2: scene_p2,
                        };
                        self.elements.push(Element::Quad(seg));
                        last_pt = Some(scene_p2);
                    }
                    PathEl::CurveTo(p1, p2, p3) => {
                        let scene_p1 = to_f32_2(p1);
                        let scene_p2 = to_f32_2(p2);
                        let scene_p3 = to_f32_2(p3);
                        let seg = CubicSeg {
                            p0: last_pt.unwrap(),
                            p1: scene_p1,
                            p2: scene_p2,
                            p3: scene_p3,
                        };
                        self.elements.push(Element::Cubic(seg));
                        last_pt = Some(scene_p3);
                    }
                    PathEl::ClosePath => {
                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
                            if last != start {
                                let seg = LineSeg {
                                    p0: last,
                                    p1: start,
                                };
                                self.encode_line_seg(seg, is_fill);
                            }
                        }
                    }
                }
                //println!("{:?}", el);
            }
            _ => (),
        }
-        //println!("{:?}", el);
+    }
    });
    let n_points = points.len() as u32;
    let points_ref = points.encode(encoder).transmute();
    (n_points, points_ref)
 }
 impl Text for PietGpuText {
@ -360,13 +381,6 @@ impl IntoBrush<PietGpuRenderContext> for PietGpuBrush {
    }
 }
-fn to_scene_point(point: Point) -> scene::Point {
+fn to_f32_2(point: Point) -> [f32; 2] {
-    scene::Point {
+    [point.x as f32, point.y as f32]
        xy: [point.x as f32, point.y as f32],
    }
 }
 // TODO: allow #[derive(Clone)] in piet-gpu-derive.
 /// Returns a copy of `p`; stands in for `Clone`, which `scene::Point` does
 /// not derive.
 fn clone_scene_pt(p: &scene::Point) -> scene::Point {
    let xy = p.xy;
    scene::Point { xy }
 }