Merge pull request #20 from linebender/sorta

A sorta-middle architecture
Raph Levien 2020-06-13 13:40:48 -07:00 committed by GitHub
commit dc5facd198
33 changed files with 1368 additions and 337 deletions


@@ -3,9 +3,11 @@ use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod annotated {
+// Note: path segments have moved to pathseg, delete these.
struct AnnoFillLineSeg {
p0: [f32; 2],
p1: [f32; 2],
+path_ix: u32,
// A note: the layout of this struct is shared with
// AnnoStrokeLineSeg. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
@@ -13,6 +15,7 @@ piet_gpu! {
struct AnnoStrokeLineSeg {
p0: [f32; 2],
p1: [f32; 2],
+path_ix: u32,
// halfwidth in both x and y for binning
stroke: [f32; 2],
}
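
An aside for readers following the generated code: each new path_ix: u32 field adds four bytes to the packed GPU layout, which is exactly why the derived annotated.h further down moves from AnnoFillLineSeg_size 16 to 20 and AnnoStrokeLineSeg_size 24 to 28. A minimal Rust sketch of that arithmetic (illustration only, not code from this PR):

// Packed sizes of the annotated line-segment variants, matching the new
// #defines that piet-gpu-derive emits in annotated.h.
fn main() {
    let vec2 = 2 * 4; // [f32; 2]
    let anno_fill_line_seg = 2 * vec2 + 4; // p0, p1, path_ix
    let anno_stroke_line_seg = 2 * vec2 + 4 + vec2; // p0, p1, path_ix, stroke
    assert_eq!((anno_fill_line_seg, anno_stroke_line_seg), (20, 28));
    println!("AnnoFillLineSeg_size = {anno_fill_line_seg}, AnnoStrokeLineSeg_size = {anno_stroke_line_seg}");
}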


@@ -3,8 +3,10 @@
pub mod annotated;
pub mod bins;
pub mod encoder;
+pub mod pathseg;
pub mod ptcl;
pub mod scene;
pub mod state;
pub mod test;
+pub mod tile;
pub mod tilegroup;


@@ -7,7 +7,9 @@ fn main() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
"annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
+"pathseg" => print!("{}", piet_gpu_types::pathseg::gen_gpu_pathseg()),
"bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
+"tile" => print!("{}", piet_gpu_types::tile::gen_gpu_tile()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
"test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),


@@ -0,0 +1,67 @@
use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod pathseg {
struct PathFillLine {
p0: [f32; 2],
p1: [f32; 2],
path_ix: u32,
// A note: the layout of this struct is shared with
// PathStrokeLine. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
}
struct PathStrokeLine {
p0: [f32; 2],
p1: [f32; 2],
path_ix: u32,
// halfwidth in both x and y for binning
stroke: [f32; 2],
}
struct PathFillCubic {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
path_ix: u32,
// A note: the layout of this struct is shared with
// PathStrokeCubic. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
}
struct PathStrokeCubic {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
path_ix: u32,
// halfwidth in both x and y for binning
stroke: [f32; 2],
}
/*
struct PathQuad {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
stroke: [f32; 2],
}
struct PathCubic {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
stroke: [f32; 2],
}
*/
enum PathSeg {
Nop,
FillLine(PathFillLine),
StrokeLine(PathStrokeLine),
FillCubic(PathFillCubic),
StrokeCubic(PathStrokeCubic),
/*
Quad(AnnoQuadSeg),
Cubic(AnnoCubicSeg),
*/
}
}
}
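
For orientation (not part of the diff): piet_gpu! lowers each of these structs to the flat uint-buffer readers and writers shown later in pathseg.h, and the PathSeg enum is laid out as a 4-byte tag word followed by the largest variant. A hedged Rust sketch of that size calculation, mirroring the generated #defines:

// Sizes here mirror PathStrokeLine_size, PathStrokeCubic_size and PathSeg_size
// in the generated pathseg.h.
fn main() {
    let vec2 = 2 * 4; // [f32; 2]
    let path_stroke_line = 2 * vec2 + 4 + vec2; // p0, p1, path_ix, stroke
    let path_stroke_cubic = 4 * vec2 + 4 + vec2; // p0..p3, path_ix, stroke
    let path_seg = 4 + path_stroke_cubic; // tag word + largest variant
    assert_eq!((path_stroke_line, path_stroke_cubic, path_seg), (28, 44, 48));
}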


@@ -13,13 +13,15 @@ piet_gpu! {
end: [f32; 2],
}
struct CmdStroke {
-// Consider a specialization to one segment.
-seg_ref: Ref<SegChunk>,
+// This is really a Ref<Tile>, but we don't have cross-module
+// references.
+tile_ref: u32,
half_width: f32,
rgba_color: u32,
}
struct CmdFill {
-seg_ref: Ref<SegChunk>,
+// As above, really Ref<Tile>
+tile_ref: u32,
backdrop: i32,
rgba_color: u32,
}


@@ -92,10 +92,10 @@ piet_gpu! {
StrokeLine(LineSeg),
FillLine(LineSeg),
-// Note: we'll need to handle the stroke/fill distinction
-// for these as well, when we do flattening on the GPU.
-Quad(QuadSeg),
-Cubic(CubicSeg),
+StrokeQuad(QuadSeg),
+FillQuad(QuadSeg),
+StrokeCubic(CubicSeg),
+FillCubic(CubicSeg),
Stroke(Stroke),
Fill(Fill),
SetLineWidth(SetLineWidth),


@@ -9,6 +9,8 @@ piet_gpu! {
bbox: [f32; 4],
linewidth: f32,
flags: u32,
+path_count: u32,
+pathseg_count: u32,
}
}
}
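
These two counters turn the element scan into an allocator: summing path_count and pathseg_count across elements (as combine_state does below in elements.comp) gives every path and path segment the slot it writes in the annotated and pathseg buffers, and the totals are what the CLI passes to Renderer::new as n_paths and n_pathseg. A small Rust sketch of the counting part of that monoid (illustration only, simplified to just the two fields):

#[derive(Clone, Copy, Default, Debug, PartialEq)]
struct Counts {
    path_count: u32,
    pathseg_count: u32,
}

// Counts simply add under combine, so a prefix sum over the scene yields
// per-element output indices and the final total is the buffer size.
fn combine(a: Counts, b: Counts) -> Counts {
    Counts {
        path_count: a.path_count + b.path_count,
        pathseg_count: a.pathseg_count + b.pathseg_count,
    }
}

fn main() {
    // Hypothetical scene: SetLineWidth, two StrokeLine segments, then Stroke.
    let per_element = [
        Counts::default(),
        Counts { path_count: 0, pathseg_count: 1 },
        Counts { path_count: 0, pathseg_count: 1 },
        Counts { path_count: 1, pathseg_count: 0 },
    ];
    let total = per_element.iter().copied().fold(Counts::default(), combine);
    assert_eq!(total, Counts { path_count: 1, pathseg_count: 2 });
}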


@@ -0,0 +1,22 @@
use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod tile {
struct Path {
bbox: [u16; 4],
tiles: Ref<Tile>,
}
struct Tile {
tile: Ref<TileSeg>,
backdrop: i32,
}
// Segments within a tile are represented as a linked list.
struct TileSeg {
start: [f32; 2],
end: [f32; 2],
y_edge: f32,
next: Ref<TileSeg>,
}
}
}
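
Each Path covers a bbox-sized grid of Tile slots, and each Tile heads a linked list of TileSeg records that path_coarse.comp builds and kernel4.comp walks per pixel; offset 0 terminates the list. A minimal CPU-side sketch of that traversal, using hypothetical Rust mirrors of the structs above (illustration only):

use std::collections::HashMap;

// Hypothetical mirror of TileSeg; `next` is the byte offset of the next
// segment in the tile's list, with 0 meaning end of list.
struct TileSeg {
    start: [f32; 2],
    end: [f32; 2],
    y_edge: f32,
    next: u32,
}

// Walk one tile's segment list the way kernel4 does, here just counting.
fn walk_tile(segs: &HashMap<u32, TileSeg>, mut offset: u32) -> usize {
    let mut count = 0;
    while offset != 0 {
        count += 1;
        offset = segs[&offset].next;
    }
    count
}

fn main() {
    let mut segs = HashMap::new();
    segs.insert(16, TileSeg { start: [0.0, 0.0], end: [8.0, 8.0], y_edge: 1e9, next: 0 });
    segs.insert(48, TileSeg { start: [8.0, 8.0], end: [16.0, 0.0], y_edge: 1e9, next: 16 });
    // A Tile whose `tile` ref is byte offset 48 has two segments.
    assert_eq!(walk_tile(&segs, 48), 2);
}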


@@ -171,7 +171,7 @@ fn main() -> Result<(), Error> {
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
-let query_pool = device.create_query_pool(5)?;
+let query_pool = device.create_query_pool(8)?;
let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") {
@@ -185,10 +185,12 @@ fn main() -> Result<(), Error> {
} else {
render_scene(&mut ctx);
}
+let n_paths = ctx.path_count();
+let n_pathseg = ctx.pathseg_count();
let scene = ctx.get_scene_buf();
//dump_scene(&scene);
-let renderer = Renderer::new(&device, scene)?;
+let renderer = Renderer::new(&device, scene, n_paths, n_pathseg)?;
let image_buf =
device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
@@ -200,13 +202,16 @@ fn main() -> Result<(), Error> {
device.wait_and_reset(&[fence])?;
let ts = device.reap_query_pool(&query_pool).unwrap();
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
-println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
-println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
-println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
+println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
+println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
+println!("Backdrop kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
+println!("Binning kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
+println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
+println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
/*
let mut data: Vec<u32> = Default::default();
-device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data);
//trace_ptcl(&data);
*/


@@ -42,9 +42,11 @@ fn main() -> Result<(), Error> {
let mut ctx = PietGpuRenderContext::new();
render_scene(&mut ctx);
+let n_paths = ctx.path_count();
+let n_pathseg = ctx.pathseg_count();
let scene = ctx.get_scene_buf();
-let renderer = Renderer::new(&device, scene)?;
+let renderer = Renderer::new(&device, scene, n_paths, n_pathseg)?;
event_loop.run(move |event, _, control_flow| {
*control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event


@@ -31,9 +31,10 @@ struct AnnotatedRef {
struct AnnoFillLineSeg {
vec2 p0;
vec2 p1;
+uint path_ix;
};
-#define AnnoFillLineSeg_size 16
+#define AnnoFillLineSeg_size 20
AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size);
@@ -42,10 +43,11 @@ AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
struct AnnoStrokeLineSeg {
vec2 p0;
vec2 p1;
+uint path_ix;
vec2 stroke;
};
-#define AnnoStrokeLineSeg_size 24
+#define AnnoStrokeLineSeg_size 28
AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) {
return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size);
@@ -120,9 +122,11 @@ AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) {
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
+uint raw4 = annotated[ix + 4];
AnnoFillLineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+s.path_ix = raw4;
return s;
}
@@ -132,6 +136,7 @@ void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) {
annotated[ix + 1] = floatBitsToUint(s.p0.y);
annotated[ix + 2] = floatBitsToUint(s.p1.x);
annotated[ix + 3] = floatBitsToUint(s.p1.y);
+annotated[ix + 4] = s.path_ix;
}
AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
@@ -142,10 +147,12 @@ AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
uint raw5 = annotated[ix + 5];
+uint raw6 = annotated[ix + 6];
AnnoStrokeLineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
+s.path_ix = raw4;
+s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
return s;
}
@@ -155,8 +162,9 @@ void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) {
annotated[ix + 1] = floatBitsToUint(s.p0.y);
annotated[ix + 2] = floatBitsToUint(s.p1.x);
annotated[ix + 3] = floatBitsToUint(s.p1.y);
-annotated[ix + 4] = floatBitsToUint(s.stroke.x);
-annotated[ix + 5] = floatBitsToUint(s.stroke.y);
+annotated[ix + 4] = s.path_ix;
+annotated[ix + 5] = floatBitsToUint(s.stroke.x);
+annotated[ix + 6] = floatBitsToUint(s.stroke.y);
}
AnnoQuadSeg AnnoQuadSeg_read(AnnoQuadSegRef ref) {


@@ -0,0 +1,91 @@
// Propagation of tile backdrop for filling.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
#define LG_BACKDROP_WG 8
#define BACKDROP_WG (1 << LG_BACKDROP_WG)
layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
// This is really only used for n_elements; maybe we can handle that
// a different way, but it's convenient to have the same signature as
// tile allocation.
layout(set = 0, binding = 1) buffer AllocBuf {
uint n_elements;
uint n_pathseg;
uint alloc;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
};
#include "annotated.h"
#include "tile.h"
shared uint sh_row_count[BACKDROP_WG];
shared uint sh_row_base[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG];
void main() {
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
uint row_count = 0;
if (element_ix < n_elements) {
uint tag = Annotated_tag(ref);
if (tag == Annotated_Fill) {
PathRef path_ref = PathRef(element_ix * Path_size);
Path path = Path_read(path_ref);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
if (row_count == 1) {
// Note: this can probably be expanded to width = 2 as
// long as it doesn't cross the left edge.
row_count = 0;
}
sh_row_base[th_ix] = (path.tiles.offset >> 2) + 1;
}
}
sh_row_count[th_ix] = row_count;
// Prefix sum of sh_row_count
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
barrier();
if (th_ix >= (1 << i)) {
row_count += sh_row_count[th_ix - (1 << i)];
}
barrier();
sh_row_count[th_ix] = row_count;
}
barrier();
uint total_rows = sh_row_count[BACKDROP_WG - 1];
for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
// Binary search to find element
uint el_ix = 0;
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
uint probe = el_ix + ((BACKDROP_WG / 2) >> i);
if (row >= sh_row_count[probe - 1]) {
el_ix = probe;
}
}
uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
uint width = sh_row_width[el_ix];
// Process one row sequentially
uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width;
uint sum = tile[tile_el_ix];
for (uint x = 1; x < width; x++) {
tile_el_ix += 2;
sum += tile[tile_el_ix];
tile[tile_el_ix] = sum;
}
}
}
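
The shader above uses a load-balancing pattern that recurs throughout this PR: each thread publishes a per-element work count (here, rows of a fill's tile bbox), the workgroup takes an inclusive prefix sum, and threads then claim flat work indices and map them back to an element with a binary search over the sums, so one large path cannot serialize a single thread. A sequential Rust sketch of that partitioning logic (illustration only, not the shader):

// Inclusive prefix sum of per-element work counts, then map each flat work
// index back to (element, index within element) by binary search, as
// backdrop.comp and the new coarse.comp do across a workgroup.
fn partition_work(counts: &[u32]) -> Vec<(usize, u32)> {
    let mut prefix = Vec::with_capacity(counts.len());
    let mut total = 0u32;
    for &c in counts {
        total += c;
        prefix.push(total);
    }
    (0..total)
        .map(|ix| {
            // CPU analogue of the shader's manual binary search loop.
            let el = prefix.partition_point(|&p| p <= ix);
            let start = if el == 0 { 0 } else { prefix[el - 1] };
            (el, ix - start)
        })
        .collect()
}

fn main() {
    // Three fill paths spanning 2, 0 and 3 tile rows respectively.
    assert_eq!(
        partition_work(&[2, 0, 3]),
        vec![(0, 0), (0, 1), (2, 0), (2, 1), (2, 2)]
    );
}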



@@ -14,6 +14,12 @@ build elements.spv: glsl elements.comp | scene.h state.h annotated.h
build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
+build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
+build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
+build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h


@@ -15,17 +15,22 @@ layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
-layout(set = 0, binding = 2) buffer AllocBuf {
+layout(set = 0, binding = 2) buffer TileBuf {
+uint[] tile;
+};
+layout(set = 0, binding = 3) buffer AllocBuf {
uint n_elements;
uint alloc;
};
-layout(set = 0, binding = 3) buffer PtclBuf {
+layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
+#include "tile.h"
#include "ptcl.h"
#define LG_N_PART_READ 8
@@ -39,16 +44,16 @@ shared uint sh_part_count[N_PART_READ];
shared uint sh_part_elements[N_PART_READ];
shared uint sh_bitmaps[N_SLICE][N_TILE];
-shared uint sh_backdrop[N_SLICE][N_TILE];
-shared uint sh_bd_sign[N_SLICE];
-shared uint sh_is_segment[N_SLICE];
-// Shared state for parallel segment output stage
-// Count of total number of segments in each tile, then
-// inclusive prefix sum of same.
-shared uint sh_seg_count[N_TILE];
-shared uint sh_seg_alloc;
+shared uint sh_tile_count[N_TILE];
+// The width of the tile rect for the element, intersected with this bin
+shared uint sh_tile_width[N_TILE];
+shared uint sh_tile_x0[N_TILE];
+shared uint sh_tile_y0[N_TILE];
+// These are set up so base + tile_y * stride + tile_x points to a Tile.
+shared uint sh_tile_base[N_TILE];
+shared uint sh_tile_stride[N_TILE];
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
@@ -65,30 +70,6 @@ void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
}
}
-#define CHUNK_ALLOC_SLAB 16
-uint alloc_chunk_remaining;
-uint alloc_chunk_offset;
-SegChunkRef alloc_seg_chunk() {
-if (alloc_chunk_remaining == 0) {
-alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size);
-alloc_chunk_remaining = CHUNK_ALLOC_SLAB;
-}
-uint offset = alloc_chunk_offset;
-alloc_chunk_offset += SegChunk_size;
-alloc_chunk_remaining--;
-return SegChunkRef(offset);
-}
-// Accumulate delta to backdrop.
-//
-// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each
-// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1.
-int count_backdrop(uint bd_bitmap, uint bd_sign) {
-return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
-}
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
@@ -99,19 +80,15 @@ void main() {
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
uint th_ix = gl_LocalInvocationID.x;
-uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
-uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
-uint this_tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
+// Coordinates of top left of bin, in tiles.
+uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
+uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
+uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
+uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
+uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x;
CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
-// Allocation and management of segment output
-SegChunkRef first_seg_chunk = SegChunkRef(0);
-SegChunkRef last_chunk_ref = SegChunkRef(0);
-uint last_chunk_n = 0;
-SegmentRef last_chunk_segs = SegmentRef(0);
-alloc_chunk_remaining = 0;
// I'm sure we can figure out how to do this with at least one fewer register...
// Items up to rd_ix have been read from sh_elements
uint rd_ix = 0;
@@ -120,17 +97,10 @@ void main() {
// Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
uint part_start_ix = 0;
uint ready_ix = 0;
-if (th_ix < N_SLICE) {
-sh_bd_sign[th_ix] = 0;
-}
int backdrop = 0;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
-sh_backdrop[i][th_ix] = 0;
-}
-if (th_ix < N_SLICE) {
-sh_is_segment[th_ix] = 0;
}
// parallel read of input partitions
@@ -188,103 +158,87 @@ void main() {
// Read one element, compute coverage.
uint tag = Annotated_Nop;
+uint element_ix;
AnnotatedRef ref;
float right_edge = 0.0;
if (th_ix + rd_ix < wr_ix) {
-uint element_ix = sh_elements[th_ix];
+element_ix = sh_elements[th_ix];
right_edge = sh_right_edge[th_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
}
-// Setup for coverage algorithm.
-float a, b, c;
// Bounding box of element in pixel coordinates.
-float xmin, xmax, ymin, ymax;
-uint my_slice = th_ix / 32;
-uint my_mask = 1 << (th_ix & 31);
+uint tile_count;
switch (tag) {
-case Annotated_FillLine:
-case Annotated_StrokeLine:
-AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
-xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
-xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
-ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
-ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
-float dx = line.p1.x - line.p0.x;
-float dy = line.p1.y - line.p0.y;
-if (tag == Annotated_FillLine) {
-// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
-if (dy < 0) {
-atomicOr(sh_bd_sign[my_slice], my_mask);
-} else {
-atomicAnd(sh_bd_sign[my_slice], ~my_mask);
-}
-}
-atomicOr(sh_is_segment[my_slice], my_mask);
-// Set up for per-scanline coverage formula, below.
-float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
-c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
-b = invslope; // Note: assumes square tiles, otherwise scale.
-a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
-break;
case Annotated_Fill:
case Annotated_Stroke:
-// Note: we take advantage of the fact that fills and strokes
-// have compatible layout.
-AnnoFill fill = Annotated_Fill_read(ref);
-xmin = fill.bbox.x;
-xmax = fill.bbox.z;
-ymin = fill.bbox.y;
-ymax = fill.bbox.w;
-// Just let the clamping to xmin and xmax determine the bounds.
-a = 0.0;
-b = 0.0;
-c = 1e9;
+// Because the only elements we're processing right now are
+// paths, we can just use the element index as the path index.
+// In future, when we're doing a bunch of stuff, the path index
+// should probably be stored in the annotated element.
+uint path_ix = element_ix;
+Path path = Path_read(PathRef(path_ix * Path_size));
+uint stride = path.bbox.z - path.bbox.x;
+sh_tile_stride[th_ix] = stride;
+int dx = int(path.bbox.x) - int(bin_tile_x);
+int dy = int(path.bbox.y) - int(bin_tile_y);
+int x0 = clamp(dx, 0, N_TILE_X);
+int y0 = clamp(dy, 0, N_TILE_Y);
+int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
+int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
+sh_tile_width[th_ix] = uint(x1 - x0);
+sh_tile_x0[th_ix] = x0;
+sh_tile_y0[th_ix] = y0;
+tile_count = uint(x1 - x0) * uint(y1 - y0);
+// base relative to bin
+uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
+sh_tile_base[th_ix] = base;
break;
default:
-ymin = 0;
-ymax = 0;
+tile_count = 0;
break;
}
-// Draw the coverage area into the bitmasks. This uses an algorithm
-// that computes the coverage of a span for given scanline.
-// Compute bounding box in tiles and clip to this bin.
-int x0 = int(floor((xmin - xy0.x) * SX));
-int x1 = int(ceil((xmax - xy0.x) * SX));
-int xr = int(ceil((right_edge - xy0.x) * SX));
-int y0 = int(floor((ymin - xy0.y) * SY));
-int y1 = int(ceil((ymax - xy0.y) * SY));
-x0 = clamp(x0, 0, N_TILE_X);
-x1 = clamp(x1, x0, N_TILE_X);
-xr = clamp(xr, 0, N_TILE_X);
-y0 = clamp(y0, 0, N_TILE_Y);
-y1 = clamp(y1, y0, N_TILE_Y);
-float t = a + b * float(y0);
-for (uint y = y0; y < y1; y++) {
-uint xx0 = clamp(int(floor(t - c)), x0, x1);
-uint xx1 = clamp(int(ceil(t + c)), x0, x1);
-for (uint x = xx0; x < xx1; x++) {
-atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
-}
-if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
-// Assign backdrop to all tiles to the right of the ray crossing the
-// top edge of this tile, up to the right edge of the fill bbox.
-float xray = t - 0.5 * b;
-xx0 = max(int(ceil(xray)), 0);
-for (uint x = xx0; x < xr; x++) {
-atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
-}
-}
-t += b;
-}
+// Prefix sum of sh_tile_count
+sh_tile_count[th_ix] = tile_count;
+for (uint i = 0; i < LG_N_TILE; i++) {
+barrier();
+if (th_ix >= (1 << i)) {
+tile_count += sh_tile_count[th_ix - (1 << i)];
+}
+barrier();
+sh_tile_count[th_ix] = tile_count;
+}
+barrier();
+uint total_tile_count = sh_tile_count[N_TILE - 1];
+for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
+// Binary search to find element
+uint el_ix = 0;
+for (uint i = 0; i < LG_N_TILE; i++) {
+uint probe = el_ix + ((N_TILE / 2) >> i);
+if (ix >= sh_tile_count[probe - 1]) {
+el_ix = probe;
+}
+}
+uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
+uint width = sh_tile_width[el_ix];
+uint x = sh_tile_x0[el_ix] + seq_ix % width;
+uint y = sh_tile_y0[el_ix] + seq_ix / width;
+Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+if (tile.tile.offset != 0 || tile.backdrop != 0) {
+uint el_slice = el_ix / 32;
+uint el_mask = 1 << (el_ix & 31);
+atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
+}
+}
barrier();
// We've computed coverage and other info for each element in the input, now for
// the output stage. We'll do segments first using a more parallel algorithm.
+/*
uint seg_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
@@ -372,45 +326,29 @@ void main() {
Segment seg = Segment(line.p0, line.p1, y_edge);
Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
}
+*/
// Output non-segment elements for this tile. The thread does a sequential walk
// through the non-segment elements, and for segments, count and backdrop are
// aggregated using bit counting.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
-uint bd_bitmap = sh_backdrop[0][th_ix];
-uint bd_sign = sh_bd_sign[0];
-uint is_segment = sh_is_segment[0];
-uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
-seg_count = 0;
while (true) {
-uint nonseg_bitmap = bitmap & ~is_segment;
-if (nonseg_bitmap == 0) {
-backdrop += count_backdrop(bd_bitmap, bd_sign);
-seg_count += bitCount(bitmap & is_segment);
+if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
-bd_bitmap = sh_backdrop[slice_ix][th_ix];
-bd_sign = sh_bd_sign[slice_ix];
-is_segment = sh_is_segment[slice_ix];
-nonseg_bitmap = bitmap & ~is_segment;
-if (nonseg_bitmap == 0) {
+if (bitmap == 0) {
continue;
}
}
-uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
+uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[element_ref_ix];
-// Bits up to and including the lsb
-uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
-backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
-seg_count += bitCount(bitmap & bd_mask & is_segment);
-// Clear bits that have been consumed.
-bd_bitmap &= ~bd_mask;
-bitmap &= ~bd_mask;
+// Clear LSB
+bitmap &= bitmap - 1;
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
@@ -420,103 +358,36 @@ void main() {
switch (tag) {
case Annotated_Fill:
-if (last_chunk_n > 0 || seg_count > 0) {
-SegChunkRef chunk_ref = SegChunkRef(0);
-if (seg_count > 0) {
-chunk_ref = alloc_seg_chunk();
-SegChunk chunk;
-chunk.n = seg_count;
-chunk.next = SegChunkRef(0);
-uint seg_offset = seg_alloc + seg_start * Segment_size;
-chunk.segs = SegmentRef(seg_offset);
-SegChunk_write(chunk_ref, chunk);
-}
-if (last_chunk_n > 0) {
-SegChunk chunk;
-chunk.n = last_chunk_n;
-chunk.next = chunk_ref;
-chunk.segs = last_chunk_segs;
-SegChunk_write(last_chunk_ref, chunk);
-} else {
-first_seg_chunk = chunk_ref;
-}
-AnnoFill fill = Annotated_Fill_read(ref);
-CmdFill cmd_fill;
-cmd_fill.seg_ref = first_seg_chunk;
-cmd_fill.backdrop = backdrop;
-cmd_fill.rgba_color = fill.rgba_color;
-alloc_cmd(cmd_ref, cmd_limit);
-Cmd_Fill_write(cmd_ref, cmd_fill);
-cmd_ref.offset += Cmd_size;
-last_chunk_n = 0;
-} else if (backdrop != 0) {
-AnnoFill fill = Annotated_Fill_read(ref);
-alloc_cmd(cmd_ref, cmd_limit);
-Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
-cmd_ref.offset += Cmd_size;
-}
-seg_start += seg_count;
-seg_count = 0;
-backdrop = 0;
+Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+AnnoFill fill = Annotated_Fill_read(ref);
+alloc_cmd(cmd_ref, cmd_limit);
+if (tile.tile.offset != 0) {
+CmdFill cmd_fill;
+cmd_fill.tile_ref = tile.tile.offset;
+cmd_fill.backdrop = tile.backdrop;
+cmd_fill.rgba_color = fill.rgba_color;
+Cmd_Fill_write(cmd_ref, cmd_fill);
+} else {
+AnnoFill fill = Annotated_Fill_read(ref);
+Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
+}
+cmd_ref.offset += Cmd_size;
break;
case Annotated_Stroke:
-// TODO: reduce divergence & code duplication? Much of the
-// fill and stroke processing is in common.
-if (last_chunk_n > 0 || seg_count > 0) {
-SegChunkRef chunk_ref = SegChunkRef(0);
-if (seg_count > 0) {
-chunk_ref = alloc_seg_chunk();
-SegChunk chunk;
-chunk.n = seg_count;
-chunk.next = SegChunkRef(0);
-uint seg_offset = seg_alloc + seg_start * Segment_size;
-chunk.segs = SegmentRef(seg_offset);
-SegChunk_write(chunk_ref, chunk);
-}
-if (last_chunk_n > 0) {
-SegChunk chunk;
-chunk.n = last_chunk_n;
-chunk.next = chunk_ref;
-chunk.segs = last_chunk_segs;
-SegChunk_write(last_chunk_ref, chunk);
-} else {
-first_seg_chunk = chunk_ref;
-}
-AnnoStroke stroke = Annotated_Stroke_read(ref);
-CmdStroke cmd_stroke;
-cmd_stroke.seg_ref = first_seg_chunk;
-cmd_stroke.half_width = 0.5 * stroke.linewidth;
-cmd_stroke.rgba_color = stroke.rgba_color;
-alloc_cmd(cmd_ref, cmd_limit);
-Cmd_Stroke_write(cmd_ref, cmd_stroke);
-cmd_ref.offset += Cmd_size;
-last_chunk_n = 0;
-}
-seg_start += seg_count;
-seg_count = 0;
+tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+AnnoStroke stroke = Annotated_Stroke_read(ref);
+CmdStroke cmd_stroke;
+cmd_stroke.tile_ref = tile.tile.offset;
+cmd_stroke.half_width = 0.5 * stroke.linewidth;
+cmd_stroke.rgba_color = stroke.rgba_color;
+alloc_cmd(cmd_ref, cmd_limit);
+Cmd_Stroke_write(cmd_ref, cmd_stroke);
+cmd_ref.offset += Cmd_size;
break;
-default:
-// This shouldn't happen, but just in case.
-seg_start++;
-break;
}
}
-if (seg_count > 0) {
-SegChunkRef chunk_ref = alloc_seg_chunk();
-if (last_chunk_n > 0) {
-SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
-} else {
-first_seg_chunk = chunk_ref;
-}
-// TODO: free two registers by writing count and segments ref now,
-// as opposed to deferring SegChunk write until all fields are known.
-last_chunk_ref = chunk_ref;
-last_chunk_n = seg_count;
-uint seg_offset = seg_alloc + seg_start * Segment_size;
-last_chunk_segs = SegmentRef(seg_offset);
-}
barrier();
rd_ix += N_TILE;
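
With per-tile segment lists and backdrops already computed by path_coarse.comp and backdrop.comp, the fill case above reduces to a per-tile decision. A hedged CPU-side sketch of that decision, with hypothetical Rust mirrors of the CmdFill/CmdSolid commands from ptcl.rs (illustration only):

// Hypothetical mirror of the two PTCL fill commands.
enum Cmd {
    Fill { tile_ref: u32, backdrop: i32, rgba_color: u32 },
    Solid { rgba_color: u32 },
}

// Tiles that own segments get a CmdFill pointing at the tile's segment list;
// tiles with no segments only reach this point when their backdrop is
// non-zero (the earlier sh_bitmaps test), so a solid color suffices.
fn fill_cmd(tile_ref: u32, backdrop: i32, rgba_color: u32) -> Cmd {
    if tile_ref != 0 {
        Cmd::Fill { tile_ref, backdrop, rgba_color }
    } else {
        Cmd::Solid { rgba_color }
    }
}

fn main() {
    match fill_cmd(0, 1, 0xff00_00ff) {
        Cmd::Fill { .. } => println!("emit CmdFill with segment list"),
        Cmd::Solid { .. } => println!("emit CmdSolid"),
    }
}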


@@ -30,9 +30,15 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
uint[] annotated;
};
+// Path segments are stored here.
+layout(set = 0, binding = 3) buffer PathSegBuf {
+uint[] pathseg;
+};
#include "scene.h"
#include "state.h"
#include "annotated.h"
+#include "pathseg.h"
#define StateBuf_stride (8 + 2 * State_size)
@@ -83,6 +89,8 @@ State combine_state(State a, State b) {
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
+c.path_count = a.path_count + b.path_count;
+c.pathseg_count = a.pathseg_count + b.pathseg_count;
return c;
}
@@ -96,6 +104,8 @@ State map_element(ElementRef ref, inout bool is_fill) {
c.translate = vec2(0.0, 0.0);
c.linewidth = 1.0; // TODO should be 0.0
c.flags = 0;
+c.path_count = 0;
+c.pathseg_count = 0;
is_fill = false;
switch (tag) {
case Element_FillLine:
@@ -103,22 +113,28 @@ State map_element(ElementRef ref, inout bool is_fill) {
LineSeg line = Element_FillLine_read(ref);
c.bbox.xy = min(line.p0, line.p1);
c.bbox.zw = max(line.p0, line.p1);
+c.pathseg_count = 1;
break;
-case Element_Quad:
-QuadSeg quad = Element_Quad_read(ref);
+case Element_FillQuad:
+case Element_StrokeQuad:
+QuadSeg quad = Element_FillQuad_read(ref);
c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
+c.pathseg_count = 1;
break;
-case Element_Cubic:
-CubicSeg cubic = Element_Cubic_read(ref);
+case Element_FillCubic:
+case Element_StrokeCubic:
+CubicSeg cubic = Element_FillCubic_read(ref);
c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
+c.pathseg_count = 1;
break;
case Element_Fill:
is_fill = true;
// fall-through
case Element_Stroke:
c.flags = FLAG_RESET_BBOX;
+c.path_count = 1;
break;
case Element_SetLineWidth:
SetLineWidth lw = Element_SetLineWidth_read(ref);
@@ -148,6 +164,8 @@ shared vec2 sh_translate[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
+shared uint sh_path_count[WG_SIZE];
+shared uint sh_pathseg_count[WG_SIZE];
shared uint sh_min_fill;
@@ -187,6 +205,8 @@ void main() {
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
+sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
+sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1 << i)) {
@@ -197,6 +217,8 @@ void main() {
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
+other.path_count = sh_path_count[ix];
+other.pathseg_count = sh_pathseg_count[ix];
agg = combine_state(other, agg);
}
barrier();
@@ -205,6 +227,8 @@ void main() {
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
+sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
+sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
}
State exclusive;
@@ -213,6 +237,8 @@ void main() {
exclusive.translate = vec2(0.0, 0.0);
exclusive.linewidth = 1.0; //TODO should be 0.0
exclusive.flags = 0;
+exclusive.path_count = 0;
+exclusive.pathseg_count = 0;
// Publish aggregate for this partition
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
@@ -266,6 +292,8 @@ void main() {
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
+other.path_count = sh_path_count[ix];
+other.pathseg_count = sh_pathseg_count[ix];
row = combine_state(row, other);
}
if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
@@ -284,25 +312,75 @@ void main() {
// gains to be had from stashing in shared memory or possibly
// registers (though register pressure is an issue).
ElementRef this_ref = Element_index(ref, i);
-AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size);
uint tag = Element_tag(this_ref);
switch (tag) {
case Element_FillLine:
case Element_StrokeLine:
LineSeg line = Element_StrokeLine_read(this_ref);
-AnnoStrokeLineSeg anno_line;
-anno_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
-anno_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
-if (tag == Element_StrokeLine) {
-anno_line.stroke = get_linewidth(st);
-} else {
-anno_line.stroke = vec2(0.0);
-}
-// We do encoding a bit by hand to minimize divergence. Another approach
-// would be to have a fill/stroke bool.
-uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine;
-annotated[out_ref.offset >> 2] = out_tag;
-AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line);
+vec2 p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
+vec2 p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
+PathStrokeCubic path_cubic;
+path_cubic.p0 = p0;
+path_cubic.p1 = mix(p0, p1, 1.0 / 3.0);
+path_cubic.p2 = mix(p1, p0, 1.0 / 3.0);
+path_cubic.p3 = p1;
+path_cubic.path_ix = st.path_count;
+if (tag == Element_StrokeLine) {
+path_cubic.stroke = get_linewidth(st);
+} else {
+path_cubic.stroke = vec2(0.0);
+}
+// We do encoding a bit by hand to minimize divergence. Another approach
+// would be to have a fill/stroke bool.
+PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
+pathseg[path_out_ref.offset >> 2] = out_tag;
+PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+break;
+case Element_FillQuad:
+case Element_StrokeQuad:
+QuadSeg quad = Element_StrokeQuad_read(this_ref);
+p0 = st.mat.xy * quad.p0.x + st.mat.zw * quad.p0.y + st.translate;
+p1 = st.mat.xy * quad.p1.x + st.mat.zw * quad.p1.y + st.translate;
+vec2 p2 = st.mat.xy * quad.p2.x + st.mat.zw * quad.p2.y + st.translate;
+path_cubic;
+path_cubic.p0 = p0;
+path_cubic.p1 = mix(p1, p0, 1.0 / 3.0);
+path_cubic.p2 = mix(p1, p2, 1.0 / 3.0);
+path_cubic.p3 = p2;
+path_cubic.path_ix = st.path_count;
+if (tag == Element_StrokeQuad) {
+path_cubic.stroke = get_linewidth(st);
+} else {
+path_cubic.stroke = vec2(0.0);
+}
+// We do encoding a bit by hand to minimize divergence. Another approach
+// would be to have a fill/stroke bool.
+path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
+pathseg[path_out_ref.offset >> 2] = out_tag;
+PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+break;
+case Element_FillCubic:
+case Element_StrokeCubic:
+CubicSeg cubic = Element_StrokeCubic_read(this_ref);
+path_cubic;
+path_cubic.p0 = st.mat.xy * cubic.p0.x + st.mat.zw * cubic.p0.y + st.translate;
+path_cubic.p1 = st.mat.xy * cubic.p1.x + st.mat.zw * cubic.p1.y + st.translate;
+path_cubic.p2 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate;
+path_cubic.p3 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate;
+path_cubic.path_ix = st.path_count;
+if (tag == Element_StrokeCubic) {
+path_cubic.stroke = get_linewidth(st);
+} else {
+path_cubic.stroke = vec2(0.0);
+}
+// We do encoding a bit by hand to minimize divergence. Another approach
+// would be to have a fill/stroke bool.
+path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
+pathseg[path_out_ref.offset >> 2] = out_tag;
+PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
break;
case Element_Stroke:
Stroke stroke = Element_Stroke_read(this_ref);
@@ -311,6 +389,7 @@ void main() {
vec2 lw = get_linewidth(st);
anno_stroke.bbox = st.bbox + vec4(-lw, lw);
anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z);
+AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_Stroke_write(out_ref, anno_stroke);
break;
case Element_Fill:
@@ -318,11 +397,9 @@ void main() {
AnnoFill anno_fill;
anno_fill.rgba_color = fill.rgba_color;
anno_fill.bbox = st.bbox;
+out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
Annotated_Fill_write(out_ref, anno_fill);
break;
-default:
-Annotated_Nop_write(out_ref);
-break;
}
}
}
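
Note that lines and quadratics no longer have their own annotated segment kinds: the elements kernel degree-elevates both to cubics (the mix(..., 1.0 / 3.0) control points above), so path_coarse.comp only ever has to flatten cubics. A small Rust sketch of that elevation for reference (illustration only, not code from this PR):

type Vec2 = [f32; 2];

fn lerp(a: Vec2, b: Vec2, t: f32) -> Vec2 {
    [a[0] + (b[0] - a[0]) * t, a[1] + (b[1] - a[1]) * t]
}

// A line p0..p1 as a cubic: control points at 1/3 and 2/3 along the segment,
// matching path_cubic.p1 = mix(p0, p1, 1/3) and p2 = mix(p1, p0, 1/3) above.
fn line_to_cubic(p0: Vec2, p1: Vec2) -> [Vec2; 4] {
    [p0, lerp(p0, p1, 1.0 / 3.0), lerp(p1, p0, 1.0 / 3.0), p1]
}

// A quadratic p0, p1, p2 as a cubic: the inner control points sit two thirds
// of the way from each endpoint toward p1, i.e. mix(p1, p0, 1/3) and
// mix(p1, p2, 1/3) as in the quad case above.
fn quad_to_cubic(p0: Vec2, p1: Vec2, p2: Vec2) -> [Vec2; 4] {
    [p0, lerp(p1, p0, 1.0 / 3.0), lerp(p1, p2, 1.0 / 3.0), p2]
}

fn main() {
    println!("{:?}", line_to_cubic([0.0, 0.0], [3.0, 0.0]));
    println!("{:?}", quad_to_cubic([0.0, 0.0], [1.0, 2.0], [2.0, 0.0]));
}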


@@ -17,9 +17,14 @@ layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
};
-layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
+layout(set = 0, binding = 1) buffer TileBuf {
+uint[] tile;
+};
+layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
#include "ptcl.h"
+#include "tile.h"
#include "setup.h"
@@ -57,12 +62,9 @@
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
-SegChunkRef seg_chunk_ref = stroke.seg_ref;
+TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
-SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
-SegmentRef segs = seg_chunk.segs;
-for (int i = 0; i < seg_chunk.n; i++) {
-Segment seg = Segment_read(Segment_index(segs, i));
+TileSeg seg = TileSeg_read(tile_seg_ref);
vec2 line_vec = seg.end - seg.start;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
@@ -70,9 +72,8 @@
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
-}
-seg_chunk_ref = seg_chunk.next;
-} while (seg_chunk_ref.offset != 0);
+tile_seg_ref = seg.next;
+} while (tile_seg_ref.offset != 0);
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
@@ -84,12 +85,9 @@
// Probably better to store as float, but conversion is no doubt cheap.
float area[CHUNK];
for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
-SegChunkRef fill_seg_chunk_ref = fill.seg_ref;
+tile_seg_ref = TileSegRef(fill.tile_ref);
do {
-SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref);
-SegmentRef segs = seg_chunk.segs;
-for (int i = 0; i < seg_chunk.n; i++) {
-Segment seg = Segment_read(Segment_index(segs, i));
+TileSeg seg = TileSeg_read(tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
vec2 start = seg.start - my_xy;
@@ -108,9 +106,8 @@
}
area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
-}
-fill_seg_chunk_ref = seg_chunk.next;
-} while (fill_seg_chunk_ref.offset != 0);
+tile_seg_ref = seg.next;
+} while (tile_seg_ref.offset != 0);
fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
float alpha = min(abs(area[k]), 1.0);


@@ -0,0 +1,265 @@
// Coarse rasterization of path segments.
// Allocation and initialization of tiles for paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
#define LG_COARSE_WG 5
#define COARSE_WG (1 << LG_COARSE_WG)
layout(local_size_x = COARSE_WG, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer PathSegBuf {
uint[] pathseg;
};
layout(set = 0, binding = 1) buffer AllocBuf {
uint n_paths;
uint n_pathseg;
uint alloc;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
};
#include "pathseg.h"
#include "tile.h"
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
#define ACCURACY 0.25
#define Q_ACCURACY (ACCURACY * 0.1)
#define REM_ACCURACY (ACCURACY - Q_ACCURACY)
#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
float mt = 1.0 - t;
return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
}
vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
float mt = 1.0 - t;
return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
}
struct SubdivResult {
float val;
float a0;
float a2;
};
/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
///
/// This is used for flattening curves.
#define D 0.67
float approx_parabola_integral(float x) {
return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
}
/// An approximation to the inverse parabola integral.
#define B 0.39
float approx_parabola_inv_integral(float x) {
return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
}
SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
vec2 d01 = p1 - p0;
vec2 d12 = p2 - p1;
vec2 dd = d01 - d12;
float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
float scale = abs(cross / (length(dd) * (x2 - x0)));
float a0 = approx_parabola_integral(x0);
float a2 = approx_parabola_integral(x2);
float val = 0.0;
if (scale < 1e9) {
float da = abs(a2 - a0);
float sqrt_scale = sqrt(scale);
if (sign(x0) == sign(x2)) {
val = da * sqrt_scale;
} else {
float xmin = sqrt_tol / sqrt_scale;
val = sqrt_tol * da / approx_parabola_integral(xmin);
}
}
return SubdivResult(val, a0, a2);
}
void main() {
uint element_ix = gl_GlobalInvocationID.x;
PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
uint tag = PathSeg_Nop;
if (element_ix < n_pathseg) {
tag = PathSeg_tag(ref);
}
// Setup for coverage algorithm.
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
PathStrokeLine line;
float dx;
switch (tag) {
/*
case PathSeg_FillLine:
case PathSeg_StrokeLine:
line = PathSeg_StrokeLine_read(ref);
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
dx = line.p1.x - line.p0.x;
float dy = line.p1.y - line.p0.y;
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
break;
*/
case PathSeg_FillCubic:
case PathSeg_StrokeCubic:
PathStrokeCubic cubic = PathSeg_StrokeCubic_read(ref);
// Commented out code is for computing error bound on conversion to quadratics
vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
float err = err_v.x * err_v.x + err_v.y * err_v.y;
// The number of quadratics.
uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
// Iterate over quadratics and tote up the estimated number of segments.
float val = 0.0;
vec2 qp0 = cubic.p0;
float step = 1.0 / float(n_quads);
for (uint i = 0; i < n_quads; i++) {
float t = float(i + 1) * step;
vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
val += params.val;
qp0 = qp2;
}
uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);
uint path_ix = cubic.path_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
ivec4 bbox = ivec4(path.bbox);
vec2 p0 = cubic.p0;
qp0 = cubic.p0;
float v_step = val / float(n);
int n_out = 1;
float val_sum = 0.0;
for (uint i = 0; i < n_quads; i++) {
float t = float(i + 1) * step;
vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
float u0 = approx_parabola_inv_integral(params.a0);
float u2 = approx_parabola_inv_integral(params.a2);
float uscale = 1.0 / (u2 - u0);
float target = float(n_out) * v_step;
while (n_out == n || target < val_sum + params.val) {
vec2 p1;
if (n_out == n) {
p1 = cubic.p3;
} else {
float u = (target - val_sum) / params.val;
float a = mix(params.a0, params.a2, u);
float au = approx_parabola_inv_integral(a);
float t = (au - u0) * uscale;
p1 = eval_quad(qp0, qp1, qp2, t);
}
// Output line segment
xmin = min(p0.x, p1.x) - cubic.stroke.x;
xmax = max(p0.x, p1.x) + cubic.stroke.x;
ymin = min(p0.y, p1.y) - cubic.stroke.y;
ymax = max(p0.y, p1.y) + cubic.stroke.y;
float dx = p1.x - p0.x;
float dy = p1.y - p0.y;
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
int x0 = int(floor((xmin) * SX));
int x1 = int(ceil((xmax) * SX));
int y0 = int(floor((ymin) * SY));
int y1 = int(ceil((ymax) * SY));
x0 = clamp(x0, bbox.x, bbox.z);
y0 = clamp(y0, bbox.y, bbox.w);
x1 = clamp(x1, bbox.x, bbox.z);
y1 = clamp(y1, bbox.y, bbox.w);
float xc = a + b * float(y0);
int stride = bbox.z - bbox.x;
int base = (y0 - bbox.y) * stride - bbox.x;
// TODO: can be tighter, use c to bound width
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add.
uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
TileSeg tile_seg;
for (int y = y0; y < y1; y++) {
float tile_y0 = float(y * TILE_HEIGHT_PX);
if (tag == PathSeg_FillCubic && min(p0.y, p1.y) <= tile_y0) {
int xray = max(int(ceil(xc - 0.5 * b)), bbox.x);
if (xray < bbox.z) {
int backdrop = p1.y < p0.y ? 1 : -1;
TileRef tile_ref = Tile_index(path.tiles, uint(base + xray));
uint tile_el = tile_ref.offset >> 2;
atomicAdd(tile[tile_el + 1], backdrop);
}
}
int xx0 = clamp(int(floor(xc - c)), x0, x1);
int xx1 = clamp(int(ceil(xc + c)), x0, x1);
for (int x = xx0; x < xx1; x++) {
float tile_x0 = float(x * TILE_WIDTH_PX);
TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
uint tile_el = tile_ref.offset >> 2;
uint old = atomicExchange(tile[tile_el], tile_offset);
tile_seg.start = p0;
tile_seg.end = p1;
float y_edge = 0.0;
if (tag == PathSeg_FillCubic) {
y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
if (min(p0.x, p1.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) {
if (p0.x > p1.x) {
tile_seg.end = vec2(tile_x0, y_edge);
} else {
tile_seg.start = vec2(tile_x0, y_edge);
}
} else {
y_edge = 1e9;
}
}
tile_seg.y_edge = y_edge;
tile_seg.next.offset = old;
TileSeg_write(TileSegRef(tile_offset), tile_seg);
tile_offset += TileSeg_size;
}
xc += b;
base += stride;
}
n_out += 1;
target += v_step;
p0 = p1;
}
val_sum += params.val;
qp0 = qp2;
}
break;
}
}
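
The flattening above follows the parabola-integral scheme: each cubic is split into a few quadratics, an error integral estimates how many line segments each quadratic needs, and the inverse integral places the subdivision points. For readers who want to experiment on the CPU, here is a hedged Rust transcription of the two approximation helpers exactly as written in the shader above (constants D and B copied from it):

const D: f32 = 0.67;
const B: f32 = 0.39;

// x * inversesqrt(sqrt(1.0 - D + (D^4 + 0.25 x^2))) in the shader.
fn approx_parabola_integral(x: f32) -> f32 {
    x / (1.0 - D + (D * D * D * D + 0.25 * x * x)).sqrt().sqrt()
}

// x * sqrt(1.0 - B + (B^2 + 0.25 x^2)) in the shader.
fn approx_parabola_inv_integral(x: f32) -> f32 {
    x * (1.0 - B + (B * B + 0.25 * x * x)).sqrt()
}

fn main() {
    for x in [0.25_f32, 1.0, 4.0] {
        println!(
            "x = {x}: integral ~ {:.4}, inverse ~ {:.4}",
            approx_parabola_integral(x),
            approx_parabola_inv_integral(x)
        );
    }
}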


piet-gpu/shader/pathseg.h (new file, 253 lines)

@@ -0,0 +1,253 @@
// Code auto-generated by piet-gpu-derive
struct PathFillLineRef {
uint offset;
};
struct PathStrokeLineRef {
uint offset;
};
struct PathFillCubicRef {
uint offset;
};
struct PathStrokeCubicRef {
uint offset;
};
struct PathSegRef {
uint offset;
};
struct PathFillLine {
vec2 p0;
vec2 p1;
uint path_ix;
};
#define PathFillLine_size 20
PathFillLineRef PathFillLine_index(PathFillLineRef ref, uint index) {
return PathFillLineRef(ref.offset + index * PathFillLine_size);
}
struct PathStrokeLine {
vec2 p0;
vec2 p1;
uint path_ix;
vec2 stroke;
};
#define PathStrokeLine_size 28
PathStrokeLineRef PathStrokeLine_index(PathStrokeLineRef ref, uint index) {
return PathStrokeLineRef(ref.offset + index * PathStrokeLine_size);
}
struct PathFillCubic {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
uint path_ix;
};
#define PathFillCubic_size 36
PathFillCubicRef PathFillCubic_index(PathFillCubicRef ref, uint index) {
return PathFillCubicRef(ref.offset + index * PathFillCubic_size);
}
struct PathStrokeCubic {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
uint path_ix;
vec2 stroke;
};
#define PathStrokeCubic_size 44
PathStrokeCubicRef PathStrokeCubic_index(PathStrokeCubicRef ref, uint index) {
return PathStrokeCubicRef(ref.offset + index * PathStrokeCubic_size);
}
#define PathSeg_Nop 0
#define PathSeg_FillLine 1
#define PathSeg_StrokeLine 2
#define PathSeg_FillCubic 3
#define PathSeg_StrokeCubic 4
#define PathSeg_size 48
PathSegRef PathSeg_index(PathSegRef ref, uint index) {
return PathSegRef(ref.offset + index * PathSeg_size);
}
PathFillLine PathFillLine_read(PathFillLineRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
PathFillLine s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.path_ix = raw4;
return s;
}
void PathFillLine_write(PathFillLineRef ref, PathFillLine s) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = s.path_ix;
}
PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
uint raw5 = pathseg[ix + 5];
uint raw6 = pathseg[ix + 6];
PathStrokeLine s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.path_ix = raw4;
s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
return s;
}
void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = s.path_ix;
pathseg[ix + 5] = floatBitsToUint(s.stroke.x);
pathseg[ix + 6] = floatBitsToUint(s.stroke.y);
}
PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
uint raw5 = pathseg[ix + 5];
uint raw6 = pathseg[ix + 6];
uint raw7 = pathseg[ix + 7];
uint raw8 = pathseg[ix + 8];
PathFillCubic s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
s.path_ix = raw8;
return s;
}
void PathFillCubic_write(PathFillCubicRef ref, PathFillCubic s) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = floatBitsToUint(s.p2.x);
pathseg[ix + 5] = floatBitsToUint(s.p2.y);
pathseg[ix + 6] = floatBitsToUint(s.p3.x);
pathseg[ix + 7] = floatBitsToUint(s.p3.y);
pathseg[ix + 8] = s.path_ix;
}
PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = pathseg[ix + 0];
uint raw1 = pathseg[ix + 1];
uint raw2 = pathseg[ix + 2];
uint raw3 = pathseg[ix + 3];
uint raw4 = pathseg[ix + 4];
uint raw5 = pathseg[ix + 5];
uint raw6 = pathseg[ix + 6];
uint raw7 = pathseg[ix + 7];
uint raw8 = pathseg[ix + 8];
uint raw9 = pathseg[ix + 9];
uint raw10 = pathseg[ix + 10];
PathStrokeCubic s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
s.path_ix = raw8;
s.stroke = vec2(uintBitsToFloat(raw9), uintBitsToFloat(raw10));
return s;
}
void PathStrokeCubic_write(PathStrokeCubicRef ref, PathStrokeCubic s) {
uint ix = ref.offset >> 2;
pathseg[ix + 0] = floatBitsToUint(s.p0.x);
pathseg[ix + 1] = floatBitsToUint(s.p0.y);
pathseg[ix + 2] = floatBitsToUint(s.p1.x);
pathseg[ix + 3] = floatBitsToUint(s.p1.y);
pathseg[ix + 4] = floatBitsToUint(s.p2.x);
pathseg[ix + 5] = floatBitsToUint(s.p2.y);
pathseg[ix + 6] = floatBitsToUint(s.p3.x);
pathseg[ix + 7] = floatBitsToUint(s.p3.y);
pathseg[ix + 8] = s.path_ix;
pathseg[ix + 9] = floatBitsToUint(s.stroke.x);
pathseg[ix + 10] = floatBitsToUint(s.stroke.y);
}
uint PathSeg_tag(PathSegRef ref) {
return pathseg[ref.offset >> 2];
}
PathFillLine PathSeg_FillLine_read(PathSegRef ref) {
return PathFillLine_read(PathFillLineRef(ref.offset + 4));
}
PathStrokeLine PathSeg_StrokeLine_read(PathSegRef ref) {
return PathStrokeLine_read(PathStrokeLineRef(ref.offset + 4));
}
PathFillCubic PathSeg_FillCubic_read(PathSegRef ref) {
return PathFillCubic_read(PathFillCubicRef(ref.offset + 4));
}
PathStrokeCubic PathSeg_StrokeCubic_read(PathSegRef ref) {
return PathStrokeCubic_read(PathStrokeCubicRef(ref.offset + 4));
}
void PathSeg_Nop_write(PathSegRef ref) {
pathseg[ref.offset >> 2] = PathSeg_Nop;
}
void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) {
pathseg[ref.offset >> 2] = PathSeg_FillLine;
PathFillLine_write(PathFillLineRef(ref.offset + 4), s);
}
void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) {
pathseg[ref.offset >> 2] = PathSeg_StrokeLine;
PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s);
}
void PathSeg_FillCubic_write(PathSegRef ref, PathFillCubic s) {
pathseg[ref.offset >> 2] = PathSeg_FillCubic;
PathFillCubic_write(PathFillCubicRef(ref.offset + 4), s);
}
void PathSeg_StrokeCubic_write(PathSegRef ref, PathStrokeCubic s) {
pathseg[ref.offset >> 2] = PathSeg_StrokeCubic;
PathStrokeCubic_write(PathStrokeCubicRef(ref.offset + 4), s);
}
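The sizes and offsets above follow a simple convention: every field is stored as 32-bit words, a variant body is its word count times four, and the PathSeg union is one tag word plus room for the largest body (PathStrokeCubic). A small Rust sketch of that arithmetic, for orientation only; the real constants are emitted by piet-gpu-derive.

// How the generated *_size constants above are derived (not generated code).
const WORD: usize = 4;

const PATH_FILL_LINE_SIZE: usize = (2 + 2 + 1) * WORD;        // p0, p1, path_ix
const PATH_STROKE_LINE_SIZE: usize = (2 + 2 + 1 + 2) * WORD;  // + stroke halfwidths
const PATH_FILL_CUBIC_SIZE: usize = (4 * 2 + 1) * WORD;       // p0..p3, path_ix
const PATH_STROKE_CUBIC_SIZE: usize = (4 * 2 + 1 + 2) * WORD; // + stroke halfwidths
// One tag word, then enough room for the largest variant; readers skip the
// tag by adding 4 to the ref offset, as in PathSeg_FillLine_read above.
const PATH_SEG_SIZE: usize = WORD + PATH_STROKE_CUBIC_SIZE;

fn main() {
    assert_eq!(PATH_FILL_LINE_SIZE, 20);
    assert_eq!(PATH_STROKE_LINE_SIZE, 28);
    assert_eq!(PATH_FILL_CUBIC_SIZE, 36);
    assert_eq!(PATH_STROKE_CUBIC_SIZE, 44);
    assert_eq!(PATH_SEG_SIZE, 48);
}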

View file

@ -68,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
}
struct CmdStroke {
- SegChunkRef seg_ref;
+ uint tile_ref;
float half_width;
uint rgba_color;
};
@ -80,7 +80,7 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
}
struct CmdFill {
- SegChunkRef seg_ref;
+ uint tile_ref;
int backdrop;
uint rgba_color;
};
@ -220,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdStroke s;
- s.seg_ref = SegChunkRef(raw0);
+ s.tile_ref = raw0;
s.half_width = uintBitsToFloat(raw1);
s.rgba_color = raw2;
return s;
@ -228,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2;
- ptcl[ix + 0] = s.seg_ref.offset;
+ ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = floatBitsToUint(s.half_width);
ptcl[ix + 2] = s.rgba_color;
}
@ -239,7 +239,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdFill s;
- s.seg_ref = SegChunkRef(raw0);
+ s.tile_ref = raw0;
s.backdrop = int(raw1);
s.rgba_color = raw2;
return s;
@ -247,7 +247,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
void CmdFill_write(CmdFillRef ref, CmdFill s) {
uint ix = ref.offset >> 2;
- ptcl[ix + 0] = s.seg_ref.offset;
+ ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = uint(s.backdrop);
ptcl[ix + 2] = s.rgba_color;
}
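With this change, CmdFill and CmdStroke no longer point at a SegChunk inside the ptcl buffer; they carry a raw offset into the new tile buffer, where the per-tile segment list lives. A hedged Rust sketch of decoding the three payload words of a CmdFill, mirroring CmdFill_read above (the surrounding command-list iteration is omitted):

// Mirrors the CmdFill word layout shown above: tile_ref, backdrop, rgba.
struct CmdFill {
    tile_ref: u32,   // byte offset of the tile's segment list in the tile buffer
    backdrop: i32,   // initial winding number for the tile
    rgba_color: u32,
}

fn read_cmd_fill(ptcl: &[u32], payload_offset_bytes: u32) -> CmdFill {
    let ix = (payload_offset_bytes >> 2) as usize;
    CmdFill {
        tile_ref: ptcl[ix],
        backdrop: ptcl[ix + 1] as i32,
        rgba_color: ptcl[ix + 2],
    }
}

fn main() {
    let ptcl = [4096u32, (-1i32) as u32, 0xff00_00ff];
    let cmd = read_cmd_fill(&ptcl, 0);
    assert_eq!((cmd.tile_ref, cmd.backdrop, cmd.rgba_color), (4096, -1, 0xff00_00ff));
}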

View file

@ -240,12 +240,14 @@ TransformRef Transform_index(TransformRef ref, uint index) {
#define Element_Nop 0
#define Element_StrokeLine 1
#define Element_FillLine 2
- #define Element_Quad 3
- #define Element_Cubic 4
- #define Element_Stroke 5
- #define Element_Fill 6
- #define Element_SetLineWidth 7
- #define Element_Transform 8
+ #define Element_StrokeQuad 3
+ #define Element_FillQuad 4
+ #define Element_StrokeCubic 5
+ #define Element_FillCubic 6
+ #define Element_Stroke 7
+ #define Element_Fill 8
+ #define Element_SetLineWidth 9
+ #define Element_Transform 10
#define Element_size 36
ElementRef Element_index(ElementRef ref, uint index) {
@ -455,11 +457,19 @@ LineSeg Element_FillLine_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}
- QuadSeg Element_Quad_read(ElementRef ref) {
+ QuadSeg Element_StrokeQuad_read(ElementRef ref) {
return QuadSeg_read(QuadSegRef(ref.offset + 4));
}
- CubicSeg Element_Cubic_read(ElementRef ref) {
+ QuadSeg Element_FillQuad_read(ElementRef ref) {
+ return QuadSeg_read(QuadSegRef(ref.offset + 4));
+ }
+ CubicSeg Element_StrokeCubic_read(ElementRef ref) {
+ return CubicSeg_read(CubicSegRef(ref.offset + 4));
+ }
+ CubicSeg Element_FillCubic_read(ElementRef ref) {
return CubicSeg_read(CubicSegRef(ref.offset + 4));
}
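The renumbering above falls out of splitting the single Quad and Cubic elements into stroke and fill variants; the tags appear to be assigned in declaration order by piet_gpu!, so every later tag shifts by two. A Rust mirror of the new numbering, purely for reference (hypothetical, not part of the generated code):

// Mirror of the new element tags above (illustrative only).
#[repr(u32)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ElementTag {
    Nop = 0,
    StrokeLine = 1,
    FillLine = 2,
    StrokeQuad = 3,
    FillQuad = 4,
    StrokeCubic = 5,
    FillCubic = 6,
    Stroke = 7,
    Fill = 8,
    SetLineWidth = 9,
    Transform = 10,
}

fn main() {
    assert_eq!(ElementTag::FillCubic as u32, 6);
    assert_eq!(ElementTag::Transform as u32, 10);
}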

View file

@ -31,6 +31,7 @@
// TODO: compute all these
#define WIDTH_IN_TILES 128
#define HEIGHT_IN_TILES 96
#define TILEGROUP_WIDTH_TILES 32
#define TILE_WIDTH_PX 16
#define TILE_HEIGHT_PX 16
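HEIGHT_IN_TILES joins the existing hard-coded tile-grid constants; per the TODO they should eventually be computed from the surface size. A minimal sketch of that computation, assuming the 2048x1536 target these defines imply (16 px tiles, 128x96 grid):

// What "compute all these" would look like, assuming a 2048x1536 surface.
const WIDTH_PX: usize = 2048;
const HEIGHT_PX: usize = 1536;
const TILE_WIDTH_PX: usize = 16;
const TILE_HEIGHT_PX: usize = 16;

// Round up so a partially covered edge tile still gets allocated.
const WIDTH_IN_TILES: usize = (WIDTH_PX + TILE_WIDTH_PX - 1) / TILE_WIDTH_PX;
const HEIGHT_IN_TILES: usize = (HEIGHT_PX + TILE_HEIGHT_PX - 1) / TILE_HEIGHT_PX;

fn main() {
    assert_eq!(WIDTH_IN_TILES, 128);
    assert_eq!(HEIGHT_IN_TILES, 96);
}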

View file

@ -10,9 +10,11 @@ struct State {
vec4 bbox;
float linewidth;
uint flags;
uint path_count;
uint pathseg_count;
};
- #define State_size 48
+ #define State_size 56
StateRef State_index(StateRef ref, uint index) {
return StateRef(ref.offset + index * State_size);
@ -32,12 +34,16 @@ State State_read(StateRef ref) {
uint raw9 = state[ix + 9];
uint raw10 = state[ix + 10];
uint raw11 = state[ix + 11];
uint raw12 = state[ix + 12];
uint raw13 = state[ix + 13];
State s;
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
s.linewidth = uintBitsToFloat(raw10);
s.flags = raw11;
s.path_count = raw12;
s.pathseg_count = raw13;
return s;
}
@ -55,5 +61,7 @@ void State_write(StateRef ref, State s) {
state[ix + 9] = floatBitsToUint(s.bbox.w);
state[ix + 10] = floatBitsToUint(s.linewidth);
state[ix + 11] = s.flags;
state[ix + 12] = s.path_count;
state[ix + 13] = s.pathseg_count;
}
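The two appended counters grow State from 12 to 14 words, which is where the new State_size of 56 comes from. A quick word-count check (illustrative Rust, not generated code):

// Word-by-word layout implied by State_read/State_write above.
const STATE_WORDS: usize = 4  // mat
    + 2                       // translate
    + 4                       // bbox
    + 1                       // linewidth
    + 1                       // flags
    + 1                       // path_count
    + 1;                      // pathseg_count

fn main() {
    assert_eq!(STATE_WORDS * 4, 56); // matches the new State_size
}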

piet-gpu/shader/tile.h (new file, 109 lines)
View file

@ -0,0 +1,109 @@
// Code auto-generated by piet-gpu-derive
struct PathRef {
uint offset;
};
struct TileRef {
uint offset;
};
struct TileSegRef {
uint offset;
};
struct Path {
uvec4 bbox;
TileRef tiles;
};
#define Path_size 12
PathRef Path_index(PathRef ref, uint index) {
return PathRef(ref.offset + index * Path_size);
}
struct Tile {
TileSegRef tile;
int backdrop;
};
#define Tile_size 8
TileRef Tile_index(TileRef ref, uint index) {
return TileRef(ref.offset + index * Tile_size);
}
struct TileSeg {
vec2 start;
vec2 end;
float y_edge;
TileSegRef next;
};
#define TileSeg_size 24
TileSegRef TileSeg_index(TileSegRef ref, uint index) {
return TileSegRef(ref.offset + index * TileSeg_size);
}
Path Path_read(PathRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tile[ix + 0];
uint raw1 = tile[ix + 1];
uint raw2 = tile[ix + 2];
Path s;
s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
s.tiles = TileRef(raw2);
return s;
}
void Path_write(PathRef ref, Path s) {
uint ix = ref.offset >> 2;
tile[ix + 0] = s.bbox.x | (s.bbox.y << 16);
tile[ix + 1] = s.bbox.z | (s.bbox.w << 16);
tile[ix + 2] = s.tiles.offset;
}
Tile Tile_read(TileRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tile[ix + 0];
uint raw1 = tile[ix + 1];
Tile s;
s.tile = TileSegRef(raw0);
s.backdrop = int(raw1);
return s;
}
void Tile_write(TileRef ref, Tile s) {
uint ix = ref.offset >> 2;
tile[ix + 0] = s.tile.offset;
tile[ix + 1] = uint(s.backdrop);
}
TileSeg TileSeg_read(TileSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = tile[ix + 0];
uint raw1 = tile[ix + 1];
uint raw2 = tile[ix + 2];
uint raw3 = tile[ix + 3];
uint raw4 = tile[ix + 4];
uint raw5 = tile[ix + 5];
TileSeg s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.y_edge = uintBitsToFloat(raw4);
s.next = TileSegRef(raw5);
return s;
}
void TileSeg_write(TileSegRef ref, TileSeg s) {
uint ix = ref.offset >> 2;
tile[ix + 0] = floatBitsToUint(s.start.x);
tile[ix + 1] = floatBitsToUint(s.start.y);
tile[ix + 2] = floatBitsToUint(s.end.x);
tile[ix + 3] = floatBitsToUint(s.end.y);
tile[ix + 4] = floatBitsToUint(s.y_edge);
tile[ix + 5] = s.next.offset;
}
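Path packs its tile-space bounding box into two words, 16 bits per coordinate, as seen in Path_read/Path_write above. A Rust sketch of the same packing (helper names are mine, not part of the generated header):

// 16-bit bbox packing used by Path_read/Path_write: (x0, y0) in one word,
// (x1, y1) in the next, all in tile coordinates.
fn pack_bbox(bbox: [u16; 4]) -> [u32; 2] {
    [
        bbox[0] as u32 | ((bbox[1] as u32) << 16),
        bbox[2] as u32 | ((bbox[3] as u32) << 16),
    ]
}

fn unpack_bbox(words: [u32; 2]) -> [u16; 4] {
    [
        (words[0] & 0xffff) as u16,
        (words[0] >> 16) as u16,
        (words[1] & 0xffff) as u16,
        (words[1] >> 16) as u16,
    ]
}

fn main() {
    let bbox = [3u16, 5, 40, 42];
    assert_eq!(unpack_bbox(pack_bbox(bbox)), bbox);
}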

View file

@ -0,0 +1,100 @@
// Allocation and initialization of tiles for paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
#define LG_TILE_ALLOC_WG 8
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer AllocBuf {
uint n_elements;
uint n_pathseg;
uint alloc;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
};
#include "annotated.h"
#include "tile.h"
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
shared uint sh_tile_count[TILE_ALLOC_WG];
shared uint sh_tile_alloc;
void main() {
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
PathRef path_ref = PathRef(element_ix * Path_size);
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
uint tag = Annotated_Nop;
if (element_ix < n_elements) {
tag = Annotated_tag(ref);
}
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
switch (tag) {
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
x0 = int(floor(fill.bbox.x * SX));
y0 = int(floor(fill.bbox.y * SY));
x1 = int(ceil(fill.bbox.z * SX));
y1 = int(ceil(fill.bbox.w * SY));
break;
}
x0 = clamp(x0, 0, WIDTH_IN_TILES);
y0 = clamp(y0, 0, HEIGHT_IN_TILES);
x1 = clamp(x1, 0, WIDTH_IN_TILES);
y1 = clamp(y1, 0, HEIGHT_IN_TILES);
Path path;
path.bbox = uvec4(x0, y0, x1, y1);
uint tile_count = (x1 - x0) * (y1 - y0);
uint n_tiles = tile_count;
sh_tile_count[th_ix] = tile_count;
// Prefix sum of sh_tile_count
for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
barrier();
if (th_ix >= (1 << i)) {
tile_count += sh_tile_count[th_ix - (1 << i)];
}
barrier();
sh_tile_count[th_ix] = tile_count;
}
if (th_ix == TILE_ALLOC_WG - 1) {
sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
}
barrier();
uint alloc_start = sh_tile_alloc;
if (element_ix < n_elements) {
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
Path_write(path_ref, path);
}
// Zero out allocated tiles efficiently
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
uint start_ix = alloc_start >> 2;
for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
// Note: this interleaving is faster than using Tile_write
// by a significant amount.
tile[start_ix + i] = 0;
}
}
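The loop over LG_TILE_ALLOC_WG rounds is a Hillis-Steele inclusive prefix sum over each thread's tile count; thread i's tile slice then starts at the previous thread's inclusive total, and the last entry sizes the single atomicAdd for the whole workgroup. A CPU-side Rust model of that scan and offset derivation (illustrative only; the shader does it in shared memory with barriers):

// CPU model of the workgroup scan in tile_alloc.
fn inclusive_scan(counts: &[u32]) -> Vec<u32> {
    let mut sums = counts.to_vec();
    let mut step = 1;
    while step < sums.len() {
        let prev = sums.clone(); // stands in for the barrier-separated passes
        for i in step..sums.len() {
            sums[i] = prev[i] + prev[i - step];
        }
        step <<= 1;
    }
    sums
}

fn main() {
    const TILE_SIZE: u32 = 8; // Tile_size
    let counts = [4u32, 0, 6, 2]; // tiles touched by each path in the group
    let sums = inclusive_scan(&counts); // [4, 4, 10, 12]
    // One atomicAdd for the whole group, sized by the last inclusive sum.
    let group_alloc = sums[sums.len() - 1] * TILE_SIZE;
    // Each path's tile slice starts at the previous thread's inclusive sum.
    let offsets: Vec<u32> = (0..counts.len())
        .map(|i| (if i == 0 { 0 } else { sums[i - 1] }) * TILE_SIZE)
        .collect();
    assert_eq!(group_alloc, 96);
    assert_eq!(offsets, vec![0, 32, 32, 80]);
}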

Binary file not shown.

View file

@ -121,12 +121,26 @@ pub struct Renderer<D: Device> {
pub state_buf: D::Buffer,
pub anno_buf: D::Buffer,
pub pathseg_buf: D::Buffer,
pub tile_buf: D::Buffer,
pub bin_buf: D::Buffer,
pub ptcl_buf: D::Buffer,
el_pipeline: D::Pipeline,
el_ds: D::DescriptorSet,
tile_pipeline: D::Pipeline,
tile_ds: D::DescriptorSet,
path_pipeline: D::Pipeline,
path_ds: D::DescriptorSet,
backdrop_pipeline: D::Pipeline,
backdrop_ds: D::DescriptorSet,
tile_alloc_buf_host: D::Buffer,
tile_alloc_buf_dev: D::Buffer,
bin_pipeline: D::Pipeline,
bin_ds: D::DescriptorSet,
@ -143,10 +157,12 @@ pub struct Renderer<D: Device> {
k4_ds: D::DescriptorSet,
n_elements: usize,
n_paths: usize,
n_pathseg: usize,
}
impl<D: Device> Renderer<D> {
- pub unsafe fn new(device: &D, scene: &[u8]) -> Result<Self, Error> {
+ pub unsafe fn new(device: &D, scene: &[u8], n_paths: usize, n_pathseg: usize) -> Result<Self, Error> {
let host = MemFlags::host_coherent();
let dev = MemFlags::device_local();
@ -163,15 +179,51 @@ impl<D: Device> Renderer<D> {
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let pathseg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let tile_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
let el_code = include_bytes!("../shader/elements.spv");
- let el_pipeline = device.create_simple_compute_pipeline(el_code, 3, 0)?;
+ let el_pipeline = device.create_simple_compute_pipeline(el_code, 4, 0)?;
let el_ds = device.create_descriptor_set(
&el_pipeline,
- &[&scene_dev, &state_buf, &anno_buf],
+ &[&scene_dev, &state_buf, &anno_buf, &pathseg_buf],
&[],
)?;
let tile_alloc_buf_host = device.create_buffer(12, host)?;
let tile_alloc_buf_dev = device.create_buffer(12, dev)?;
// TODO: constants
const PATH_SIZE: usize = 12;
let tile_alloc_start = ((n_paths + 31) & !31) * PATH_SIZE;
device.write_buffer(
&tile_alloc_buf_host,
&[n_paths as u32, n_pathseg as u32, tile_alloc_start as u32],
)?;
let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv");
let tile_pipeline = device.create_simple_compute_pipeline(tile_alloc_code, 3, 0)?;
let tile_ds = device.create_descriptor_set(
&tile_pipeline,
&[&anno_buf, &tile_alloc_buf_dev, &tile_buf],
&[],
)?;
let path_alloc_code = include_bytes!("../shader/path_coarse.spv");
let path_pipeline = device.create_simple_compute_pipeline(path_alloc_code, 3, 0)?;
let path_ds = device.create_descriptor_set(
&path_pipeline,
&[&pathseg_buf, &tile_alloc_buf_dev, &tile_buf],
&[],
)?;
let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv");
let backdrop_pipeline = device.create_simple_compute_pipeline(backdrop_alloc_code, 3, 0)?;
let backdrop_ds = device.create_descriptor_set(
&backdrop_pipeline,
&[&anno_buf, &tile_alloc_buf_dev, &tile_buf],
&[],
)?;
@ -179,10 +231,10 @@ impl<D: Device> Renderer<D> {
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
// TODO: constants
- let bin_alloc_start = ((n_elements + 255) & !255) * 8;
+ let bin_alloc_start = ((n_paths + 255) & !255) * 8;
device.write_buffer(
&bin_alloc_buf_host,
- &[n_elements as u32, 0, bin_alloc_start as u32],
+ &[n_paths as u32, 0, bin_alloc_start as u32],
)?;
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
@ -198,19 +250,23 @@ impl<D: Device> Renderer<D> {
let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device.write_buffer(
&coarse_alloc_buf_host,
- &[n_elements as u32, coarse_alloc_start as u32],
+ &[n_paths as u32, coarse_alloc_start as u32],
)?;
let coarse_code = include_bytes!("../shader/coarse.spv");
- let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
+ let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 5, 0)?;
let coarse_ds = device.create_descriptor_set(
&coarse_pipeline,
- &[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
+ &[&anno_buf, &bin_buf, &tile_buf, &coarse_alloc_buf_dev, &ptcl_buf],
&[],
)?;
let k4_code = include_bytes!("../shader/kernel4.spv");
- let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
+ let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 2, 1)?;
- let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
+ let k4_ds = device.create_descriptor_set(
+ &k4_pipeline,
+ &[&ptcl_buf, &tile_buf],
+ &[&image_dev]
+ )?;
Ok(Renderer {
scene_buf,
@ -218,6 +274,12 @@ impl<D: Device> Renderer<D> {
image_dev,
el_pipeline,
el_ds,
tile_pipeline,
tile_ds,
path_pipeline,
path_ds,
backdrop_pipeline,
backdrop_ds,
bin_pipeline,
bin_ds,
coarse_pipeline,
@ -226,18 +288,25 @@ impl<D: Device> Renderer<D> {
k4_ds,
state_buf,
anno_buf,
pathseg_buf,
tile_buf,
bin_buf,
ptcl_buf,
tile_alloc_buf_host,
tile_alloc_buf_dev,
bin_alloc_buf_host,
bin_alloc_buf_dev,
coarse_alloc_buf_host,
coarse_alloc_buf_dev,
n_elements,
n_paths,
n_pathseg,
})
}
pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
cmd_buf.copy_buffer(&self.tile_alloc_buf_host, &self.tile_alloc_buf_dev);
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
cmd_buf.clear_buffer(&self.state_buf);
@ -257,25 +326,49 @@ impl<D: Device> Renderer<D> {
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
- &self.bin_pipeline,
- &self.bin_ds,
- (((self.n_elements + 255) / 256) as u32, 1, 1),
+ &self.tile_pipeline,
+ &self.tile_ds,
+ (((self.n_paths + 255) / 256) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.path_pipeline,
&self.path_ds,
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.backdrop_pipeline,
&self.backdrop_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 4);
// Note: this barrier is not needed as an actual dependency between
// pipeline stages, but I am keeping it in so that timer queries are
// easier to interpret.
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.bin_pipeline,
&self.bin_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.coarse_pipeline,
&self.coarse_ds,
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
);
- cmd_buf.write_timestamp(&query_pool, 3);
+ cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k4_pipeline,
&self.k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
- cmd_buf.write_timestamp(&query_pool, 4);
+ cmd_buf.write_timestamp(&query_pool, 7);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
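The dispatch sequence now runs seven stages, with timestamps written after each: elements (1), tile_alloc (2), path_coarse (3), backdrop (4), binning (5), coarse (6) and kernel4 (7). The per-path stages use 256 items per workgroup and path_coarse uses 32, so the group counts are the usual ceil-division. A small sketch of that computation (the helper name is mine):

// Ceil-division used to size the per-path and per-segment dispatches above.
fn dispatch_groups(n_items: usize, items_per_workgroup: usize) -> u32 {
    ((n_items + items_per_workgroup - 1) / items_per_workgroup) as u32
}

fn main() {
    let (n_paths, n_pathseg) = (10_000usize, 45_000usize);
    assert_eq!(dispatch_groups(n_paths, 256), 40);    // tile_alloc, backdrop, binning
    assert_eq!(dispatch_groups(n_pathseg, 32), 1407); // path_coarse
}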

View file

@ -31,6 +31,10 @@ pub struct PietGpuRenderContext {
// Will probably need direct accesss to hal Device to create images etc.
inner_text: PietGpuText,
stroke_width: f32,
// We're tallying these cpu-side for expedience, but will probably
// move this to some kind of readback from element processing.
path_count: usize,
pathseg_count: usize,
}
#[derive(Clone)]
@ -52,6 +56,8 @@ impl PietGpuRenderContext {
elements,
inner_text,
stroke_width,
path_count: 0,
pathseg_count: 0,
}
}
@ -59,6 +65,14 @@ impl PietGpuRenderContext {
self.elements.encode(&mut self.encoder);
self.encoder.buf()
}
pub fn path_count(&self) -> usize {
self.path_count
}
pub fn pathseg_count(&self) -> usize {
self.pathseg_count
}
}
impl RenderContext for PietGpuRenderContext {
@ -95,6 +109,7 @@ impl RenderContext for PietGpuRenderContext {
PietGpuBrush::Solid(rgba_color) => {
let stroke = Stroke { rgba_color };
self.elements.push(Element::Stroke(stroke));
self.path_count += 1;
}
_ => (),
}
@ -117,6 +132,7 @@ impl RenderContext for PietGpuRenderContext {
PietGpuBrush::Solid(rgba_color) => {
let fill = Fill { rgba_color };
self.elements.push(Element::Fill(fill));
self.path_count += 1;
}
_ => (),
}
@ -200,10 +216,29 @@ impl PietGpuRenderContext {
} else {
self.elements.push(Element::StrokeLine(seg));
}
self.pathseg_count += 1;
}
fn encode_quad_seg(&mut self, seg: QuadSeg, is_fill: bool) {
if is_fill {
self.elements.push(Element::FillQuad(seg));
} else {
self.elements.push(Element::StrokeQuad(seg));
}
self.pathseg_count += 1;
}
fn encode_cubic_seg(&mut self, seg: CubicSeg, is_fill: bool) {
if is_fill {
self.elements.push(Element::FillCubic(seg));
} else {
self.elements.push(Element::StrokeCubic(seg));
}
self.pathseg_count += 1;
}
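These CPU-side tallies exist so the host knows the path and path-segment workloads before recording the command buffer; the comment above flags them as a stopgap until the counts come from element processing on the GPU. A hedged sketch of the intended wiring, where render_scene and get_scene_buf are assumed helper names and the surrounding crate items (Device, Error, Renderer) are taken to be in scope:

// Hypothetical host-side wiring of the new counters into Renderer::new;
// only path_count, pathseg_count and the Renderer::new signature come from
// this diff, everything else is assumed.
fn build_renderer<D: Device>(device: &D) -> Result<Renderer<D>, Error> {
    let mut ctx = PietGpuRenderContext::new();
    render_scene(&mut ctx); // assumed scene-building helper
    let n_paths = ctx.path_count();
    let n_pathseg = ctx.pathseg_count();
    let scene = ctx.get_scene_buf(); // assumed accessor for the encoded scene
    unsafe { Renderer::new(device, scene, n_paths, n_pathseg) }
}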
fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
- let flatten = true;
+ let flatten = false;
if flatten {
let mut start_pt = None;
let mut last_pt = None;
@ -265,7 +300,7 @@ impl PietGpuRenderContext {
p1: scene_p1,
p2: scene_p2,
};
- self.elements.push(Element::Quad(seg));
+ self.encode_quad_seg(seg, is_fill);
last_pt = Some(scene_p2);
}
PathEl::CurveTo(p1, p2, p3) => {
@ -278,7 +313,7 @@ impl PietGpuRenderContext {
p2: scene_p2,
p3: scene_p3,
};
- self.elements.push(Element::Cubic(seg));
+ self.encode_cubic_seg(seg, is_fill);
last_pt = Some(scene_p3);
}
PathEl::ClosePath => {