diff --git a/piet-gpu-types/src/annotated.rs b/piet-gpu-types/src/annotated.rs
index f7a6ad6..d53d870 100644
--- a/piet-gpu-types/src/annotated.rs
+++ b/piet-gpu-types/src/annotated.rs
@@ -3,9 +3,11 @@ use piet_gpu_derive::piet_gpu;
 piet_gpu! {
     #[gpu_write]
     mod annotated {
+        // Note: path segments have moved to pathseg, delete these.
         struct AnnoFillLineSeg {
             p0: [f32; 2],
             p1: [f32; 2],
+            path_ix: u32,
             // A note: the layout of this struct is shared with
             // AnnoStrokeLineSeg. In that case, we actually write
             // [0.0, 0.0] as the stroke field, to minimize divergence.
@@ -13,6 +15,7 @@ piet_gpu! {
         struct AnnoStrokeLineSeg {
             p0: [f32; 2],
             p1: [f32; 2],
+            path_ix: u32,
             // halfwidth in both x and y for binning
             stroke: [f32; 2],
         }
diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs
index 75a7731..62450d2 100644
--- a/piet-gpu-types/src/lib.rs
+++ b/piet-gpu-types/src/lib.rs
@@ -3,8 +3,10 @@
 pub mod annotated;
 pub mod bins;
 pub mod encoder;
+pub mod pathseg;
 pub mod ptcl;
 pub mod scene;
 pub mod state;
 pub mod test;
+pub mod tile;
 pub mod tilegroup;
diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs
index 9c40051..7913c5f 100644
--- a/piet-gpu-types/src/main.rs
+++ b/piet-gpu-types/src/main.rs
@@ -7,7 +7,9 @@ fn main() {
         "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
         "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
         "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
+        "pathseg" => print!("{}", piet_gpu_types::pathseg::gen_gpu_pathseg()),
         "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
+        "tile" => print!("{}", piet_gpu_types::tile::gen_gpu_tile()),
         "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
         "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
         "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
diff --git a/piet-gpu-types/src/pathseg.rs b/piet-gpu-types/src/pathseg.rs
new file mode 100644
index 0000000..5ad382b
--- /dev/null
+++ b/piet-gpu-types/src/pathseg.rs
@@ -0,0 +1,46 @@
+use piet_gpu_derive::piet_gpu;
+
+piet_gpu! {
+    #[gpu_write]
+    mod pathseg {
+        struct PathFillLine {
+            p0: [f32; 2],
+            p1: [f32; 2],
+            path_ix: u32,
+            // A note: the layout of this struct is shared with
+            // PathStrokeLine. In that case, we actually write
+            // [0.0, 0.0] as the stroke field, to minimize divergence.
+        }
+        struct PathStrokeLine {
+            p0: [f32; 2],
+            p1: [f32; 2],
+            path_ix: u32,
+            // halfwidth in both x and y for binning
+            stroke: [f32; 2],
+        }
+        /*
+        struct PathQuad {
+            p0: [f32; 2],
+            p1: [f32; 2],
+            p2: [f32; 2],
+            stroke: [f32; 2],
+        }
+        struct PathCubic {
+            p0: [f32; 2],
+            p1: [f32; 2],
+            p2: [f32; 2],
+            p3: [f32; 2],
+            stroke: [f32; 2],
+        }
+        */
+        enum PathSeg {
+            Nop,
+            FillLine(PathFillLine),
+            StrokeLine(PathStrokeLine),
+            /*
+            Quad(PathQuad),
+            Cubic(PathCubic),
+            */
+        }
+    }
+}
diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs
index 35076f0..602fab9 100644
--- a/piet-gpu-types/src/state.rs
+++ b/piet-gpu-types/src/state.rs
@@ -9,6 +9,8 @@ piet_gpu! {
             bbox: [f32; 4],
             linewidth: f32,
             flags: u32,
+            path_count: u32,
+            pathseg_count: u32,
         }
     }
 }
diff --git a/piet-gpu-types/src/tile.rs b/piet-gpu-types/src/tile.rs
new file mode 100644
index 0000000..5a28037
--- /dev/null
+++ b/piet-gpu-types/src/tile.rs
@@ -0,0 +1,21 @@
+use piet_gpu_derive::piet_gpu;
+
+piet_gpu! {
+    #[gpu_write]
+    mod tile {
+        struct Path {
+            bbox: [u16; 4],
+            tiles: Ref<Tile>,
+        }
+        struct Tile {
+            tile: Ref<TileSeg>,
+            backdrop: i32,
+        }
+        // Segments within a tile are represented as a linked list.
+        struct TileSeg {
+            start: [f32; 2],
+            end: [f32; 2],
+            next: Ref<TileSeg>,
+        }
+    }
+}
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 31024aa..04a20ba 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -185,10 +185,12 @@ fn main() -> Result<(), Error> {
         } else {
             render_scene(&mut ctx);
         }
+        let n_paths = ctx.path_count();
+        let n_pathseg = ctx.pathseg_count();
         let scene = ctx.get_scene_buf();
         //dump_scene(&scene);
 
-        let renderer = Renderer::new(&device, scene)?;
+        let renderer = Renderer::new(&device, scene, n_paths, n_pathseg)?;
         let image_buf =
             device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
 
@@ -200,16 +202,16 @@ fn main() -> Result<(), Error> {
         device.wait_and_reset(&[fence])?;
         let ts = device.reap_query_pool(&query_pool).unwrap();
         println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
-        println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
-        println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
-        println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
-
+        println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
+        println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
         /*
+        println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
+        */
+
         let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+        device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
         piet_gpu::dump_k1_data(&data);
         //trace_ptcl(&data);
-        */
 
         let mut img_data: Vec<u8> = Default::default();
         // Note: because png can use a `&[u8]` slice, we could avoid an extra copy
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index fd30fa3..3568732 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -42,9 +42,11 @@ fn main() -> Result<(), Error> {
 
         let mut ctx = PietGpuRenderContext::new();
         render_scene(&mut ctx);
+        let n_paths = ctx.path_count();
+        let n_pathseg = ctx.pathseg_count();
         let scene = ctx.get_scene_buf();
 
-        let renderer = Renderer::new(&device, scene)?;
+        let renderer = Renderer::new(&device, scene, n_paths, n_pathseg)?;
 
         event_loop.run(move |event, _, control_flow| {
             *control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event
diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h
index 9812264..f243fab 100644
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@@ -31,9 +31,10 @@ struct AnnotatedRef {
 struct AnnoFillLineSeg {
     vec2 p0;
     vec2 p1;
+    uint path_ix;
 };
 
-#define AnnoFillLineSeg_size 16
+#define AnnoFillLineSeg_size 20
 
 AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
     return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size);
@@ -42,10 +43,11 @@ AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
 struct AnnoStrokeLineSeg {
     vec2 p0;
     vec2 p1;
+    uint path_ix;
     vec2 stroke;
 };
 
-#define AnnoStrokeLineSeg_size 24
+#define AnnoStrokeLineSeg_size 28
 
 AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) {
     return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size);
@@ -120,9 +122,11 @@ AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) {
     uint raw1 = annotated[ix + 1];
     uint raw2 = annotated[ix + 2];
     uint raw3 = annotated[ix + 3];
+    uint raw4 = annotated[ix + 4];
     AnnoFillLineSeg s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.path_ix = raw4;
     return s;
 }
 
@@ -132,6 +136,7 @@ void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) {
     annotated[ix + 1] = floatBitsToUint(s.p0.y);
     annotated[ix + 2] = floatBitsToUint(s.p1.x);
     annotated[ix + 3] = floatBitsToUint(s.p1.y);
+    annotated[ix + 4] = s.path_ix;
 }
 
 AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
@@ -142,10 +147,12 @@ AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
     uint raw3 = annotated[ix + 3];
     uint raw4 = annotated[ix + 4];
     uint raw5 = annotated[ix + 5];
+    uint raw6 = annotated[ix + 6];
     AnnoStrokeLineSeg s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
+    s.path_ix = raw4;
+    s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
     return s;
 }
 
@@ -155,8 +162,9 @@ void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) {
     annotated[ix + 1] = floatBitsToUint(s.p0.y);
     annotated[ix + 2] = floatBitsToUint(s.p1.x);
     annotated[ix + 3] = floatBitsToUint(s.p1.y);
-    annotated[ix + 4] = floatBitsToUint(s.stroke.x);
-    annotated[ix + 5] = floatBitsToUint(s.stroke.y);
+    annotated[ix + 4] = s.path_ix;
+    annotated[ix + 5] = floatBitsToUint(s.stroke.x);
+    annotated[ix + 6] = floatBitsToUint(s.stroke.y);
 }
 
 AnnoQuadSeg AnnoQuadSeg_read(AnnoQuadSegRef ref) {
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 6ea0877..524f9e4 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 14c72aa..27fcfe2 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -14,6 +14,10 @@ build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 
 build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
 
+build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
+
+build path_coarse.spv: glsl path_coarse.comp | annotated.h tile.h setup.h
+
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index 5a43f4a..4b7e1c4 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 43bb9cc..230b710 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -30,9 +30,15 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
+// Path segments are stored here.
+layout(set = 0, binding = 3) buffer PathSegBuf {
+    uint[] pathseg;
+};
+
 #include "scene.h"
 #include "state.h"
 #include "annotated.h"
+#include "pathseg.h"
 
 #define StateBuf_stride (8 + 2 * State_size)
 
@@ -83,6 +89,8 @@ State combine_state(State a, State b) {
     c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
     c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
     c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
+    c.path_count = a.path_count + b.path_count;
+    c.pathseg_count = a.pathseg_count + b.pathseg_count;
     return c;
 }
 
@@ -96,6 +104,8 @@ State map_element(ElementRef ref, inout bool is_fill) {
     c.translate = vec2(0.0, 0.0);
     c.linewidth = 1.0; // TODO should be 0.0
     c.flags = 0;
+    c.path_count = 0;
+    c.pathseg_count = 0;
     is_fill = false;
     switch (tag) {
     case Element_FillLine:
@@ -103,22 +113,26 @@ State map_element(ElementRef ref, inout bool is_fill) {
         LineSeg line = Element_FillLine_read(ref);
         c.bbox.xy = min(line.p0, line.p1);
         c.bbox.zw = max(line.p0, line.p1);
+        c.pathseg_count = 1;
         break;
     case Element_Quad:
         QuadSeg quad = Element_Quad_read(ref);
         c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
         c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
+        c.pathseg_count = 1;
         break;
     case Element_Cubic:
         CubicSeg cubic = Element_Cubic_read(ref);
         c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
         c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
+        c.pathseg_count = 1;
         break;
     case Element_Fill:
         is_fill = true;
         // fall-through
     case Element_Stroke:
         c.flags = FLAG_RESET_BBOX;
+        c.path_count = 1;
         break;
     case Element_SetLineWidth:
         SetLineWidth lw = Element_SetLineWidth_read(ref);
@@ -148,6 +162,8 @@ shared vec2 sh_translate[WG_SIZE];
 shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
+shared uint sh_path_count[WG_SIZE];
+shared uint sh_pathseg_count[WG_SIZE];
 
 shared uint sh_min_fill;
 
@@ -187,6 +203,8 @@ void main() {
     sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
     sh_width[gl_LocalInvocationID.x] = agg.linewidth;
     sh_flags[gl_LocalInvocationID.x] = agg.flags;
+    sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
+    sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
         if (gl_LocalInvocationID.x >= (1 << i)) {
@@ -197,6 +215,8 @@ void main() {
             other.bbox = sh_bbox[ix];
             other.linewidth = sh_width[ix];
             other.flags = sh_flags[ix];
+            other.path_count = sh_path_count[ix];
+            other.pathseg_count = sh_pathseg_count[ix];
             agg = combine_state(other, agg);
         }
         barrier();
@@ -205,6 +225,8 @@ void main() {
         sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
         sh_width[gl_LocalInvocationID.x] = agg.linewidth;
         sh_flags[gl_LocalInvocationID.x] = agg.flags;
+        sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
+        sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
     }
 
     State exclusive;
@@ -213,6 +235,8 @@ void main() {
     exclusive.translate = vec2(0.0, 0.0);
     exclusive.linewidth = 1.0; //TODO should be 0.0
     exclusive.flags = 0;
+    exclusive.path_count = 0;
+    exclusive.pathseg_count = 0;
 
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
@@ -266,6 +290,8 @@ void main() {
         other.bbox = sh_bbox[ix];
         other.linewidth = sh_width[ix];
         other.flags = sh_flags[ix];
+        other.path_count = sh_path_count[ix];
+        other.pathseg_count = sh_pathseg_count[ix];
         row = combine_state(row, other);
     }
     if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
@@ -284,25 +310,26 @@ void main() {
         // gains to be had from stashing in shared memory or possibly
         // registers (though register pressure is an issue).
         ElementRef this_ref = Element_index(ref, i);
-        AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size);
         uint tag = Element_tag(this_ref);
         switch (tag) {
         case Element_FillLine:
         case Element_StrokeLine:
             LineSeg line = Element_StrokeLine_read(this_ref);
-            AnnoStrokeLineSeg anno_line;
-            anno_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
-            anno_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
+            PathStrokeLine path_line;
+            path_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
+            path_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
+            path_line.path_ix = st.path_count;
             if (tag == Element_StrokeLine) {
-                anno_line.stroke = get_linewidth(st);
+                path_line.stroke = get_linewidth(st);
             } else {
-                anno_line.stroke = vec2(0.0);
+                path_line.stroke = vec2(0.0);
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
-            uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine;
-            annotated[out_ref.offset >> 2] = out_tag;
-            AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line);
+            PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+            uint out_tag = tag == Element_FillLine ? PathSeg_FillLine : PathSeg_StrokeLine;
+            pathseg[path_out_ref.offset >> 2] = out_tag;
+            PathStrokeLine_write(PathStrokeLineRef(path_out_ref.offset + 4), path_line);
             break;
         case Element_Stroke:
             Stroke stroke = Element_Stroke_read(this_ref);
@@ -311,6 +338,7 @@ void main() {
             vec2 lw = get_linewidth(st);
             anno_stroke.bbox = st.bbox + vec4(-lw, lw);
             anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z);
+            AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
             Annotated_Stroke_write(out_ref, anno_stroke);
             break;
         case Element_Fill:
@@ -318,11 +346,9 @@ void main() {
             AnnoFill anno_fill;
             anno_fill.rgba_color = fill.rgba_color;
             anno_fill.bbox = st.bbox;
+            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
             Annotated_Fill_write(out_ref, anno_fill);
             break;
-        default:
-            Annotated_Nop_write(out_ref);
-            break;
         }
     }
 }
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index a2d439c..18f4dc5 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
new file mode 100644
index 0000000..ff79925
--- /dev/null
+++ b/piet-gpu/shader/path_coarse.comp
@@ -0,0 +1,107 @@
+// Coarse rasterization of path segments.
+
+// Each invocation bins one path segment into the tiles of its path.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "setup.h"
+
+#define TILE_ALLOC_WG 32
+
+layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
+
+layout(set = 0, binding = 0) buffer PathSegBuf {
+    uint[] pathseg;
+};
+
+layout(set = 0, binding = 1) buffer AllocBuf {
+    uint n_paths;
+    uint n_pathseg;
+    uint alloc;
+};
+
+layout(set = 0, binding = 2) buffer TileBuf {
+    uint[] tile;
+};
+
+#include "pathseg.h"
+#include "tile.h"
+
+// scale factors useful for converting coordinates to tiles
+#define SX (1.0 / float(TILE_WIDTH_PX))
+#define SY (1.0 / float(TILE_HEIGHT_PX))
+
+// Bins one path segment per invocation: computes the tile range the segment
+// covers within its path's bounding box, then pushes a TileSeg onto the
+// linked list of every tile in that range.
+void main() {
+    uint element_ix = gl_GlobalInvocationID.x;
+    // Invocations past the end of the segment list have nothing to bin; bail
+    // out before touching the allocator or reading an undefined path index.
+    if (element_ix >= n_pathseg) {
+        return;
+    }
+    PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
+    uint tag = PathSeg_tag(ref);
+    if (tag != PathSeg_FillLine && tag != PathSeg_StrokeLine) {
+        return;
+    }
+    // Both line tags are read through the stroke layout.
+    PathStrokeLine line = PathSeg_StrokeLine_read(ref);
+    // Bounding box of element in pixel coordinates.
+    float xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
+    float xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
+    float ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
+    float ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
+    float dx = line.p1.x - line.p0.x;
+    float dy = line.p1.y - line.p0.y;
+    // Set up for per-scanline coverage formula, below.
+    float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
+    float c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
+    float b = invslope; // Note: assumes square tiles, otherwise scale.
+    float a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
+    int x0 = int(floor((xmin) * SX));
+    int x1 = int(ceil((xmax) * SX));
+    int y0 = int(floor((ymin) * SY));
+    int y1 = int(ceil((ymax) * SY));
+
+    uint path_ix = line.path_ix;
+    Path path = Path_read(PathRef(path_ix * Path_size));
+    ivec4 bbox = ivec4(path.bbox);
+    // Restrict the tile range to the owning path's bounding box (in tiles).
+    x0 = clamp(x0, bbox.x, bbox.z);
+    y0 = clamp(y0, bbox.y, bbox.w);
+    x1 = clamp(x1, bbox.x, bbox.z);
+    y1 = clamp(y1, bbox.y, bbox.w);
+    // x coordinate (in tile units) where the line crosses the middle of row y0.
+    float t = a + b * float(y0);
+    int stride = bbox.z - bbox.x;
+    // Tiles are addressed row-major over the path bbox; base + x is the index.
+    int base = (y0 - bbox.y) * stride - bbox.x;
+    // TODO: can be tighter, use c to bound width
+    uint n_tile_alloc = uint(stride * (bbox.w - bbox.y));
+    // Consider using subgroups to aggregate atomic add.
+    uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
+    TileSeg tile_seg;
+    tile_seg.start = line.p0;
+    tile_seg.end = line.p1;
+    for (int y = y0; y < y1; y++) {
+        // Columns the (stroke-widened) line can touch in this row.
+        int xx0 = clamp(int(floor(t - c)), x0, x1);
+        int xx1 = clamp(int(ceil(t + c)), x0, x1);
+        for (int x = xx0; x < xx1; x++) {
+            TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
+            uint tile_el = tile_ref.offset >> 2;
+            // Push onto the tile's list: swap ourselves in as the head and
+            // link to whatever was there before.
+            uint old = atomicExchange(tile[tile_el], tile_offset);
+            tile_seg.next.offset = old;
+            TileSeg_write(TileSegRef(tile_offset), tile_seg);
+            tile_offset += TileSeg_size;
+        }
+        // TODO for fills: backdrop
+        t += b;
+        base += stride;
+    }
+}
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
new file mode 100644
index 0000000..ed212d7
Binary files /dev/null and b/piet-gpu/shader/path_coarse.spv differ
diff --git a/piet-gpu/shader/pathseg.h b/piet-gpu/shader/pathseg.h
new file mode 100644
index 0000000..dc36d7e
--- /dev/null
+++ b/piet-gpu/shader/pathseg.h
@@ -0,0 +1,125 @@
+// Code auto-generated by piet-gpu-derive
+
+struct PathFillLineRef {
+    uint offset;
+};
+
+struct PathStrokeLineRef {
+    uint offset;
+};
+
+struct PathSegRef {
+    uint offset;
+};
+
+struct PathFillLine {
+    vec2 p0;
+    vec2 p1;
+    uint path_ix;
+};
+
+#define PathFillLine_size 20
+
+PathFillLineRef PathFillLine_index(PathFillLineRef ref, uint index) {
+    return PathFillLineRef(ref.offset + index * PathFillLine_size);
+}
+
+struct PathStrokeLine {
+    vec2 p0;
+    vec2 p1;
+    uint path_ix;
+    vec2 stroke;
+};
+
+#define PathStrokeLine_size 28
+
+PathStrokeLineRef PathStrokeLine_index(PathStrokeLineRef ref, uint index) {
+    return PathStrokeLineRef(ref.offset + index * PathStrokeLine_size);
+}
+
+#define PathSeg_Nop 0
+#define PathSeg_FillLine 1
+#define PathSeg_StrokeLine 2
+#define PathSeg_size 32
+
+PathSegRef PathSeg_index(PathSegRef ref, uint index) {
+    return PathSegRef(ref.offset + index * PathSeg_size);
+}
+
+PathFillLine PathFillLine_read(PathFillLineRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = pathseg[ix + 0];
+    uint raw1 = pathseg[ix + 1];
+    uint raw2 = pathseg[ix + 2];
+    uint raw3 = pathseg[ix + 3];
+    uint raw4 = pathseg[ix + 4];
+    PathFillLine s;
+    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.path_ix = raw4;
+    return s;
+}
+
+void PathFillLine_write(PathFillLineRef ref, PathFillLine s) {
+    uint ix = ref.offset >> 2;
+    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
+    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
+    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
+    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
+    pathseg[ix + 4] = s.path_ix;
+}
+
+PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = pathseg[ix + 0];
+    uint raw1 = pathseg[ix + 1];
+    uint raw2 = pathseg[ix + 2];
+    uint raw3 = pathseg[ix + 3];
+    uint raw4 = pathseg[ix + 4];
+    uint raw5 = pathseg[ix + 5];
+    uint raw6 = pathseg[ix + 6];
+    PathStrokeLine s;
+    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.path_ix = raw4;
+    s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
+    return s;
+}
+
+void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) {
+    uint ix = ref.offset >> 2;
+    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
+    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
+    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
+    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
+    pathseg[ix + 4] = s.path_ix;
+    pathseg[ix + 5] = floatBitsToUint(s.stroke.x);
+    pathseg[ix + 6] = floatBitsToUint(s.stroke.y);
+}
+
+uint PathSeg_tag(PathSegRef ref) {
+    return pathseg[ref.offset >> 2];
+}
+
+PathFillLine PathSeg_FillLine_read(PathSegRef ref) {
+    return PathFillLine_read(PathFillLineRef(ref.offset + 4));
+}
+
+PathStrokeLine PathSeg_StrokeLine_read(PathSegRef ref) {
+    return PathStrokeLine_read(PathStrokeLineRef(ref.offset + 4));
+}
+
+void PathSeg_Nop_write(PathSegRef ref) {
+    pathseg[ref.offset >> 2] = PathSeg_Nop;
+}
+
+void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) {
+    pathseg[ref.offset >> 2] = PathSeg_FillLine;
+    PathFillLine_write(PathFillLineRef(ref.offset + 4), s);
+}
+
+void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) {
+    pathseg[ref.offset >> 2] = PathSeg_StrokeLine;
+    PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s);
+}
+
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index b913086..03b3353 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -31,6 +31,7 @@
 // TODO: compute all these
 
 #define WIDTH_IN_TILES 128
+#define HEIGHT_IN_TILES 96
 #define TILEGROUP_WIDTH_TILES 32
 #define TILE_WIDTH_PX 16
 #define TILE_HEIGHT_PX 16
diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h
index 2547b93..eacab52 100644
--- a/piet-gpu/shader/state.h
+++ b/piet-gpu/shader/state.h
@@ -10,9 +10,11 @@ struct State {
     vec4 bbox;
     float linewidth;
     uint flags;
+    uint path_count;
+    uint pathseg_count;
 };
 
-#define State_size 48
+#define State_size 56
 
 StateRef State_index(StateRef ref, uint index) {
     return StateRef(ref.offset + index * State_size);
@@ -32,12 +34,16 @@ State State_read(StateRef ref) {
     uint raw9 = state[ix + 9];
     uint raw10 = state[ix + 10];
     uint raw11 = state[ix + 11];
+    uint raw12 = state[ix + 12];
+    uint raw13 = state[ix + 13];
     State s;
     s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
     s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
     s.linewidth = uintBitsToFloat(raw10);
     s.flags = raw11;
+    s.path_count = raw12;
+    s.pathseg_count = raw13;
     return s;
 }
 
@@ -55,5 +61,7 @@ void State_write(StateRef ref, State s) {
     state[ix + 9] = floatBitsToUint(s.bbox.w);
     state[ix + 10] = floatBitsToUint(s.linewidth);
     state[ix + 11] = s.flags;
+    state[ix + 12] = s.path_count;
+    state[ix + 13] = s.pathseg_count;
 }
 
diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h
new file mode 100644
index 0000000..b4a8c9b
--- /dev/null
+++ b/piet-gpu/shader/tile.h
@@ -0,0 +1,105 @@
+// Code auto-generated by piet-gpu-derive
+
+struct PathRef {
+    uint offset;
+};
+
+struct TileRef {
+    uint offset;
+};
+
+struct TileSegRef {
+    uint offset;
+};
+
+struct Path {
+    uvec4 bbox;
+    TileRef tiles;
+};
+
+#define Path_size 12
+
+PathRef Path_index(PathRef ref, uint index) {
+    return PathRef(ref.offset + index * Path_size);
+}
+
+struct Tile {
+    TileSegRef tile;
+    int backdrop;
+};
+
+#define Tile_size 8
+
+TileRef Tile_index(TileRef ref, uint index) {
+    return TileRef(ref.offset + index * Tile_size);
+}
+
+struct TileSeg {
+    vec2 start;
+    vec2 end;
+    TileSegRef next;
+};
+
+#define TileSeg_size 20
+
+TileSegRef TileSeg_index(TileSegRef ref, uint index) {
+    return TileSegRef(ref.offset + index * TileSeg_size);
+}
+
+Path Path_read(PathRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = tile[ix + 0];
+    uint raw1 = tile[ix + 1];
+    uint raw2 = tile[ix + 2];
+    Path s;
+    s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
+    s.tiles = TileRef(raw2);
+    return s;
+}
+
+void Path_write(PathRef ref, Path s) {
+    uint ix = ref.offset >> 2;
+    tile[ix + 0] = s.bbox.x | (s.bbox.y << 16);
+    tile[ix + 1] = s.bbox.z | (s.bbox.w << 16);
+    tile[ix + 2] = s.tiles.offset;
+}
+
+Tile Tile_read(TileRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = tile[ix + 0];
+    uint raw1 = tile[ix + 1];
+    Tile s;
+    s.tile = TileSegRef(raw0);
+    s.backdrop = int(raw1);
+    return s;
+}
+
+void Tile_write(TileRef ref, Tile s) {
+    uint ix = ref.offset >> 2;
+    tile[ix + 0] = s.tile.offset;
+    tile[ix + 1] = uint(s.backdrop);
+}
+
+TileSeg TileSeg_read(TileSegRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = tile[ix + 0];
+    uint raw1 = tile[ix + 1];
+    uint raw2 = tile[ix + 2];
+    uint raw3 = tile[ix + 3];
+    uint raw4 = tile[ix + 4];
+    TileSeg s;
+    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.next = TileSegRef(raw4);
+    return s;
+}
+
+void TileSeg_write(TileSegRef ref, TileSeg s) {
+    uint ix = ref.offset >> 2;
+    tile[ix + 0] = floatBitsToUint(s.start.x);
+    tile[ix + 1] = floatBitsToUint(s.start.y);
+    tile[ix + 2] = floatBitsToUint(s.end.x);
+    tile[ix + 3] = floatBitsToUint(s.end.y);
+    tile[ix + 4] = s.next.offset;
+}
+
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
new file mode 100644
index 0000000..d8b1eb9
--- /dev/null
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -0,0 +1,73 @@
+// Allocation and initialization of tiles for paths.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "setup.h"
+
+#define TILE_ALLOC_WG 32
+
+layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
+
+layout(set = 0, binding = 0) buffer AnnotatedBuf {
+    uint[] annotated;
+};
+
+layout(set = 0, binding = 1) buffer AllocBuf {
+    uint n_elements;
+    uint n_pathseg;
+    uint alloc;
+};
+
+layout(set = 0, binding = 2) buffer TileBuf {
+    uint[] tile;
+};
+
+#include "annotated.h"
+#include "tile.h"
+
+// scale factors useful for converting coordinates to tiles
+#define SX (1.0 / float(TILE_WIDTH_PX))
+#define SY (1.0 / float(TILE_HEIGHT_PX))
+
+void main() {
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathRef path_ref = PathRef(element_ix * Path_size);
+    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
+
+    uint tag = Annotated_Nop;
+    if (element_ix < n_elements) {
+        tag = Annotated_tag(ref);
+    }
+    // Anything that is not a fill or stroke keeps an empty bounding box.
+    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
+    if (tag == Annotated_Fill || tag == Annotated_Stroke) {
+        // Note: we take advantage of the fact that fills and strokes
+        // have compatible layout.
+        AnnoFill fill = Annotated_Fill_read(ref);
+        x0 = int(floor(fill.bbox.x * SX));
+        y0 = int(floor(fill.bbox.y * SY));
+        x1 = int(ceil(fill.bbox.z * SX));
+        y1 = int(ceil(fill.bbox.w * SY));
+    }
+    x0 = clamp(x0, 0, WIDTH_IN_TILES);
+    y0 = clamp(y0, 0, HEIGHT_IN_TILES);
+    // An inverted box would make the tile count below wrap to a huge uint,
+    // so never let the max corner fall short of the min corner.
+    x1 = max(clamp(x1, 0, WIDTH_IN_TILES), x0);
+    y1 = max(clamp(y1, 0, HEIGHT_IN_TILES), y0);
+
+    Path path;
+    path.bbox = uvec4(x0, y0, x1, y1);
+    uint n_tiles = uint((x1 - x0) * (y1 - y0));
+    path.tiles = TileRef(0);
+    if (n_tiles > 0) {
+        path.tiles.offset = atomicAdd(alloc, n_tiles * Tile_size);
+        Tile init_tile = Tile(TileSegRef(0), 0);
+        // TODO: improve load balancing
+        for (uint i = 0; i < n_tiles; i++) {
+            Tile_write(Tile_index(path.tiles, i), init_tile);
+        }
+    }
+    Path_write(path_ref, path);
+}
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv
new file mode 100644
index 0000000..0835903
Binary files /dev/null and b/piet-gpu/shader/tile_alloc.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 3ec7e1d..19e9b43 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -129,12 +129,23 @@ pub struct Renderer<D: Device> {
 
     pub state_buf: D::Buffer,
     pub anno_buf: D::Buffer,
+    pub pathseg_buf: D::Buffer,
+    pub tile_buf: D::Buffer,
     pub bin_buf: D::Buffer,
     pub ptcl_buf: D::Buffer,
 
     el_pipeline: D::Pipeline,
     el_ds: D::DescriptorSet,
 
+    tile_pipeline: D::Pipeline,
+    tile_ds: D::DescriptorSet,
+
+    path_pipeline: D::Pipeline,
+    path_ds: D::DescriptorSet,
+
+    tile_alloc_buf_host: D::Buffer,
+    tile_alloc_buf_dev: D::Buffer,
+
     bin_pipeline: D::Pipeline,
     bin_ds: D::DescriptorSet,
 
@@ -151,10 +162,12 @@ pub struct Renderer<D: Device> {
     k4_ds: D::DescriptorSet,
 
     n_elements: usize,
+    n_paths: usize,
+    n_pathseg: usize,
 }
 
 impl<D: Device> Renderer<D> {
-    pub unsafe fn new(device: &D, scene: &[u8]) -> Result<Self, Error> {
+    pub unsafe fn new(device: &D, scene: &[u8], n_paths: usize, n_pathseg: usize) -> Result<Self, Error> {
         let host = MemFlags::host_coherent();
         let dev = MemFlags::device_local();
 
@@ -170,16 +183,44 @@ impl<D: Device> Renderer<D> {
         device.write_buffer(&scene_buf, &scene)?;
 
         let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
-        let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
+        let anno_buf = device.create_buffer(64 * 1024 * 1024, host)?;
+        let pathseg_buf = device.create_buffer(64 * 1024 * 1024, host)?;
+        let tile_buf = device.create_buffer(64 * 1024 * 1024, host)?;
         let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
         let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
 
         let el_code = include_bytes!("../shader/elements.spv");
-        let el_pipeline = device.create_simple_compute_pipeline(el_code, 3, 0)?;
+        let el_pipeline = device.create_simple_compute_pipeline(el_code, 4, 0)?;
         let el_ds = device.create_descriptor_set(
             &el_pipeline,
-            &[&scene_dev, &state_buf, &anno_buf],
+            &[&scene_dev, &state_buf, &anno_buf, &pathseg_buf],
+            &[],
+        )?;
+
+        let tile_alloc_buf_host = device.create_buffer(12, host)?;
+        let tile_alloc_buf_dev = device.create_buffer(12, dev)?;
+
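+        // TODO: replace these hard-coded values with shared constants; PATH_SIZE presumably must match the shader's Path_size.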
+        const PATH_SIZE: usize = 12;
+        let tile_alloc_start = ((n_paths + 31) & !31) * PATH_SIZE;
+        device.write_buffer(
+            &tile_alloc_buf_host,
+            &[n_paths as u32, n_pathseg as u32, tile_alloc_start as u32],
+        )?;
+        let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv");
+        let tile_pipeline = device.create_simple_compute_pipeline(tile_alloc_code, 3, 0)?;
+        let tile_ds = device.create_descriptor_set(
+            &tile_pipeline,
+            &[&anno_buf, &tile_alloc_buf_dev, &tile_buf],
+            &[],
+        )?;
+
+        let path_alloc_code = include_bytes!("../shader/path_coarse.spv");
+        let path_pipeline = device.create_simple_compute_pipeline(path_alloc_code, 3, 0)?;
+        let path_ds = device.create_descriptor_set(
+            &path_pipeline,
+            &[&pathseg_buf, &tile_alloc_buf_dev, &tile_buf],
             &[],
         )?;
 
@@ -226,6 +267,10 @@ impl<D: Device> Renderer<D> {
             image_dev,
             el_pipeline,
             el_ds,
+            tile_pipeline,
+            tile_ds,
+            path_pipeline,
+            path_ds,
             bin_pipeline,
             bin_ds,
             coarse_pipeline,
@@ -234,18 +279,25 @@ impl<D: Device> Renderer<D> {
             k4_ds,
             state_buf,
             anno_buf,
+            pathseg_buf,
+            tile_buf,
             bin_buf,
             ptcl_buf,
+            tile_alloc_buf_host,
+            tile_alloc_buf_dev,
             bin_alloc_buf_host,
             bin_alloc_buf_dev,
             coarse_alloc_buf_host,
             coarse_alloc_buf_dev,
             n_elements,
+            n_paths,
+            n_pathseg,
         })
     }
 
     pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
         cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
+        cmd_buf.copy_buffer(&self.tile_alloc_buf_host, &self.tile_alloc_buf_dev);
         cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
         cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
         cmd_buf.clear_buffer(&self.state_buf);
@@ -264,26 +316,44 @@ impl<D: Device> Renderer<D> {
         );
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
+        cmd_buf.dispatch(
+            &self.tile_pipeline,
+            &self.tile_ds,
+            (((self.n_paths + 31) / 32) as u32, 1, 1),
+        );
+        cmd_buf.write_timestamp(&query_pool, 2);
+        cmd_buf.memory_barrier();
+        cmd_buf.dispatch(
+            &self.path_pipeline,
+            &self.path_ds,
+            (((self.n_pathseg + 31) / 32) as u32, 1, 1),
+        );
+        /*
         cmd_buf.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_elements + 255) / 256) as u32, 1, 1),
         );
-        cmd_buf.write_timestamp(&query_pool, 2);
+        */
+        cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
+        /*
         cmd_buf.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds,
             (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
         );
-        cmd_buf.write_timestamp(&query_pool, 3);
+        */
+        cmd_buf.write_timestamp(&query_pool, 4);
         cmd_buf.memory_barrier();
+        /*
         cmd_buf.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
             ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
         );
-        cmd_buf.write_timestamp(&query_pool, 4);
+        cmd_buf.write_timestamp(&query_pool, 5);
+        */
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index da234de..7908ff2 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -35,6 +35,10 @@ pub struct PietGpuRenderContext {
     // Will probably need direct accesss to hal Device to create images etc.
     inner_text: PietGpuText,
     stroke_width: f32,
+    // We're tallying these cpu-side for expedience, but will probably
+    // move this to some kind of readback from element processing.
+    path_count: usize,
+    pathseg_count: usize,
 }
 
 #[derive(Clone)]
@@ -56,6 +60,8 @@ impl PietGpuRenderContext {
             elements,
             inner_text,
             stroke_width,
+            path_count: 0,
+            pathseg_count: 0,
         }
     }
 
@@ -63,6 +69,14 @@ impl PietGpuRenderContext {
         self.elements.encode(&mut self.encoder);
         self.encoder.buf()
     }
+    /// Returns the CPU-side tally of paths, bumped once per solid-brush fill or stroke element pushed.
+    pub fn path_count(&self) -> usize {
+        self.path_count
+    }
+    /// Returns the CPU-side tally of path segments, bumped once per line-segment element pushed.
+    pub fn pathseg_count(&self) -> usize {
+        self.pathseg_count
+    }
 }
 
 impl RenderContext for PietGpuRenderContext {
@@ -99,6 +113,7 @@ impl RenderContext for PietGpuRenderContext {
             PietGpuBrush::Solid(rgba_color) => {
                 let stroke = Stroke { rgba_color };
                 self.elements.push(Element::Stroke(stroke));
+                self.path_count += 1;
             }
             _ => (),
         }
@@ -121,6 +136,7 @@ impl RenderContext for PietGpuRenderContext {
             PietGpuBrush::Solid(rgba_color) => {
                 let fill = Fill { rgba_color };
                 self.elements.push(Element::Fill(fill));
+                self.path_count += 1;
             }
             _ => (),
         }
@@ -204,6 +220,7 @@ impl PietGpuRenderContext {
         } else {
             self.elements.push(Element::StrokeLine(seg));
         }
+        self.pathseg_count += 1;
     }
 
     fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {