diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 73f33ee..672b42d 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -41,7 +41,7 @@ fn main() -> Result<(), Error> {
 
         let fence = device.create_fence(false)?;
         let mut cmd_buf = device.create_cmd_buf()?;
-        let query_pool = device.create_query_pool(4)?;
+        let query_pool = device.create_query_pool(5)?;
 
         let mut ctx = PietGpuRenderContext::new();
         render_scene(&mut ctx);
@@ -62,10 +62,11 @@ fn main() -> Result<(), Error> {
         println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
         println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
         println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
+        println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
 
         /*
         let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.bin_buf, &mut data).unwrap();
+        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
         piet_gpu::dump_k1_data(&data);
 
         let mut data: Vec<u32> = Default::default();
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 4e4ff19..da25ce4 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -46,6 +46,17 @@ shared uint sh_bitmaps[N_SLICE][N_TILE];
 #define SX (1.0 / float(TILE_WIDTH_PX))
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 
+// Switch cmd_ref to a newly allocated chunk once it passes cmd_limit. Perhaps cmd_limit should be a global? This is a style question.
+void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
+    if (cmd_ref.offset > cmd_limit) { // nothing to do while the write position is within the limit
+        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC); // atomicAdd returns the value before the add, used as the new chunk's start
+        CmdJump jump = CmdJump(new_cmd);
+        Cmd_Jump_write(cmd_ref, jump); // record a jump to new_cmd at the current position
+        cmd_ref = CmdRef(new_cmd); // later writes through cmd_ref start at new_cmd
+        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size; // NOTE(review): the 2 * Cmd_size slack presumably leaves room for a command plus a jump -- confirm
+    }
+}
+
 void main() {
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
@@ -53,6 +64,13 @@ void main() {
     // Top left coordinates of this bin.
     vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
     uint th_ix = gl_LocalInvocationID.x;
+
+    uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
+    uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
+    uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
+    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+
     uint wr_ix = 0;
     uint rd_ix = 0;
     uint first_el;
@@ -172,6 +190,7 @@ void main() {
                 y++;
             }
         }
+        barrier();
 
         // Output elements for this tile, based on bitmaps.
         uint slice_ix = 0;
@@ -193,13 +212,25 @@ void main() {
             // At this point, we read the element again from global memory.
             // If that turns out to be expensive, maybe we can pack it into
             // shared memory (or perhaps just the tag).
-            probe += 1;
+            ref = AnnotatedRef(element_ix * Annotated_size);
+            tag = Annotated_tag(ref);
+
+            switch (tag) {
+            case Annotated_Fill:
+            case Annotated_Stroke:
+                // Note: we take advantage of the fact that fills and strokes
+                // have compatible layout.
+                AnnoFill fill = Annotated_Fill_read(ref);
+                alloc_cmd(cmd_ref, cmd_limit);
+                Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
+                break;
+            }
 
             // clear LSB
             bitmap &= bitmap - 1;
         }
 
         rd_ix += N_TILE;
+        break;
     } while (wr_ix > rd_ix);
-    ptcl[bin_ix * N_TILE + th_ix] = probe;
 }
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index ded68da..ed005fc 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index d6f33b7..2df43ec 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -39,7 +39,8 @@ void main() {
     uvec2 xy_uint = gl_GlobalInvocationID.xy;
     vec2 xy = vec2(xy_uint);
     vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
-    vec3 rgb = uv.xyy;
+    //vec3 rgb = uv.xyy;
+    vec3 rgb = vec3(0.75);
 
     while (true) {
         uint tag = Cmd_tag(cmd_ref);
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index d9dacc0..00e1ac3 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index cca0b0b..2527b50 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -30,7 +30,7 @@ const PTCL_INITIAL_ALLOC: usize = 1024;
 
 const K2_PER_TILE_SIZE: usize = 8;
 
-const N_CIRCLES: usize = 1;
+const N_CIRCLES: usize = 0;
 
 const N_WG: u32 = 16;
 
@@ -47,11 +47,13 @@ pub fn render_scene(rc: &mut impl RenderContext) {
         rc.fill(circle, &color);
     }
     let mut path = BezPath::new();
+    /*
     path.move_to((100.0, 1150.0));
     path.line_to((200.0, 1200.0));
     path.line_to((150.0, 1250.0));
     path.close_path();
     rc.fill(path, &Color::rgb8(128, 0, 128));
+    */
     rc.stroke(
         Line::new((100.0, 100.0), (200.0, 150.0)),
         &Color::WHITE,
@@ -134,29 +136,9 @@ pub struct Renderer<D: Device> {
     coarse_alloc_buf_host: D::Buffer,
     coarse_alloc_buf_dev: D::Buffer,
 
-    /*
-    k1_alloc_buf_host: D::Buffer,
-    k1_alloc_buf_dev: D::Buffer,
-    k2s_alloc_buf_host: D::Buffer,
-    k2s_alloc_buf_dev: D::Buffer,
-    k2f_alloc_buf_host: D::Buffer,
-    k2f_alloc_buf_dev: D::Buffer,
-    k3_alloc_buf_host: D::Buffer,
-    k3_alloc_buf_dev: D::Buffer,
-    tilegroup_buf: D::Buffer,
-    ptcl_buf: D::Buffer,
-
-    k1_pipeline: D::Pipeline,
-    k1_ds: D::DescriptorSet,
-    k2s_pipeline: D::Pipeline,
-    k2s_ds: D::DescriptorSet,
-    k2f_pipeline: D::Pipeline,
-    k2f_ds: D::DescriptorSet,
-    k3_pipeline: D::Pipeline,
-    k3_ds: D::DescriptorSet,
     k4_pipeline: D::Pipeline,
     k4_ds: D::DescriptorSet,
-    */
+ 
     n_elements: usize,
 }
 
@@ -213,10 +195,10 @@ impl<D: Device> Renderer<D> {
         let coarse_alloc_buf_host = device.create_buffer(4, host)?;
         let coarse_alloc_buf_dev = device.create_buffer(4, dev)?;
 
-        let coarse_alloc_start = 256 * 64 * N_WG;
+        let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
         device
             .write_buffer(&coarse_alloc_buf_host, &[
-                coarse_alloc_start,
+                coarse_alloc_start as u32,
             ])
             ?;
         let coarse_code = include_bytes!("../shader/coarse.spv");
@@ -227,72 +209,11 @@ impl<D: Device> Renderer<D> {
             &[],
         )?;
 
-        /*
-        let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
-        let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
+        // These will probably be combined with the ptcl buf, as they're all written by the
+        // same kernel now.
         let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
 
-        let k1_alloc_buf_host = device.create_buffer(4, host)?;
-        let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
-        let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
-        device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
-        let k1_code = include_bytes!("../shader/kernel1.spv");
-        let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?;
-        let k1_ds = device.create_descriptor_set(
-            &k1_pipeline,
-            &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
-            &[],
-        )?;
-
-        let k2s_alloc_buf_host = device.create_buffer(4, host)?;
-        let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
-        let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
-        device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?;
-        let k2s_code = include_bytes!("../shader/kernel2s.spv");
-        let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?;
-        let k2s_ds = device.create_descriptor_set(
-            &k2s_pipeline,
-            &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
-            &[],
-        )?;
-
-        let k2f_alloc_buf_host = device.create_buffer(4, host)?;
-        let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
-        let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
-        device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?;
-        let k2f_code = include_bytes!("../shader/kernel2f.spv");
-        let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
-        let k2f_ds = device.create_descriptor_set(
-            &k2f_pipeline,
-            &[
-                &scene_dev,
-                &tilegroup_buf,
-                &fill_seg_buf,
-                &k2f_alloc_buf_dev,
-            ],
-            &[],
-        )?;
-
-        let k3_alloc_buf_host = device.create_buffer(4, host)?;
-        let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
-        let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
-        device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?;
-        let k3_code = include_bytes!("../shader/kernel3.spv");
-        let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
-        let k3_ds = device.create_descriptor_set(
-            &k3_pipeline,
-            &[
-                &scene_dev,
-                &tilegroup_buf,
-                &segment_buf,
-                &fill_seg_buf,
-                &ptcl_buf,
-                &k3_alloc_buf_dev,
-            ],
-            &[],
-        )?;
-
         let k4_code = include_bytes!("../shader/kernel4.spv");
         let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
         let k4_ds = device.create_descriptor_set(
@@ -300,7 +221,6 @@ impl<D: Device> Renderer<D> {
             &[&ptcl_buf, &segment_buf, &fill_seg_buf],
             &[&image_dev],
         )?;
-        */
 
         Ok(Renderer {
             scene_buf,
@@ -312,6 +232,8 @@ impl<D: Device> Renderer<D> {
             bin_ds,
             coarse_pipeline,
             coarse_ds,
+            k4_pipeline,
+            k4_ds,
             state_buf,
             anno_buf,
             bin_buf,
@@ -339,7 +261,7 @@ impl<D: Device> Renderer<D> {
         cmd_buf.dispatch(
             &self.el_pipeline,
             &self.el_ds,
-            ((self.n_elements / 128) as u32, 1, 1),
+            (((self.n_elements + 127) / 128) as u32, 1, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
@@ -357,6 +279,13 @@ impl<D: Device> Renderer<D> {
         );
         cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
+        cmd_buf.dispatch(
+            &self.k4_pipeline,
+            &self.k4_ds,
+            ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
+        );
+        cmd_buf.write_timestamp(&query_pool, 4);
+        cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
 }