diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index bdf342b..98c4d44 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -13,8 +13,9 @@ piet_gpu! {
             end: [f32; 2],
         }
         struct CmdStroke {
-            // Consider a specialization to one segment.
-            seg_ref: Ref<SegChunk>,
+            // This is really a Ref<TileSeg> (the head of the tile's segment
+            // list), but we don't have cross-module references.
+            tile_ref: u32,
             half_width: f32,
             rgba_color: u32,
         }
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 04a20ba..0714f00 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -171,7 +171,7 @@ fn main() -> Result<(), Error> {
 
         let fence = device.create_fence(false)?;
         let mut cmd_buf = device.create_cmd_buf()?;
-        let query_pool = device.create_query_pool(5)?;
+        let query_pool = device.create_query_pool(7)?;
 
         let mut ctx = PietGpuRenderContext::new();
         if let Some(input) = matches.value_of("INPUT") {
@@ -204,14 +204,16 @@ fn main() -> Result<(), Error> {
         println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
         println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
         println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
-        /*
-        println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
-        */
+        println!("Binning kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
+        println!("Coarse raster kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
+        println!("Render kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
 
+        /*
         let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
+        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
         piet_gpu::dump_k1_data(&data);
         //trace_ptcl(&data);
+        */
 
         let mut img_data: Vec<u8> = Default::default();
         // Note: because png can use a `&[u8]` slice, we could avoid an extra copy
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 3656f77..28efd16 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -15,17 +15,22 @@ layout(set = 0, binding = 1) buffer BinsBuf {
     uint[] bins;
 };
 
-layout(set = 0, binding = 2) buffer AllocBuf {
+layout(set = 0, binding = 2) buffer TileBuf {
+    uint[] tile;
+};
+
+layout(set = 0, binding = 3) buffer AllocBuf {
     uint n_elements;
     uint alloc;
 };
 
-layout(set = 0, binding = 3) buffer PtclBuf {
+layout(set = 0, binding = 4) buffer PtclBuf {
     uint[] ptcl;
 };
 
 #include "annotated.h"
 #include "bins.h"
+#include "tile.h"
 #include "ptcl.h"
 
 #define LG_N_PART_READ 8
@@ -197,37 +202,11 @@ void main() {
             tag = Annotated_tag(ref);
         }
 
-        // Setup for coverage algorithm.
-        float a, b, c;
         // Bounding box of element in pixel coordinates.
         float xmin, xmax, ymin, ymax;
         uint my_slice = th_ix / 32;
         uint my_mask = 1 << (th_ix & 31);
         switch (tag) {
-        case Annotated_FillLine:
-        case Annotated_StrokeLine:
-            AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
-            xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
-            xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
-            ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
-            ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
-            float dx = line.p1.x - line.p0.x;
-            float dy = line.p1.y - line.p0.y;
-            if (tag == Annotated_FillLine) {
-                // Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
-                if (dy < 0) {
-                    atomicOr(sh_bd_sign[my_slice], my_mask);
-                } else {
-                    atomicAnd(sh_bd_sign[my_slice], ~my_mask);
-                }
-            }
-            atomicOr(sh_is_segment[my_slice], my_mask);
-            // Set up for per-scanline coverage formula, below.
-            float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
-            c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
-            b = invslope; // Note: assumes square tiles, otherwise scale.
-            a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
-            break;
         case Annotated_Fill:
         case Annotated_Stroke:
             // Note: we take advantage of the fact that fills and strokes
@@ -237,10 +216,6 @@ void main() {
             xmax = fill.bbox.z;
             ymin = fill.bbox.y;
             ymax = fill.bbox.w;
-            // Just let the clamping to xmin and xmax determine the bounds.
-            a = 0.0;
-            b = 0.0;
-            c = 1e9;
             break;
         default:
             ymin = 0;
@@ -254,37 +229,23 @@ void main() {
         // Compute bounding box in tiles and clip to this bin.
         int x0 = int(floor((xmin - xy0.x) * SX));
         int x1 = int(ceil((xmax - xy0.x) * SX));
-        int xr = int(ceil((right_edge - xy0.x) * SX));
         int y0 = int(floor((ymin - xy0.y) * SY));
         int y1 = int(ceil((ymax - xy0.y) * SY));
         x0 = clamp(x0, 0, N_TILE_X);
         x1 = clamp(x1, x0, N_TILE_X);
-        xr = clamp(xr, 0, N_TILE_X);
         y0 = clamp(y0, 0, N_TILE_Y);
         y1 = clamp(y1, y0, N_TILE_Y);
-        float t = a + b * float(y0);
         for (uint y = y0; y < y1; y++) {
-            uint xx0 = clamp(int(floor(t - c)), x0, x1);
-            uint xx1 = clamp(int(ceil(t + c)), x0, x1);
-            for (uint x = xx0; x < xx1; x++) {
+            for (uint x = x0; x < x1; x++) {
                 atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
             }
-            if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
-                // Assign backdrop to all tiles to the right of the ray crossing the
-                // top edge of this tile, up to the right edge of the fill bbox.
-                float xray = t - 0.5 * b;
-                xx0 = max(int(ceil(xray)), 0);
-                for (uint x = xx0; x < xr; x++) {
-                    atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
-                }
-            }
-            t += b;
         }
         barrier();
 
         // We've computed coverage and other info for each element in the input, now for
         // the output stage. We'll do segments first using a more parallel algorithm.
 
+        /*
         uint seg_count = 0;
         for (uint i = 0; i < N_SLICE; i++) {
             seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
@@ -372,45 +333,29 @@ void main() {
             Segment seg = Segment(line.p0, line.p1, y_edge);
             Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
         }
+        */
 
         // Output non-segment elements for this tile. The thread does a sequential walk
         // through the non-segment elements, and for segments, count and backdrop are
         // aggregated using bit counting.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
-        uint bd_bitmap = sh_backdrop[0][th_ix];
-        uint bd_sign = sh_bd_sign[0];
-        uint is_segment = sh_is_segment[0];
-        uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
-        seg_count = 0;
         while (true) {
-            uint nonseg_bitmap = bitmap & ~is_segment;
-            if (nonseg_bitmap == 0) {
-                backdrop += count_backdrop(bd_bitmap, bd_sign);
-                seg_count += bitCount(bitmap & is_segment);
+            if (bitmap == 0) {
                 slice_ix++;
                 if (slice_ix == N_SLICE) {
                     break;
                 }
                 bitmap = sh_bitmaps[slice_ix][th_ix];
-                bd_bitmap = sh_backdrop[slice_ix][th_ix];
-                bd_sign = sh_bd_sign[slice_ix];
-                is_segment = sh_is_segment[slice_ix];
-                nonseg_bitmap = bitmap & ~is_segment;
-                if (nonseg_bitmap == 0) {
+                if (bitmap == 0) {
                     continue;
                 }
             }
-            uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
+            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
             uint element_ix = sh_elements[element_ref_ix];
 
-            // Bits up to and including the lsb
-            uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
-            backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
-            seg_count += bitCount(bitmap & bd_mask & is_segment);
-            // Clear bits that have been consumed.
-            bd_bitmap &= ~bd_mask;
-            bitmap &= ~bd_mask;
+            // Clear LSB
+            bitmap &= bitmap - 1;
 
             // At this point, we read the element again from global memory.
             // If that turns out to be expensive, maybe we can pack it into
@@ -419,6 +364,7 @@ void main() {
             tag = Annotated_tag(ref);
 
             switch (tag) {
+            /*
             case Annotated_Fill:
                 if (last_chunk_n > 0 || seg_count > 0) {
                     SegChunkRef chunk_ref = SegChunkRef(0);
@@ -460,63 +406,34 @@ void main() {
                 seg_count = 0;
                 backdrop = 0;
                 break;
+            */
             case Annotated_Stroke:
-                // TODO: reduce divergence & code duplication? Much of the
-                // fill and stroke processing is in common.
-                if (last_chunk_n > 0 || seg_count > 0) {
-                    SegChunkRef chunk_ref = SegChunkRef(0);
-                    if (seg_count > 0) {
-                        chunk_ref = alloc_seg_chunk();
-                        SegChunk chunk;
-                        chunk.n = seg_count;
-                        chunk.next = SegChunkRef(0);
-                        uint seg_offset = seg_alloc + seg_start * Segment_size;
-                        chunk.segs = SegmentRef(seg_offset);
-                        SegChunk_write(chunk_ref, chunk);
+                // Because the only elements we're processing right now are
+                // paths, we can just use the element index as the path index.
+                // In future, when other kinds of elements are processed too,
+                // the path index should probably be stored in the annotated element.
+                uint path_ix = element_ix;
+                Path path = Path_read(PathRef(path_ix * Path_size));
+                // It may be we have a strong guarantee this will always be `true`, but
+                // I prefer not to take chances.
+                if (tile_x >= path.bbox.x && tile_x < path.bbox.z && tile_y >= path.bbox.y && tile_y < path.bbox.w) {
+                    uint stride = path.bbox.z - path.bbox.x;
+                    uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x;
+                    Tile tile = Tile_read(Tile_index(path.tiles, tile_subix));
+                    if (tile.tile.offset != 0) {
+                        AnnoStroke stroke = Annotated_Stroke_read(ref);
+                        CmdStroke cmd_stroke;
+                        cmd_stroke.tile_ref = tile.tile.offset;
+                        cmd_stroke.half_width = 0.5 * stroke.linewidth;
+                        cmd_stroke.rgba_color = stroke.rgba_color;
+                        alloc_cmd(cmd_ref, cmd_limit);
+                        Cmd_Stroke_write(cmd_ref, cmd_stroke);
+                        cmd_ref.offset += Cmd_size;
                     }
-                    if (last_chunk_n > 0) {
-                        SegChunk chunk;
-                        chunk.n = last_chunk_n;
-                        chunk.next = chunk_ref;
-                        chunk.segs = last_chunk_segs;
-                        SegChunk_write(last_chunk_ref, chunk);
-                    } else {
-                        first_seg_chunk = chunk_ref;
-                    }
-
-                    AnnoStroke stroke = Annotated_Stroke_read(ref);
-                    CmdStroke cmd_stroke;
-                    cmd_stroke.seg_ref = first_seg_chunk;
-                    cmd_stroke.half_width = 0.5 * stroke.linewidth;
-                    cmd_stroke.rgba_color = stroke.rgba_color;
-                    alloc_cmd(cmd_ref, cmd_limit);
-                    Cmd_Stroke_write(cmd_ref, cmd_stroke);
-                    cmd_ref.offset += Cmd_size;
-                    last_chunk_n = 0;
                 }
-                seg_start += seg_count;
-                seg_count = 0;
-                break;
-            default:
-                // This shouldn't happen, but just in case.
-                seg_start++;
                 break;
             }
         }
-        if (seg_count > 0) {
-            SegChunkRef chunk_ref = alloc_seg_chunk();
-            if (last_chunk_n > 0) {
-                SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
-            } else {
-                first_seg_chunk = chunk_ref;
-            }
-            // TODO: free two registers by writing count and segments ref now,
-            // as opposed to deferring SegChunk write until all fields are known.
-            last_chunk_ref = chunk_ref;
-            last_chunk_n = seg_count;
-            uint seg_offset = seg_alloc + seg_start * Segment_size;
-            last_chunk_segs = SegmentRef(seg_offset);
-        }
         barrier();
 
         rd_ix += N_TILE;
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index 4b7e1c4..6b2afaf 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 2c068aa..0ecda68 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -17,9 +17,14 @@ layout(set = 0, binding = 0) buffer PtclBuf {
     uint[] ptcl;
 };
 
-layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
+layout(set = 0, binding = 1) buffer TileBuf {
+    uint[] tile;
+};
+
+layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
 
 #include "ptcl.h"
+#include "tile.h"
 
 #include "setup.h"
 
@@ -57,28 +62,25 @@ void main() {
             CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
             float df[CHUNK];
             for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
-            SegChunkRef seg_chunk_ref = stroke.seg_ref;
+            TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
             do {
-                SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
-                SegmentRef segs = seg_chunk.segs;
-                for (int i = 0; i < seg_chunk.n; i++) {
-                    Segment seg = Segment_read(Segment_index(segs, i));
-                    vec2 line_vec = seg.end - seg.start;
-                    for (uint k = 0; k < CHUNK; k++) {
-                        vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
-                        dpos.y += float(k * CHUNK_DY);
-                        float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
-                        df[k] = min(df[k], length(line_vec * t - dpos));
-                    }
+                TileSeg seg = TileSeg_read(tile_seg_ref);
+                vec2 line_vec = seg.end - seg.start;
+                for (uint k = 0; k < CHUNK; k++) {
+                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
+                    dpos.y += float(k * CHUNK_DY);
+                    float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
+                    df[k] = min(df[k], length(line_vec * t - dpos));
                 }
-                seg_chunk_ref = seg_chunk.next;
-            } while (seg_chunk_ref.offset != 0);
+                tile_seg_ref = seg.next;
+            } while (tile_seg_ref.offset != 0);
             fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
             for (uint k = 0; k < CHUNK; k++) {
                 float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
                 rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
             }
             break;
+        /*
         case Cmd_Fill:
             CmdFill fill = Cmd_Fill_read(cmd_ref);
             // Probably better to store as float, but conversion is no doubt cheap.
@@ -117,6 +119,7 @@ void main() {
                 rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
             }
             break;
+        */
         case Cmd_Solid:
             CmdSolid solid = Cmd_Solid_read(cmd_ref);
             fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 5215e2f..cb27407 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index ff79925..5a4b78c 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -78,7 +78,7 @@ void main() {
     int stride = bbox.z - bbox.x;
     int base = (y0 - bbox.y) * stride - bbox.x;
     // TODO: can be tighter, use c to bound width
-    uint n_tile_alloc = uint(stride * (bbox.w - bbox.y));
+    uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
     // Consider using subgroups to aggregate atomic add.
     uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
     TileSeg tile_seg;
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index ed212d7..cf0d4b9 100644
Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index dd1f9a8..d337598 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -68,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
 }
 
 struct CmdStroke {
-    SegChunkRef seg_ref;
+    uint tile_ref;
     float half_width;
     uint rgba_color;
 };
@@ -220,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
     uint raw1 = ptcl[ix + 1];
     uint raw2 = ptcl[ix + 2];
     CmdStroke s;
-    s.seg_ref = SegChunkRef(raw0);
+    s.tile_ref = raw0;
     s.half_width = uintBitsToFloat(raw1);
     s.rgba_color = raw2;
     return s;
@@ -228,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
 
 void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.seg_ref.offset;
+    ptcl[ix + 0] = s.tile_ref;
     ptcl[ix + 1] = floatBitsToUint(s.half_width);
     ptcl[ix + 2] = s.rgba_color;
 }
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 19e9b43..b568827 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -183,9 +183,9 @@ impl<D: Device> Renderer<D> {
         device.write_buffer(&scene_buf, &scene)?;
 
         let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
-        let anno_buf = device.create_buffer(64 * 1024 * 1024, host)?;
-        let pathseg_buf = device.create_buffer(64 * 1024 * 1024, host)?;
-        let tile_buf = device.create_buffer(64 * 1024 * 1024, host)?;
+        let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
+        let pathseg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
+        let tile_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
         let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
@@ -228,10 +228,10 @@ impl<D: Device> Renderer<D> {
         let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
 
         // TODO: constants
-        let bin_alloc_start = ((n_elements + 255) & !255) * 8;
+        let bin_alloc_start = ((n_paths + 255) & !255) * 8;
         device.write_buffer(
             &bin_alloc_buf_host,
-            &[n_elements as u32, 0, bin_alloc_start as u32],
+            &[n_paths as u32, 0, bin_alloc_start as u32],
         )?;
         let bin_code = include_bytes!("../shader/binning.spv");
         let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
@@ -250,16 +250,20 @@ impl<D: Device> Renderer<D> {
             &[n_elements as u32, coarse_alloc_start as u32],
         )?;
         let coarse_code = include_bytes!("../shader/coarse.spv");
-        let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
+        let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 5, 0)?;
         let coarse_ds = device.create_descriptor_set(
             &coarse_pipeline,
-            &[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
+            &[&anno_buf, &bin_buf, &tile_buf, &coarse_alloc_buf_dev, &ptcl_buf],
             &[],
         )?;
 
         let k4_code = include_bytes!("../shader/kernel4.spv");
-        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
-        let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
+        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 2, 1)?;
+        let k4_ds = device.create_descriptor_set(
+            &k4_pipeline,
+            &[&ptcl_buf, &tile_buf],
+            &[&image_dev],
+        )?;
 
         Ok(Renderer {
             scene_buf,
@@ -328,32 +332,31 @@ impl<D: Device> Renderer<D> {
             &self.path_ds,
             (((self.n_pathseg + 31) / 32) as u32, 1, 1),
         );
-        /*
+        cmd_buf.write_timestamp(&query_pool, 3);
+        // Note: this barrier is not needed as an actual dependency between
+        // pipeline stages, but I am keeping it in so that timer queries are
+        // easier to interpret.
+        cmd_buf.memory_barrier();
         cmd_buf.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
-            (((self.n_elements + 255) / 256) as u32, 1, 1),
+            (((self.n_paths + 255) / 256) as u32, 1, 1),
         );
-        */
-        cmd_buf.write_timestamp(&query_pool, 3);
+        cmd_buf.write_timestamp(&query_pool, 4);
         cmd_buf.memory_barrier();
-        /*
         cmd_buf.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds,
             (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
         );
-        */
-        cmd_buf.write_timestamp(&query_pool, 4);
+        cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
-        /*
         cmd_buf.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
             ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
         );
-        cmd_buf.write_timestamp(&query_pool, 5);
-        */
+        cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs
index 140c42d..0aac61a 100644
--- a/piet-gpu/src/pico_svg.rs
+++ b/piet-gpu/src/pico_svg.rs
@@ -49,8 +49,8 @@ impl PicoSvg {
         for item in &self.items {
             match item {
                 Item::Fill(fill_item) => {
-                    rc.fill(&fill_item.path, &fill_item.color);
-                    //rc.stroke(&fill_item.path, &fill_item.color, 1.0);
+                    //rc.fill(&fill_item.path, &fill_item.color);
+                    rc.stroke(&fill_item.path, &fill_item.color, 1.0);
                 }
                 Item::Stroke(stroke_item) => {
                     rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);