diff --git a/.vscode/settings.json b/.vscode/settings.json
index 3a9bfe5..43d84c0 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,10 +1,12 @@
 {
   "wgsl-analyzer.customImports": {
     "bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
+    "bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl",
     "config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
     "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
     "segment": "${workspaceFolder}/piet-wgsl/shader/shared/segment.wgsl",
-    "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl"
+    "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl",
+    "ptcl": "${workspaceFolder}/piet-wgsl/shader/shared/ptcl.wgsl"
   },
   "wgsl-analyzer.diagnostics.nagaVersion": "main"
 }
diff --git a/piet-wgsl/shader/binning.wgsl b/piet-wgsl/shader/binning.wgsl
index 1282164..d16a620 100644
--- a/piet-wgsl/shader/binning.wgsl
+++ b/piet-wgsl/shader/binning.wgsl
@@ -19,6 +19,7 @@
 #import config
 #import drawtag
 #import bbox
+#import bump
 
 @group(0) @binding(0)
 var<storage> config: Config;
@@ -26,7 +27,6 @@ var<storage> config: Config;
 @group(0) @binding(1)
 var<storage> draw_monoids: array<DrawMonoid>;
 
-
 @group(0) @binding(2)
 var<storage> path_bbox_buf: array<PathBBox>;
 
@@ -37,10 +37,6 @@ var<storage> clip_bbox_buf: array<vec4<f32>>;
 var<storage, read_write> intersected_bbox: array<vec4<f32>>;
 
 // TODO: put into shared include
-// TODO: robust memory (failure flags)
-struct BumpAllocators {
-    binning: atomic<u32>,
-}
 
 @group(0) @binding(5)
 var<storate, read_write> bump: BumpAllocators;
@@ -48,6 +44,7 @@ var<storate, read_write> bump: BumpAllocators;
 @group(0) @binding(6)
 var<storage, read_write> bin_data: array<u32>;
 
+// TODO: put in common place
 struct BinHeader {
     element_count: u32,
     chunk_offset: u32,
@@ -56,14 +53,6 @@ struct BinHeader {
 @group(0) @binding(7)
 var<storage, read_write> bin_header: array<BinHeader>;
 
-// These should probably be in a common block.
-let TILE_WIDTH = 16u;
-let TILE_HEIGHT = 16u;
-// Number of tiles per bin
-let N_TILE_X = 16u;
-let N_TILE_Y = 16u;
-let N_TILE = N_TILE_X * N_TILE_Y;
-
 // conversion factors from coordinates to bin
 let SX = 1.0 / f32(N_TILE_X * TILE_WIDTH);
 let SY = 1.0 / f32(N_TILE_Y * TILE_HEIGHT);
diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
new file mode 100644
index 0000000..159a3d7
--- /dev/null
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -0,0 +1,327 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// The coarse rasterization stage.
+
+#import config
+#import bump
+#import drawtag
+#import ptcl
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
+var<storage> draw_monoids: array<DrawMonoid>;
+
+// TODO: dedup
+struct BinHeader {
+    element_count: u32,
+    chunk_offset: u32,
+}
+
+@group(0) @binding(3)
+var<storage> bin_headers: array<BinHeader>;
+
+@group(0) @binding(4)
+var<storage, read_write> bin_data: array<u32>;
+
+@group(0) @binding(5)
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(6)
+var<storage, read_write> ptcl: array<u32>;
+
+// TODO: put this in the right place
+struct Path {
+    // bounding box in pixels
+    bbox: vec4<u32>,
+    // offset (in u32's) to tile rectangle
+    tiles: u32,
+}
+
+struct Tile {
+    backdrop: i32,
+    segments: u32,
+}
+
+@group(0) @binding(7)
+var<storage> paths: array<Path>;
+
+@group(0) @binding(8)
+var<storage> tiles: array<Tile>;
+
+
+// Much of this code assumes WG_SIZE == N_TILE. If these diverge, then
+// a fair amount of fixup is needed.
+let WG_SIZE = 256u;
+let N_SLICE = WG_SIZE / 32u;
+
+var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
+var<workgroup> sh_part_count: array<u32, WG_SIZE>;
+var<workgroup> sh_part_offsets: array<u32, WG_SIZE>;
+var<workgroup> sh_drawobj_ix: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_stride: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_width: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_x0: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_y0: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_base: array<u32, WG_SIZE>;
+
+// helper functions for writing ptcl
+
+var<private> cmd_offset: u32;
+var<private> cmd_limit: u32;
+
+// Make sure there is space for a command of given size, plus a jump if needed
+fn alloc_cmd(size: u32) {
+    if cmd_offset + size >= cmd_limit {
+        let new_cmd = atomicAdd(&bump.ptcl, PTCL_INCREMENT);
+        // TODO: robust memory
+        ptcl[cmd_offset] = CMD_JUMP;
+        ptcl[cmd_offset + 1u] = new_cmd;
+        cmd_offset = new_cmd;
+        cmd_limit = cmd_offset + (PTCL_INCREMENT - PTCL_HEADROOM);
+    }
+}
+
+fn write_path(tile: Tile, linewidth: f32) {
+    // TODO: take flags
+    // TODO: handle stroke
+    alloc_cmd(3u);
+    if tile.segments != 0u {
+        let fill = CmdFill(tile.segments, tile.backdrop);
+        ptcl[cmd_offset] = CMD_FILL;
+        ptcl[cmd_offset + 1u] = fill.tile;
+        ptcl[cmd_offset + 2u] = u32(fill.backdrop);
+        cmd_offset += 3u;
+    } else {
+        ptcl[cmd_offset] = CMD_SOLID;
+        cmd_offset += 1u;
+    }
+}
+
+fn write_color(color: CmdColor) {
+    alloc_cmd(2u);
+    ptcl[cmd_offset] = CMD_FILL;
+    ptcl[cmd_offset + 1u] = color.rgba_color;
+    cmd_offset += 2u;
+
+}
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
+    let bin_ix = width_in_bins * wg_id.y + wg_id.x;
+    let n_partitions = (config.n_drawobj + N_TILE - 1u) / N_TILE;
+
+    // Coordinates of the top left of this bin, in tiles.
+    let bin_tile_x = N_TILE_X * wg_id.x;
+    let bin_tile_y = N_TILE_Y * wg_id.y;
+
+    let tile_x = local_id.x % N_TILE_X;
+    let tile_y = local_id.y % N_TILE_Y;
+    let this_tile_ix = (bin_tile_y + tile_y) * config.width_in_tiles + bin_tile_x + tile_x;
+    cmd_offset = this_tile_ix * PTCL_INITIAL_ALLOC;
+    cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM);
+    // TODO: clip state
+    let clip_zero_depth = 0u;
+
+    var partition_ix = 0u;
+    var rd_ix = 0u;
+    var wr_ix = 0u;
+    var part_start_ix = 0u;
+    var ready_ix = 0u;
+    // TODO: blend state
+    
+    while true {
+        for (var i = 0u; i < N_SLICE; i += 1u) {
+            atomicStore(&sh_bitmaps[i][local_id.x], 0u);
+        }
+
+        while true {
+            if ready_ix == wr_ix && partition_ix < n_partitions {
+                part_start_ix = ready_ix;
+                var count = 0u;
+                if partition_ix + local_id.x < n_partitions {
+                    let in_ix = (partition_ix + local_id.x) * N_TILE + bin_ix;
+                    let bin_header = bin_headers[in_ix];
+                    count = bin_header.element_count;
+                    sh_part_offsets[local_id.x] = bin_header.chunk_offset;
+                }
+                // prefix sum the element counts
+                for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+                    sh_part_count[local_id.x] = count;
+                    workgroupBarrier();
+                    if local_id.x >= (1u << i) {
+                        count += sh_part_count[local_id - (1u << i)];
+                    }
+                    workgroupBarrier();
+                }
+                sh_part_count[local_id.x] = part_start_ix + count;
+                workgroupBarrier();
+                ready_ix = sh_part_count[WG_SIZE - 1u];
+                partition_ix += WG_SIZE;
+            }
+            // use binary search to find draw object to read
+            var ix = rd_ix + local_id.x;
+            if ix >= wr_ix && ix < ready_ix {
+                var part_ix = 0u;
+                for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+                    let probe = part_ix + ((N_TILE / 2u) >> i);
+                    if ix >= sh_part_count[probe - 1u] {
+                        part_ix = probe;
+                    }
+                }
+                ix -= select(part_start_ix, sh_part_count[part_ix - 1u], part_ix > 0u);
+                let offset = sh_part_offsets[part_ix];
+                sh_drawobj_ix[local_id.x] = bin_data[offset + ix];
+            }
+            wr_ix = min(rd_ix + N_TILE, ready_ix);
+            if wr_ix - rd_ix >= N_TILE || (wr_ix >= ready_ix && partition_ix >= n_partitions) {
+                break;
+            }
+        }
+        // At this point, sh_drawobj_ix[0.. wr_ix - rd_ix] contains merged binning results.
+        var tag = DRAWTAG_NOP;
+        var drawobj_ix: u32;
+        if local_id.x + rd_ix < wr_ix {
+            drawobj_ix = sh_drawobj_ix[local_id.x];
+            tag = scene[config.drawtag_base + drawobj_ix];
+        }
+
+        var tile_count = 0u;
+        // I think this predicate is the same as the last, maybe they can be combined
+        if tag != DRAWTAG_NOP {
+            let path_ix = draw_monoids[drawobj_ix].path_ix;
+            let path = paths[path_ix];
+            let stride = path.bbox.z - path.bbox.x;
+            sh_tile_stride[local_id.x] = stride;
+            let dx = i32(path.bbox.x) - i32(bin_tile_x);
+            let dy = i32(path.bbox.y) - i32(bin_tile_y);
+            let x0 = clamp(dx, 0, i32(N_TILE_X));
+            let y0 = clamp(dy, 0, i32(N_TILE_Y));
+            let x1 = clamp(i32(path.bbox.z) - i32(bin_tile_x), 0, i32(N_TILE_X));
+            let y1 = clamp(i32(path.bbox.w) - i32(bin_tile_y), 0, i32(N_TILE_Y));
+            sh_tile_width[local_id.x] = u32(x1 - x0);
+            sh_tile_x0[local_id.x] = u32(x0);
+            sh_tile_y0[local_id.x] = u32(y0);
+            tile_count = u32(x1 - x0) * u32(y1 - y0);
+            // base relative to bin
+            let base = path.tiles - u32(dy * i32(stride) + dx);
+            sh_tile_base[local_id.x] = base;
+            // TODO: there's a write_tile_alloc here in the source, not sure what it's supposed to do
+        }
+
+        // Prefix sum of tile counts
+        sh_tile_count[local_id.x] = tile_count;
+        for (var i = 0; i < firstTrailingBit(N_TILE); i += 1u) {
+            workgroupBarrier();
+            if local_id.x >= (1u << i) {
+                tile_count += sh_tile_count[local_id.x - (1u << i)];
+            }
+            workgroupBarrier();
+            sh_tile_count[local_id.x] = tile_count;
+        }
+        workgroupBarrier();
+        let total_tile_count = sh_tile_count[N_TILE - 1u];
+        // Parallel iteration over all tiles
+        for (var ix = local_id.x; ix < total_tile_count; ix += N_TILE) {
+            // Binary search to find draw object which contains this tile
+            var el_ix = 0u;
+            for (var i = 0u; i < firstTrailingBit(N_TILE); i += 1u) {
+                let probe = el_ix + ((N_TILE / 2u) >> i);
+                if ix >= sh_tile_count[probe - 1u] {
+                    el_ix = probe;
+                }
+            }
+            drawobj_ix = sh_drawobj_ix[el_ix];
+            tag = scene[config.drawtag_base + drawobj_ix];
+            // TODO: clip logic
+            let seq_ix = ix - select(0u, sh_tile_count[el_ix - 1u], el_ix > 0u);
+            let width = sh_tile_width[el_ix];
+            let x = sh_tile_x0[el_ix] + seq_ix % width;
+            let y = sh_tile_y0[el_ix] + seq_ix / width;
+            let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * y + x;
+            let tile = tiles[tile_ix];
+            // TODO: this predicate becomes more interesting with clip
+            let include_tile = tile.segments != 0u || tile.backdrop != 0;
+            if include_tile {
+                let el_slice = el_ix / 32u;
+                let el_mask = 1u << (el_ix & 31u);
+                atomicOr(&sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
+            }
+        }
+        workgroupBarrier();
+        // At this point bit drawobj % 32 is set in sh_bitmaps[drawobj / 32][y * N_TILE_X + x]
+        // if drawobj touches tile (x, y).
+
+        // Write per-tile command list for this tile
+        var slice_ix = 0u;
+        var bitmap = atomicLoad(&sh_bitmaps[0u][local_id.x]);
+        while true {
+            if bitmap == 0u {
+                slice_ix += 1u;
+                // potential optimization: make iteration limit dynamic
+                if slice_ix == N_SLICE {
+                    break;
+                }
+                bitmap = atomicLoad(&sh_bitmaps[slice_ix][local_id.x]);
+                if bitmap == 0u {
+                    continue;
+                }
+            }
+            let el_ix = slice_ix * 32u + firstTrailingBit(bitmap);
+            drawobj_ix = sh_drawobj_ix[el_ix];
+            // clear LSB of bitmap, using bit magic
+            bitmap &= bitmap - 1u;
+            let drawtag = scene[config.drawtag_base + drawobj_ix];
+            let dm = draw_monoids[drawobj_ix];
+            let dd = config.drawdata_base + dm.scene_offset;
+            if clip_zero_depth == 0u {
+                let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
+                let tile = tiles[tile_ix];
+                switch drawtag {
+                    case DRAWTAG_FILL_COLOR: {
+                        // TODO: get linewidth from draw object
+                        let linewidth = -1.0;
+                        let rgba_color = scene[dd];
+                        write_path(tile, linewidth);
+                        write_color(CmdColor(rgba_color));
+                    }
+                    default: {}
+                }
+            }
+        }
+
+        rd_ix += N_TILE;
+        if rd_ix >= ready_ix && partition_ix >= n_partitions {
+            break;
+        }
+        workgroupBarrier();
+    }
+    if bin_tile_x < config.width_in_tiles && bin_tile_y < config.height_in_tiles {
+        ptcl[cmd_offset] = CMD_END;
+        // TODO: blend stack allocation
+    }
+}
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index 2169dd4..21edcfe 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -38,7 +38,7 @@ var<storage> path_bbox: array<PathBbox>;
 @group(0) @binding(5)
 var<storage, read_write> info: array<u32>;
 
-let WG_SIZE = 256;
+let WG_SIZE = 256u;
 
 var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;
 
@@ -119,7 +119,7 @@ fn main(
                 let r0 = bitcast<f32>(scene[dd + 5u]);
                 let r1 = bitcast<f32>(scene[dd + 6u]);
                 let inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
-                let inv_mat = inv_det * vec4(mat.w, -mat.y, -mat.z, mat.x);
+                let inv_mat = inv_det * vec4<f32>(mat.w, -mat.y, -mat.z, mat.x);
                 var inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
                 inv_tr += p0;
                 let center1 = p1 - p0;
diff --git a/piet-wgsl/shader/shared/bbox.wgsl b/piet-wgsl/shader/shared/bbox.wgsl
index c0eeb1c..fb7728b 100644
--- a/piet-wgsl/shader/shared/bbox.wgsl
+++ b/piet-wgsl/shader/shared/bbox.wgsl
@@ -25,6 +25,6 @@ struct PathBbox {
     trans_ix: u32,
 }
 
-fn bbox_intersect(a: vec4<f32>, b: vec4<f32>) -> f32 {
-    return vec4(max(a.xy, b.xy), min(a.zyw, b.zw));
+fn bbox_intersect(a: vec4<f32>, b: vec4<f32>) -> vec4<f32> {
+    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
 }
diff --git a/piet-wgsl/shader/shared/bump.wgsl b/piet-wgsl/shader/shared/bump.wgsl
new file mode 100644
index 0000000..6d9a225
--- /dev/null
+++ b/piet-wgsl/shader/shared/bump.wgsl
@@ -0,0 +1,21 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// TODO: robust memory (failure flags)
+struct BumpAllocators {
+    binning: atomic<u32>,
+    ptcl: atomic<u32>,
+}
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index cdb0abc..0a91782 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -20,8 +20,28 @@ struct Config {
 
     n_drawobj: u32,
 
-    // offsets within config file (in u32 units)
+    // offsets within scene buffer (in u32 units)
     // Note: this is a difference from piet-gpu, which is in bytes
     drawtag_base: u32,
     drawdata_base: u32,
 }
+
+// Geometry of tiles and bins
+
+
+let TILE_WIDTH = 16u;
+let TILE_HEIGHT = 16u;
+// Number of tiles per bin
+let N_TILE_X = 16u;
+let N_TILE_Y = 16u;
+let N_TILE = N_TILE_X * N_TILE_Y;
+
+// Should ptcl stuff move to a separate import?
+
+// Layout of per-tile command list
+// Initial allocation, in u32's.
+let PTCL_INITIAL_ALLOC = 64u;
+let PTCL_INCREMENT = 256u;
+
+// Amount of space taken by jump
+let PTCL_HEADROOM = 2u;
diff --git a/piet-wgsl/shader/shared/drawtag.wgsl b/piet-wgsl/shader/shared/drawtag.wgsl
index c318e53..25608bd 100644
--- a/piet-wgsl/shader/shared/drawtag.wgsl
+++ b/piet-wgsl/shader/shared/drawtag.wgsl
@@ -31,11 +31,11 @@ struct DrawMonoid {
 // version of the draw monoid.
 let DRAWTAG_NOP = 0u;
 let DRAWTAG_FILL_COLOR = 0x44u;
-let DRAWTAG_FILL_LIN_GRADIENT = 0x114;
-let DRAWTAG_FILL_RAD_GRADIENT = 0x2dc;
-let DRAWTAG_FILL_IMAGE = 0x48;
-let DRAWTAG_BEGIN_CLIP = 0x05;
-let DRAWTAG_END_CLIP = 0x25;
+let DRAWTAG_FILL_LIN_GRADIENT = 0x114u;
+let DRAWTAG_FILL_RAD_GRADIENT = 0x2dcu;
+let DRAWTAG_FILL_IMAGE = 0x48u;
+let DRAWTAG_BEGIN_CLIP = 0x05u;
+let DRAWTAG_END_CLIP = 0x25u;
 
 fn draw_monoid_identity() -> DrawMonoid {
     return DrawMonoid();
@@ -49,8 +49,7 @@ fn combine_draw_monoid(a: DrawMonoid, b: DrawMonoid) {
     c.info_offset = a.info_offset + b.info_offset;
 }
 
-fn map_draw_tag(tag_word: u32) -> DawMonoid {
-    let has_path = ;
+fn map_draw_tag(tag_word: u32) -> DrawMonoid {
     var c: DrawMonoid;
     c.path_ix = u32(tag_word != DRAWTAG_NOP);
     c.clip_ix = tag_word & 1u;
diff --git a/piet-wgsl/shader/shared/ptcl.wgsl b/piet-wgsl/shader/shared/ptcl.wgsl
new file mode 100644
index 0000000..516189d
--- /dev/null
+++ b/piet-wgsl/shader/shared/ptcl.wgsl
@@ -0,0 +1,38 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Tags for PTCL commands
+let CMD_END = 0u;
+let CMD_FILL = 1u;
+let CMD_SOLID = 3u;
+let CMD_COLOR = 5u;
+let CMD_JUMP = 11u;
+
+// The individual PTCL structs are written here, but read/write is by
+// hand in the relevant shaders
+
+struct CmdFill {
+    tile: u32,
+    backdrop: i32,
+}
+
+struct CmdJump {
+    target: u32,
+}
+
+struct CmdColor {
+    rgba_color: u32,
+}