Merge pull request #203 from linebender/clip_checkpoint

Extend pipeline
This commit is contained in:
Chad Brokaw 2022-11-18 17:28:59 -05:00 committed by GitHub
commit de998220db
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
39 changed files with 4277 additions and 122 deletions

12
.vscode/settings.json vendored
View file

@ -1,8 +1,16 @@
{
"wgsl-analyzer.customImports": {
"bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
"bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl",
"clip": "${workspaceFolder}/piet-wgsl/shader/shared/clip.wgsl",
"config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
"cubic": "${workspaceFolder}/piet-wgsl/shader/shared/cubic.wgsl",
"drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
"segment": "${workspaceFolder}/piet-wgsl/shader/shared/segment.wgsl",
"pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl",
"ptcl": "${workspaceFolder}/piet-wgsl/shader/shared/ptcl.wgsl",
"tile": "${workspaceFolder}/piet-wgsl/shader/shared/tile.wgsl"
},
"wgsl-analyzer.diagnostics.nagaVersion": "main",
"wgsl-analyzer.preprocessor.shaderDefs": ["full"]
}

2
Cargo.lock generated
View file

@ -1174,10 +1174,12 @@ dependencies = [
"bytemuck",
"env_logger",
"futures-intrusive",
"kurbo 0.8.3",
"parking_lot",
"piet-scene",
"png",
"pollster",
"roxmltree",
"wgpu",
]

View file

@ -14,4 +14,8 @@ parking_lot = "0.12"
bytemuck = { version = "1.12.1", features = ["derive"] }
png = "0.17.6"
piet-scene = { path = "../piet-scene", features = ["kurbo"] }
# for picosvg, should be split out
roxmltree = "0.13"
kurbo = "0.8.3"

View file

@ -0,0 +1,84 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Prefix sum for dynamically allocated backdrops
#import config
#import tile
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> paths: array<Path>;
@group(0) @binding(2)
var<storage, read_write> tiles: array<Tile>;
let WG_SIZE = 256u;
var<workgroup> sh_row_width: array<u32, WG_SIZE>;
var<workgroup> sh_row_count: array<u32, WG_SIZE>;
var<workgroup> sh_offset: array<u32, WG_SIZE>;
// Propagate backdrop counts across each tile row of every path.
// One thread per draw object; rows of all paths in the workgroup are then
// processed cooperatively (a thread may work on rows of another thread's path).
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
) {
    let drawobj_ix = global_id.x;
    var row_count = 0u;
    if drawobj_ix < config.n_drawobj {
        // TODO: when rectangles, path and draw obj are not the same
        let path = paths[drawobj_ix];
        // Width/height of the path's tile bbox, in tiles.
        sh_row_width[local_id.x] = path.bbox.z - path.bbox.x;
        row_count = path.bbox.w - path.bbox.y;
        // Start of this path's tile allocation.
        sh_offset[local_id.x] = path.tiles;
    }
    sh_row_count[local_id.x] = row_count;
    // Prefix sum of row counts (inclusive Hillis-Steele scan over the workgroup)
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        workgroupBarrier();
        if local_id.x >= (1u << i) {
            row_count += sh_row_count[local_id.x - (1u << i)];
        }
        workgroupBarrier();
        sh_row_count[local_id.x] = row_count;
    }
    workgroupBarrier();
    let total_rows = sh_row_count[WG_SIZE - 1u];
    // Each thread strides over rows; binary-search the scanned counts to find
    // which element (path) owns row `row`.
    for (var row = local_id.x; row < total_rows; row += WG_SIZE) {
        var el_ix = 0u;
        for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
            let probe = el_ix + ((WG_SIZE / 2u) >> i);
            if row >= sh_row_count[probe - 1u] {
                el_ix = probe;
            }
        }
        let width = sh_row_width[el_ix];
        if width > 0u {
            // Row index within this element's bbox.
            var seq_ix = row - select(0u, sh_row_count[el_ix - 1u], el_ix > 0u);
            var tile_ix = sh_offset[el_ix] + seq_ix * width;
            // Running (inclusive) sum of backdrops left-to-right across the row;
            // the first tile keeps its own value, subsequent tiles accumulate.
            var sum = tiles[tile_ix].backdrop;
            for (var x = 1u; x < width; x += 1u) {
                tile_ix += 1u;
                sum += tiles[tile_ix].backdrop;
                tiles[tile_ix].backdrop = sum;
            }
        }
    }
}

View file

@ -0,0 +1,45 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
#import config
@group(0) @binding(0)
var<storage> config: Config;
// Per-path bounding box plus draw parameters.
// The main entry below resets x0/y0 to i32 max and x1/y1 to i32 min, so the
// box starts "empty"; presumably later stages accumulate min/max into it —
// TODO confirm against the pathseg stage.
struct PathBbox {
    // Top-left corner (minimums).
    x0: i32,
    y0: i32,
    // Bottom-right corner (maximums).
    x1: i32,
    y1: i32,
    // Stroke line width; semantics of negative values not visible here
    // (elsewhere in this commit, linewidth < 0.0 selects the fill path).
    linewidth: f32,
    // Index of the associated transform.
    trans_ix: u32,
}
@group(0) @binding(1)
var<storage, read_write> path_bboxes: array<PathBbox>;
// Reset one PathBbox per thread to sentinel values: mins at i32 max,
// maxes at i32 min, so any subsequent min/max accumulation starts from
// an empty box.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
) {
    let path_ix = global_id.x;
    // Guard against threads beyond the path count.
    if path_ix >= config.n_path {
        return;
    }
    path_bboxes[path_ix].x0 = 0x7fffffff;
    path_bboxes[path_ix].y0 = 0x7fffffff;
    path_bboxes[path_ix].x1 = -0x80000000;
    path_bboxes[path_ix].y1 = -0x80000000;
}

View file

@ -0,0 +1,164 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// The binning stage
#import config
#import drawtag
#import bbox
#import bump
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> draw_monoids: array<DrawMonoid>;
@group(0) @binding(2)
var<storage> path_bbox_buf: array<PathBbox>;
@group(0) @binding(3)
var<storage> clip_bbox_buf: array<vec4<f32>>;
@group(0) @binding(4)
var<storage, read_write> intersected_bbox: array<vec4<f32>>;
@group(0) @binding(5)
var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(6)
var<storage, read_write> bin_data: array<u32>;
// TODO: put in common place
// TODO: put in common place
// Per-(bin, partition) header written by the binning stage:
// element_count draw objects for this bin, stored in bin_data starting
// at chunk_offset (allocated from bump.binning below).
struct BinHeader {
    element_count: u32,
    chunk_offset: u32,
}
@group(0) @binding(7)
var<storage, read_write> bin_header: array<BinHeader>;
// conversion factors from coordinates to bin
let SX = 0.00390625;
let SY = 0.00390625;
//let SX = 1.0 / f32(N_TILE_X * TILE_WIDTH);
//let SY = 1.0 / f32(N_TILE_Y * TILE_HEIGHT);
let WG_SIZE = 256u;
let N_SLICE = 8u;
//let N_SLICE = WG_SIZE / 32u;
var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
var<workgroup> sh_count: array<array<u32, N_TILE>, N_SLICE>;
var<workgroup> sh_chunk_offset: array<u32, N_TILE>;
// Bin draw objects into N_TILE_X x N_TILE_Y bins: one thread per draw
// object, one workgroup per partition of N_TILE draw objects. Coverage is
// recorded in shared bitmaps (bit = draw object, slot = bin), then compacted
// into bin_data with per-bin offsets from the bump allocator.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
) {
    // Clear coverage bitmaps for this workgroup.
    for (var i = 0u; i < N_SLICE; i += 1u) {
        atomicStore(&sh_bitmaps[i][local_id.x], 0u);
    }
    // Read inputs and determine coverage of bins
    let element_ix = global_id.x;
    var x0 = 0;
    var y0 = 0;
    var x1 = 0;
    var y1 = 0;
    if element_ix < config.n_drawobj {
        let draw_monoid = draw_monoids[element_ix];
        // Default clip box is effectively unbounded.
        var clip_bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
        if draw_monoid.clip_ix > 0u {
            clip_bbox = clip_bbox_buf[draw_monoid.clip_ix - 1u];
        }
        // For clip elements, clip_box is the bbox of the clip path,
        // intersected with enclosing clips.
        // For other elements, it is the bbox of the enclosing clips.
        // TODO check this is true
        let path_bbox = path_bbox_buf[draw_monoid.path_ix];
        let pb = vec4<f32>(vec4<i32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1));
        let bbox_raw = bbox_intersect(clip_bbox, pb);
        // TODO(naga): clunky expression a workaround for broken lhs swizzle
        // Clamp zw >= xy so an empty intersection yields a zero-area box.
        let bbox = vec4<f32>(bbox_raw.xy, max(bbox_raw.xy, bbox_raw.zw));
        intersected_bbox[element_ix] = bbox;
        // Convert to bin coordinates (SX/SY are 1 / bin size in pixels).
        x0 = i32(floor(bbox.x * SX));
        y0 = i32(floor(bbox.y * SY));
        x1 = i32(ceil(bbox.z * SX));
        y1 = i32(ceil(bbox.w * SY));
    }
    let width_in_bins = i32((config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X);
    let height_in_bins = i32((config.height_in_tiles + N_TILE_Y - 1u) / N_TILE_Y);
    x0 = clamp(x0, 0, width_in_bins);
    y0 = clamp(y0, 0, height_in_bins);
    x1 = clamp(x1, 0, width_in_bins);
    y1 = clamp(y1, 0, height_in_bins);
    // Zero-width range: collapse y too, otherwise the wrap-around walk
    // below (x never reaches x1) would not terminate.
    if x0 == x1 {
        y1 = y0;
    }
    var x = x0;
    var y = y0;
    let my_slice = local_id.x / 32u;
    let my_mask = 1u << (local_id.x & 31u);
    // Walk the covered bin rectangle row-major, setting this object's bit.
    while y < y1 {
        atomicOr(&sh_bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x += 1;
        if x == x1 {
            x = x0;
            y += 1;
        }
    }
    workgroupBarrier();
    // Allocate output segments
    // Per-bin inclusive count across bitmap slices (this thread now acts
    // as a bin, not a draw object).
    var element_count = 0u;
    for (var i = 0u; i < N_SLICE; i += 1u) {
        element_count += countOneBits(atomicLoad(&sh_bitmaps[i][local_id.x]));
        sh_count[i][local_id.x] = element_count;
    }
    // element_count is the number of draw objects covering this thread's bin
    let chunk_offset = atomicAdd(&bump.binning, element_count);
    sh_chunk_offset[local_id.x] = chunk_offset;
    bin_header[global_id.x].element_count = element_count;
    bin_header[global_id.x].chunk_offset = chunk_offset;
    workgroupBarrier();
    // loop over bbox of bins touched by this draw object
    x = x0;
    y = y0;
    while y < y1 {
        let bin_ix = y * width_in_bins + x;
        let out_mask = atomicLoad(&sh_bitmaps[my_slice][bin_ix]);
        // I think this predicate will always be true...
        if (out_mask & my_mask) != 0u {
            // Rank of this object among lower-numbered objects in the bin.
            var idx = countOneBits(out_mask & (my_mask - 1u));
            if my_slice > 0u {
                idx += sh_count[my_slice - 1u][bin_ix];
            }
            let offset = sh_chunk_offset[bin_ix];
            bin_data[offset + idx] = element_ix;
        }
        x += 1;
        if x == x1 {
            x = x0;
            y += 1;
        }
    }
}

View file

@ -0,0 +1,199 @@
#import config
#import bbox
#import clip
#import drawtag
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<i32>;
@group(0) @binding(2)
var<storage> path_bboxes: array<PathBbox>;
@group(0) @binding(3)
var<storage> reduced: array<Bic>;
@group(0) @binding(4)
var<storage> clip_els: array<ClipEl>;
@group(0) @binding(5)
var<storage, read_write> draw_monoids: array<DrawMonoid>;
@group(0) @binding(6)
var<storage, read_write> clip_bboxes: array<vec4<f32>>;
let WG_SIZE = 256u;
var<workgroup> sh_bic: array<Bic, 510 >;
var<workgroup> sh_stack: array<u32, WG_SIZE>;
var<workgroup> sh_stack_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_link: array<i32, WG_SIZE>;
// Find the link (predecessor) for stack element `ix` by walking the Bic
// sweep-up tree stored in sh_bic (leaves at [0, WG_SIZE), interior levels
// packed above). Accumulates combined Bic into *bic along the way.
// Returns the predecessor index - 1 when found within the workgroup, or a
// negative encoding (~(*bic).a) when the match lies in the cross-workgroup
// prefix stack.
fn search_link(bic: ptr<function, Bic>, ix: u32) -> i32 {
    var ix = ix;
    var j = 0u;
    // Ascend: combine left siblings until a combined Bic with open pushes
    // (b > 0) is found, meaning the match is under that subtree.
    while j < firstTrailingBit(WG_SIZE) {
        // Base offset of tree level j in sh_bic's packed layout.
        let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
        if ((ix >> j) & 1u) != 0u {
            let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
            if test.b > 0u {
                break;
            }
            *bic = test;
            ix -= 1u << j;
        }
        j += 1u;
    }
    if ix > 0u {
        // Descend back down, skipping subtrees whose combination leaves
        // no open pushes (b == 0).
        while j > 0u {
            j -= 1u;
            let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
            let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
            if test.b == 0u {
                *bic = test;
                ix -= 1u << j;
            }
        }
    }
    if ix > 0u {
        return i32(ix) - 1;
    } else {
        // Not found locally: encode depth into the prefix stack as a
        // negative number (~a).
        return i32(~0u - (*bic).a);
    }
}
// Load a clip input element, padding reads past n_clip with i32::MIN,
// which acts as a sentinel pop that can never match a push.
fn load_clip_inp(ix: u32) -> i32 {
    if ix >= config.n_clip {
        // Decimal form of i32::MIN; the hex literal 0x80000000 overflows
        // the i32 literal range in WGSL.
        return -2147483648;
    }
    return clip_inp[ix];
}
// Clip leaf stage: resolve each clip element's parent and compute the
// intersected clip bbox per element. Works on one workgroup's worth of
// clip stream elements; cross-workgroup state comes from `reduced` (Bic
// aggregates) and `clip_els` (the open-push stack snapshot).
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
) {
    // Suffix-combine the reduced Bic values of preceding workgroups.
    var bic: Bic;
    if local_id.x < wg_id.x {
        bic = reduced[local_id.x];
    }
    sh_bic[local_id.x] = bic;
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        workgroupBarrier();
        if local_id.x + (1u << i) < WG_SIZE {
            let other = sh_bic[local_id.x + (1u << i)];
            bic = bic_combine(bic, other);
        }
        workgroupBarrier();
        sh_bic[local_id.x] = bic;
    }
    workgroupBarrier();
    // Number of clip pushes still open when this workgroup starts.
    let stack_size = sh_bic[0].b;
    // TODO: if stack depth > WG_SIZE desired, scan here
    // binary search in stack
    let sp = WG_SIZE - 1u - local_id.x;
    var ix = 0u;
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        let probe = ix + ((WG_SIZE / 2u) >> i);
        if sp < sh_bic[probe].b {
            ix = probe;
        }
    }
    let b = sh_bic[ix].b;
    // "Unbounded" bbox; bbox_intersect with it is a no-op.
    var bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
    if sp < b {
        // Load the stack element this thread represents (deepest entries
        // at the highest local indices).
        let el = clip_els[ix * WG_SIZE + b - sp - 1u];
        sh_stack[local_id.x] = el.parent_ix;
        bbox = el.bbox;
    }
    // forward scan of bbox values of prefix stack
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        sh_stack_bbox[local_id.x] = bbox;
        workgroupBarrier();
        if local_id.x >= (1u << i) {
            bbox = bbox_intersect(sh_stack_bbox[local_id.x - (1u << i)], bbox);
        }
        workgroupBarrier();
    }
    sh_stack_bbox[local_id.x] = bbox;
    // Read input and compute Bic binary tree
    let inp = load_clip_inp(global_id.x);
    // Non-negative input = BeginClip (value is path index); negative = EndClip.
    let is_push = inp >= 0;
    // NOTE(review): `var bic` re-declares the variable first declared at the
    // top of this function in the same scope — confirm naga accepts this
    // shadowing; strict WGSL front ends may reject it.
    var bic = Bic(1u - u32(is_push), u32(is_push));
    sh_bic[local_id.x] = bic;
    if is_push {
        let path_bbox = path_bboxes[inp];
        bbox = vec4<f32>(f32(path_bbox.x0), f32(path_bbox.y0), f32(path_bbox.x1), f32(path_bbox.y1));
    } else {
        bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
    }
    // Build the sweep-up Bic tree that search_link consumes; interior
    // levels are packed above the WG_SIZE leaves in sh_bic.
    var inbase = 0u;
    for (var i = 0u; i < firstTrailingBit(WG_SIZE) - 1u; i += 1u) {
        let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i));
        workgroupBarrier();
        if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) {
            let in_off = inbase + local_id.x * 2u;
            sh_bic[outbase + local_id.x] = bic_combine(sh_bic[in_off], sh_bic[in_off + 1u]);
        }
        inbase = outbase;
    }
    workgroupBarrier();
    // search for predecessor node
    bic = Bic();
    var link = search_link(&bic, local_id.x);
    sh_link[local_id.x] = link;
    workgroupBarrier();
    // NOTE(review): select evaluates both operands, so sh_link[link] is read
    // even when link < 0 — relies on clamped/robust indexing; confirm.
    let grandparent = select(link - 1, sh_link[link], link >= 0);
    var parent: i32;
    if link >= 0 {
        // Parent within this workgroup.
        parent = i32(wg_id.x * WG_SIZE) + link;
    } else if link + i32(stack_size) >= 0 {
        // Parent on the cross-workgroup stack (link is negative depth).
        parent = i32(sh_stack[i32(WG_SIZE) + link]);
    } else {
        parent = -1;
    }
    // bbox scan (intersect) across parent links
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        if i != 0u {
            sh_link[local_id.x] = link;
        }
        sh_bbox[local_id.x] = bbox;
        workgroupBarrier();
        if link >= 0 {
            // Pointer-jumping: intersect with ancestor's bbox and hop the link.
            bbox = bbox_intersect(sh_bbox[link], bbox);
            link = sh_link[link];
        }
        workgroupBarrier();
    }
    if link + i32(stack_size) >= 0 {
        bbox = bbox_intersect(sh_stack_bbox[i32(WG_SIZE) + link], bbox);
    }
    // At this point, bbox is the intersection of bboxes on the path to the root
    sh_bbox[local_id.x] = bbox;
    workgroupBarrier();
    if !is_push && global_id.x < config.n_clip {
        // Fix up drawmonoid so path_ix of EndClip matches BeginClip
        let path_ix = clip_inp[parent];
        // EndClip input is the bitwise-negated draw monoid index.
        draw_monoids[~inp].path_ix = u32(path_ix);
        // For an EndClip, output the bbox of the *enclosing* clip (grandparent),
        // not the clip being closed.
        if grandparent >= 0 {
            bbox = sh_bbox[grandparent];
        } else if grandparent + i32(stack_size) >= 0 {
            bbox = sh_stack_bbox[i32(WG_SIZE) + grandparent];
        } else {
            bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
        }
    }
    clip_bboxes[global_id.x] = bbox;
}

View file

@ -0,0 +1,66 @@
#import config
#import bbox
#import clip
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<i32>;
@group(0) @binding(2)
var<storage> path_bboxes: array<PathBbox>;
@group(0) @binding(3)
var<storage, read_write> reduced: array<Bic>;
@group(0) @binding(4)
var<storage, read_write> clip_out: array<ClipEl>;
let WG_SIZE = 256u;
var<workgroup> sh_bic: array<Bic, WG_SIZE>;
var<workgroup> sh_parent: array<u32, WG_SIZE>;
var<workgroup> sh_path_ix: array<u32, WG_SIZE>;
// Clip reduce stage: suffix-scan the clip stream's bicyclic (push/pop)
// monoid per workgroup, write the workgroup aggregate to `reduced`, and
// emit the workgroup's unmatched pushes (its open-clip stack) to clip_out
// in stack order for consumption by the leaf stage.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
) {
    let inp = clip_inp[global_id.x];
    // Non-negative = BeginClip (value is a path index); negative = EndClip.
    let is_push = inp >= 0;
    var bic = Bic(1u - u32(is_push), u32(is_push));
    // reverse scan of bicyclic semigroup
    sh_bic[local_id.x] = bic;
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        workgroupBarrier();
        if local_id.x + (1u << i) < WG_SIZE {
            let other = sh_bic[local_id.x + (1u << i)];
            bic = bic_combine(bic, other);
        }
        workgroupBarrier();
        sh_bic[local_id.x] = bic;
    }
    if local_id.x == 0u {
        reduced[wg_id.x] = bic;
    }
    workgroupBarrier();
    // Number of pushes left open at the end of this workgroup.
    let size = sh_bic[0].b;
    // Load the suffix aggregate of the elements strictly *after* this one;
    // bic.a is then the pops remaining after this element and bic.b its
    // depth on the end-of-workgroup stack.
    // Fix: previously bic was left as the identity here, so `bic.a == 0u`
    // held for every thread and local_ix collapsed to size - 1u, clobbering
    // one output slot and leaving the rest of the stack uninitialized.
    bic = Bic();
    if local_id.x + 1u < WG_SIZE {
        bic = sh_bic[local_id.x + 1u];
    }
    if is_push && bic.a == 0u {
        // This push is unmatched within the workgroup; record it in stack
        // order (deepest at index 0, most recent at size - 1u).
        let local_ix = size - bic.b - 1u;
        sh_parent[local_ix] = local_id.x;
        sh_path_ix[local_ix] = u32(inp);
    }
    workgroupBarrier();
    // TODO: possibly do forward scan here if depth can exceed wg size
    if local_id.x < size {
        let path_ix = sh_path_ix[local_id.x];
        let path_bbox = path_bboxes[path_ix];
        // Parent index is global (offset by this workgroup's base).
        let parent_ix = sh_parent[local_id.x] + wg_id.x * WG_SIZE;
        let bbox = vec4<f32>(f32(path_bbox.x0), f32(path_bbox.y0), f32(path_bbox.x1), f32(path_bbox.y1));
        clip_out[global_id.x] = ClipEl(parent_ix, bbox);
    }
}

View file

@ -0,0 +1,448 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// The coarse rasterization stage.
#import config
#import bump
#import drawtag
#import ptcl
#import tile
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> draw_monoids: array<DrawMonoid>;
// TODO: dedup
// TODO: dedup
// Mirrors the BinHeader written by the binning stage: element_count draw
// objects for a (bin, partition), located in bin_data at chunk_offset.
struct BinHeader {
    element_count: u32,
    chunk_offset: u32,
}
@group(0) @binding(3)
var<storage> bin_headers: array<BinHeader>;
@group(0) @binding(4)
var<storage> bin_data: array<u32>;
@group(0) @binding(5)
var<storage> paths: array<Path>;
@group(0) @binding(6)
var<storage> tiles: array<Tile>;
@group(0) @binding(7)
var<storage> info: array<u32>;
@group(0) @binding(8)
var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(9)
var<storage, read_write> ptcl: array<u32>;
// Much of this code assumes WG_SIZE == N_TILE. If these diverge, then
// a fair amount of fixup is needed.
let WG_SIZE = 256u;
//let N_SLICE = WG_SIZE / 32u;
let N_SLICE = 8u;
var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
var<workgroup> sh_part_count: array<u32, WG_SIZE>;
var<workgroup> sh_part_offsets: array<u32, WG_SIZE>;
var<workgroup> sh_drawobj_ix: array<u32, WG_SIZE>;
var<workgroup> sh_tile_stride: array<u32, WG_SIZE>;
var<workgroup> sh_tile_width: array<u32, WG_SIZE>;
var<workgroup> sh_tile_x0: array<u32, WG_SIZE>;
var<workgroup> sh_tile_y0: array<u32, WG_SIZE>;
var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
var<workgroup> sh_tile_base: array<u32, WG_SIZE>;
// helper functions for writing ptcl
var<private> cmd_offset: u32;
var<private> cmd_limit: u32;
// Make sure there is space for a command of given size, plus a jump if needed
// Make sure there is space for a command of given size, plus a jump if needed
fn alloc_cmd(size: u32) {
    // Fast path: current chunk still has room (headroom already reserved
    // in cmd_limit for the trailing jump).
    if cmd_offset + size < cmd_limit {
        return;
    }
    // We might be able to save a little bit of computation here
    // by setting the initial value of the bump allocator.
    let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC;
    // Grab a fresh chunk from the global bump allocator and chain to it.
    let new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
    // TODO: robust memory
    ptcl[cmd_offset] = CMD_JUMP;
    ptcl[cmd_offset + 1u] = new_cmd;
    cmd_offset = new_cmd;
    cmd_limit = cmd_offset + (PTCL_INCREMENT - PTCL_HEADROOM);
}
// Emit the path-drawing command for a tile: fill (or solid) when
// linewidth < 0.0, stroke otherwise.
fn write_path(tile: Tile, linewidth: f32) {
    // TODO: take flags
    alloc_cmd(3u);
    if linewidth < 0.0 {
        if tile.segments == 0u {
            // No segments in this tile: presumably fully covered by the
            // backdrop, so a solid command suffices.
            ptcl[cmd_offset] = CMD_SOLID;
            cmd_offset += 1u;
        } else {
            let fill = CmdFill(tile.segments, tile.backdrop);
            ptcl[cmd_offset] = CMD_FILL;
            ptcl[cmd_offset + 1u] = fill.tile;
            ptcl[cmd_offset + 2u] = u32(fill.backdrop);
            cmd_offset += 3u;
        }
    } else {
        let stroke = CmdStroke(tile.segments, 0.5 * linewidth);
        ptcl[cmd_offset] = CMD_STROKE;
        ptcl[cmd_offset + 1u] = stroke.tile;
        ptcl[cmd_offset + 2u] = bitcast<u32>(stroke.half_width);
        cmd_offset += 3u;
    }
}
// Emit a solid-color command.
fn write_color(color: CmdColor) {
    alloc_cmd(2u);
    let base = cmd_offset;
    ptcl[base] = CMD_COLOR;
    ptcl[base + 1u] = color.rgba_color;
    cmd_offset = base + 2u;
}
// Discussion point: these are basically copying from info to ptcl. We
// could just write an info offset and have fine bind that buffer and read
// from it.
// Emit a linear-gradient command (index plus implicit line equation).
fn write_lin_grad(lin: CmdLinGrad) {
    alloc_cmd(5u);
    let base = cmd_offset;
    ptcl[base] = CMD_LIN_GRAD;
    ptcl[base + 1u] = lin.index;
    ptcl[base + 2u] = bitcast<u32>(lin.line_x);
    ptcl[base + 3u] = bitcast<u32>(lin.line_y);
    ptcl[base + 4u] = bitcast<u32>(lin.line_c);
    cmd_offset = base + 5u;
}
// Emit a radial-gradient command (index, 2x2 matrix, translation,
// center, radius parameters).
fn write_rad_grad(rad: CmdRadGrad) {
    alloc_cmd(12u);
    let base = cmd_offset;
    ptcl[base] = CMD_RAD_GRAD;
    ptcl[base + 1u] = rad.index;
    ptcl[base + 2u] = bitcast<u32>(rad.matrx.x);
    ptcl[base + 3u] = bitcast<u32>(rad.matrx.y);
    ptcl[base + 4u] = bitcast<u32>(rad.matrx.z);
    ptcl[base + 5u] = bitcast<u32>(rad.matrx.w);
    ptcl[base + 6u] = bitcast<u32>(rad.xlat.x);
    ptcl[base + 7u] = bitcast<u32>(rad.xlat.y);
    ptcl[base + 8u] = bitcast<u32>(rad.c1.x);
    ptcl[base + 9u] = bitcast<u32>(rad.c1.y);
    ptcl[base + 10u] = bitcast<u32>(rad.ra);
    ptcl[base + 11u] = bitcast<u32>(rad.roff);
    cmd_offset = base + 12u;
}
// Emit a begin-clip command (tag only, no payload).
fn write_begin_clip() {
    alloc_cmd(1u);
    let base = cmd_offset;
    ptcl[base] = CMD_BEGIN_CLIP;
    cmd_offset = base + 1u;
}
// Emit an end-clip command carrying the blend word.
fn write_end_clip(blend: u32) {
    alloc_cmd(2u);
    let base = cmd_offset;
    ptcl[base] = CMD_END_CLIP;
    ptcl[base + 1u] = blend;
    cmd_offset = base + 2u;
}
// Coarse rasterization: one workgroup per bin (N_TILE_X x N_TILE_Y tiles),
// one thread per tile. Streams the bin's draw objects (merged from all
// binning partitions), builds per-tile coverage bitmaps, and writes the
// per-tile command list (ptcl) for the fine stage.
@compute @workgroup_size(256)
fn main(
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
) {
    let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
    let bin_ix = width_in_bins * wg_id.y + wg_id.x;
    let n_partitions = (config.n_drawobj + N_TILE - 1u) / N_TILE;
    // Coordinates of the top left of this bin, in tiles.
    let bin_tile_x = N_TILE_X * wg_id.x;
    let bin_tile_y = N_TILE_Y * wg_id.y;
    let tile_x = local_id.x % N_TILE_X;
    let tile_y = local_id.x / N_TILE_X;
    let this_tile_ix = (bin_tile_y + tile_y) * config.width_in_tiles + bin_tile_x + tile_x;
    cmd_offset = this_tile_ix * PTCL_INITIAL_ALLOC;
    cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM);
    // clip state
    var clip_zero_depth = 0u;
    var clip_depth = 0u;
    // Stream cursors: rd_ix..wr_ix is the merged window of draw objects;
    // ready_ix counts objects whose partition headers have been scanned.
    var partition_ix = 0u;
    var rd_ix = 0u;
    var wr_ix = 0u;
    var part_start_ix = 0u;
    var ready_ix = 0u;
    // blend state
    var render_blend_depth = 0u;
    var max_blend_depth = 0u;
    while true {
        // Reset coverage bitmaps for this batch of up to N_TILE objects.
        for (var i = 0u; i < N_SLICE; i += 1u) {
            atomicStore(&sh_bitmaps[i][local_id.x], 0u);
        }
        // Inner loop: merge bin contents from partitions until a full
        // batch of N_TILE objects is staged (or input is exhausted).
        while true {
            if ready_ix == wr_ix && partition_ix < n_partitions {
                part_start_ix = ready_ix;
                var count = 0u;
                if partition_ix + local_id.x < n_partitions {
                    let in_ix = (partition_ix + local_id.x) * N_TILE + bin_ix;
                    let bin_header = bin_headers[in_ix];
                    count = bin_header.element_count;
                    sh_part_offsets[local_id.x] = bin_header.chunk_offset;
                }
                // prefix sum the element counts
                for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
                    sh_part_count[local_id.x] = count;
                    workgroupBarrier();
                    if local_id.x >= (1u << i) {
                        count += sh_part_count[local_id.x - (1u << i)];
                    }
                    workgroupBarrier();
                }
                sh_part_count[local_id.x] = part_start_ix + count;
                workgroupBarrier();
                ready_ix = sh_part_count[WG_SIZE - 1u];
                partition_ix += WG_SIZE;
            }
            // use binary search to find draw object to read
            var ix = rd_ix + local_id.x;
            if ix >= wr_ix && ix < ready_ix {
                var part_ix = 0u;
                for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
                    let probe = part_ix + ((N_TILE / 2u) >> i);
                    if ix >= sh_part_count[probe - 1u] {
                        part_ix = probe;
                    }
                }
                // Convert to an index within the found partition's chunk.
                ix -= select(part_start_ix, sh_part_count[part_ix - 1u], part_ix > 0u);
                let offset = sh_part_offsets[part_ix];
                sh_drawobj_ix[local_id.x] = bin_data[offset + ix];
            }
            wr_ix = min(rd_ix + N_TILE, ready_ix);
            if wr_ix - rd_ix >= N_TILE || (wr_ix >= ready_ix && partition_ix >= n_partitions) {
                break;
            }
        }
        // At this point, sh_drawobj_ix[0.. wr_ix - rd_ix] contains merged binning results.
        var tag = DRAWTAG_NOP;
        var drawobj_ix: u32;
        if local_id.x + rd_ix < wr_ix {
            drawobj_ix = sh_drawobj_ix[local_id.x];
            tag = scene[config.drawtag_base + drawobj_ix];
        }
        // Stage per-object tile geometry (bbox clipped to this bin).
        var tile_count = 0u;
        // I think this predicate is the same as the last, maybe they can be combined
        if tag != DRAWTAG_NOP {
            let path_ix = draw_monoids[drawobj_ix].path_ix;
            let path = paths[path_ix];
            let stride = path.bbox.z - path.bbox.x;
            sh_tile_stride[local_id.x] = stride;
            let dx = i32(path.bbox.x) - i32(bin_tile_x);
            let dy = i32(path.bbox.y) - i32(bin_tile_y);
            let x0 = clamp(dx, 0, i32(N_TILE_X));
            let y0 = clamp(dy, 0, i32(N_TILE_Y));
            let x1 = clamp(i32(path.bbox.z) - i32(bin_tile_x), 0, i32(N_TILE_X));
            let y1 = clamp(i32(path.bbox.w) - i32(bin_tile_y), 0, i32(N_TILE_Y));
            sh_tile_width[local_id.x] = u32(x1 - x0);
            sh_tile_x0[local_id.x] = u32(x0);
            sh_tile_y0[local_id.x] = u32(y0);
            tile_count = u32(x1 - x0) * u32(y1 - y0);
            // base relative to bin
            let base = path.tiles - u32(dy * i32(stride) + dx);
            sh_tile_base[local_id.x] = base;
            // TODO: there's a write_tile_alloc here in the source, not sure what it's supposed to do
        }
        // Prefix sum of tile counts
        sh_tile_count[local_id.x] = tile_count;
        for (var i = 0u; i < firstTrailingBit(N_TILE); i += 1u) {
            workgroupBarrier();
            if local_id.x >= (1u << i) {
                tile_count += sh_tile_count[local_id.x - (1u << i)];
            }
            workgroupBarrier();
            sh_tile_count[local_id.x] = tile_count;
        }
        workgroupBarrier();
        let total_tile_count = sh_tile_count[N_TILE - 1u];
        // Parallel iteration over all tiles
        for (var ix = local_id.x; ix < total_tile_count; ix += N_TILE) {
            // Binary search to find draw object which contains this tile
            var el_ix = 0u;
            for (var i = 0u; i < firstTrailingBit(N_TILE); i += 1u) {
                let probe = el_ix + ((N_TILE / 2u) >> i);
                if ix >= sh_tile_count[probe - 1u] {
                    el_ix = probe;
                }
            }
            drawobj_ix = sh_drawobj_ix[el_ix];
            tag = scene[config.drawtag_base + drawobj_ix];
            // TODO: clip logic
            let seq_ix = ix - select(0u, sh_tile_count[el_ix - 1u], el_ix > 0u);
            let width = sh_tile_width[el_ix];
            let x = sh_tile_x0[el_ix] + seq_ix % width;
            let y = sh_tile_y0[el_ix] + seq_ix / width;
            let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * y + x;
            let tile = tiles[tile_ix];
            let is_clip = (tag & 1u) != 0u;
            var is_blend = false;
            if is_clip {
                // Default (non-default blends force tile inclusion below).
                let BLEND_CLIP = (128u << 8u) | 3u;
                let scene_offset = draw_monoids[drawobj_ix].scene_offset;
                let dd = config.drawdata_base + scene_offset;
                let blend = scene[dd];
                is_blend = blend != BLEND_CLIP;
            }
            // Include tiles with segments, zero-backdrop clips / nonzero-
            // backdrop fills, or non-default blends.
            let include_tile = tile.segments != 0u || (tile.backdrop == 0) == is_clip || is_blend;
            if include_tile {
                let el_slice = el_ix / 32u;
                let el_mask = 1u << (el_ix & 31u);
                atomicOr(&sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
            }
        }
        workgroupBarrier();
        // At this point bit drawobj % 32 is set in sh_bitmaps[drawobj / 32][y * N_TILE_X + x]
        // if drawobj touches tile (x, y).
        // Write per-tile command list for this tile
        var slice_ix = 0u;
        var bitmap = atomicLoad(&sh_bitmaps[0u][local_id.x]);
        while true {
            if bitmap == 0u {
                slice_ix += 1u;
                // potential optimization: make iteration limit dynamic
                if slice_ix == N_SLICE {
                    break;
                }
                bitmap = atomicLoad(&sh_bitmaps[slice_ix][local_id.x]);
                if bitmap == 0u {
                    continue;
                }
            }
            let el_ix = slice_ix * 32u + firstTrailingBit(bitmap);
            drawobj_ix = sh_drawobj_ix[el_ix];
            // clear LSB of bitmap, using bit magic
            bitmap &= bitmap - 1u;
            let drawtag = scene[config.drawtag_base + drawobj_ix];
            let dm = draw_monoids[drawobj_ix];
            let dd = config.drawdata_base + dm.scene_offset;
            let di = dm.info_offset;
            if clip_zero_depth == 0u {
                let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
                let tile = tiles[tile_ix];
                switch drawtag {
                    // DRAWTAG_FILL_COLOR
                    case 0x44u: {
                        let linewidth = bitcast<f32>(info[di]);
                        write_path(tile, linewidth);
                        let rgba_color = scene[dd];
                        write_color(CmdColor(rgba_color));
                    }
                    // DRAWTAG_FILL_LIN_GRADIENT
                    case 0x114u: {
                        let linewidth = bitcast<f32>(info[di]);
                        write_path(tile, linewidth);
                        var lin: CmdLinGrad;
                        lin.index = scene[dd];
                        lin.line_x = bitcast<f32>(info[di + 1u]);
                        lin.line_y = bitcast<f32>(info[di + 2u]);
                        lin.line_c = bitcast<f32>(info[di + 3u]);
                        write_lin_grad(lin);
                    }
                    // DRAWTAG_FILL_RAD_GRADIENT
                    case 0x2dcu: {
                        let linewidth = bitcast<f32>(info[di]);
                        write_path(tile, linewidth);
                        var rad: CmdRadGrad;
                        rad.index = scene[dd];
                        let m0 = bitcast<f32>(info[di + 1u]);
                        let m1 = bitcast<f32>(info[di + 2u]);
                        let m2 = bitcast<f32>(info[di + 3u]);
                        let m3 = bitcast<f32>(info[di + 4u]);
                        rad.matrx = vec4<f32>(m0, m1, m2, m3);
                        rad.xlat = vec2<f32>(bitcast<f32>(info[di + 5u]), bitcast<f32>(info[di + 6u]));
                        rad.c1 = vec2<f32>(bitcast<f32>(info[di + 7u]), bitcast<f32>(info[di + 8u]));
                        rad.ra = bitcast<f32>(info[di + 9u]);
                        rad.roff = bitcast<f32>(info[di + 10u]);
                        write_rad_grad(rad);
                    }
                    // DRAWTAG_BEGIN_CLIP
                    case 0x05u: {
                        // Empty clip tile: everything until the matching
                        // EndClip is suppressed ("clip zero" state).
                        if tile.segments == 0u && tile.backdrop == 0 {
                            clip_zero_depth = clip_depth + 1u;
                        } else {
                            write_begin_clip();
                            render_blend_depth += 1u;
                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                        }
                        clip_depth += 1u;
                    }
                    // DRAWTAG_END_CLIP
                    case 0x25u: {
                        clip_depth -= 1u;
                        write_path(tile, -1.0);
                        write_end_clip(scene[dd]);
                        render_blend_depth -= 1u;
                    }
                    default: {}
                }
            } else {
                // In "clip zero" state, suppress all drawing
                switch drawtag {
                    // DRAWTAG_BEGIN_CLIP
                    case 0x05u: {
                        clip_depth += 1u;
                    }
                    // DRAWTAG_END_CLIP
                    case 0x25u: {
                        if clip_depth == clip_zero_depth {
                            clip_zero_depth = 0u;
                        }
                        clip_depth -= 1u;
                    }
                    default: {}
                }
            }
        }
        rd_ix += N_TILE;
        if rd_ix >= ready_ix && partition_ix >= n_partitions {
            break;
        }
        workgroupBarrier();
    }
    if bin_tile_x < config.width_in_tiles && bin_tile_y < config.height_in_tiles {
        //ptcl[cmd_offset] = CMD_END;
        // TODO: blend stack allocation
    }
}

View file

@ -0,0 +1,197 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Finish prefix sum of drawtags, decode draw objects.
#import config
#import clip
#import drawtag
#import bbox
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> reduced: array<DrawMonoid>;
@group(0) @binding(3)
var<storage> path_bbox: array<PathBbox>;
@group(0) @binding(4)
var<storage, read_write> draw_monoid: array<DrawMonoid>;
@group(0) @binding(5)
var<storage, read_write> info: array<u32>;
@group(0) @binding(6)
var<storage, read_write> clip_inp: array<i32>;
let WG_SIZE = 256u;
// Possibly dedup?
// Possibly dedup?
// Affine transform: 2x2 linear part packed in matrx plus a translation.
// Component-to-entry mapping follows the scene encoding read in
// read_transform (6 consecutive f32 words); the determinant is computed
// elsewhere as matrx.x * matrx.w - matrx.y * matrx.z.
struct Transform {
    matrx: vec4<f32>,
    translate: vec2<f32>,
}
// Decode the ix-th transform from the scene buffer: six consecutive
// f32 words starting at transform_base (four matrix coefficients, then
// the translation).
fn read_transform(transform_base: u32, ix: u32) -> Transform {
    let base = transform_base + ix * 6u;
    let matrx = vec4<f32>(
        bitcast<f32>(scene[base]),
        bitcast<f32>(scene[base + 1u]),
        bitcast<f32>(scene[base + 2u]),
        bitcast<f32>(scene[base + 3u])
    );
    let translate = vec2<f32>(
        bitcast<f32>(scene[base + 4u]),
        bitcast<f32>(scene[base + 5u])
    );
    return Transform(matrx, translate);
}
// Workgroup memory shared by the two scan passes below (scan of preceding
// workgroups' reductions, then intra-workgroup scan of this block's tags).
var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;

// Completes the prefix sum of draw monoids, then decodes each draw object:
// writes the exclusive monoid, per-object info (linewidth / gradient
// parameters) and the clip stream input.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
) {
    let ix = global_id.x;
    // Reduce prefix of workgroups up to this one.
    // NOTE(review): only the first WG_SIZE partial reductions are loaded,
    // so this assumes a dispatch of at most 256 workgroups — confirm.
    var agg = draw_monoid_identity();
    if local_id.x < wg_id.x {
        agg = reduced[local_id.x];
    }
    sh_scratch[local_id.x] = agg;
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        workgroupBarrier();
        if local_id.x + (1u << i) < WG_SIZE {
            let other = sh_scratch[local_id.x + (1u << i)];
            agg = combine_draw_monoid(agg, other);
        }
        workgroupBarrier();
        sh_scratch[local_id.x] = agg;
    }
    // Two barriers can be eliminated if we use separate shared arrays
    // for prefix and intra-workgroup prefix sum.
    workgroupBarrier();
    // Lane 0 now holds the combined monoid of all preceding workgroups.
    var m = sh_scratch[0];
    workgroupBarrier();
    let tag_word = scene[config.drawtag_base + ix];
    agg = map_draw_tag(tag_word);
    sh_scratch[local_id.x] = agg;
    // Intra-workgroup inclusive prefix scan (Hillis-Steele).
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        workgroupBarrier();
        if local_id.x >= 1u << i {
            let other = sh_scratch[local_id.x - (1u << i)];
            agg = combine_draw_monoid(agg, other);
        }
        workgroupBarrier();
        sh_scratch[local_id.x] = agg;
    }
    workgroupBarrier();
    if local_id.x > 0u {
        m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
    }
    // m now contains exclusive prefix sum of draw monoid
    draw_monoid[ix] = m;
    // Offsets into the scene draw data stream and the output info buffer.
    let dd = config.drawdata_base + m.scene_offset;
    let di = m.info_offset;
    if tag_word == DRAWTAG_FILL_COLOR || tag_word == DRAWTAG_FILL_LIN_GRADIENT ||
        tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE ||
        tag_word == DRAWTAG_BEGIN_CLIP
    {
        let bbox = path_bbox[m.path_ix];
        // TODO: bbox is mostly yagni here, sort that out. Maybe clips?
        // let x0 = f32(bbox.x0);
        // let y0 = f32(bbox.y0);
        // let x1 = f32(bbox.x1);
        // let y1 = f32(bbox.y1);
        // let bbox_f = vec4(x0, y0, x1, y1);
        var matrx: vec4<f32>;
        var translate: vec2<f32>;
        // Non-negative linewidth marks a stroke; fills carry a negative value.
        var linewidth = bbox.linewidth;
        if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT {
            let transform = read_transform(config.transform_base, bbox.trans_ix);
            matrx = transform.matrx;
            translate = transform.translate;
        }
        if linewidth >= 0.0 {
            // Scale stroke width by the mean scale factor (sqrt |det|).
            // Note: doesn't deal with anisotropic case
            linewidth *= sqrt(abs(matrx.x * matrx.w - matrx.y * matrx.z));
        }
        switch tag_word {
            // DRAWTAG_FILL_COLOR, DRAWTAG_FILL_IMAGE
            case 0x44u, 0x48u: {
                info[di] = bitcast<u32>(linewidth);
            }
            // DRAWTAG_FILL_LIN_GRADIENT
            case 0x114u: {
                info[di] = bitcast<u32>(linewidth);
                // Transform the two gradient endpoints to device space, then
                // store the line equation t = dot(p, line_xy) + line_c used
                // by fine rasterization to index the ramp.
                var p0 = bitcast<vec2<f32>>(vec2(scene[dd + 1u], scene[dd + 2u]));
                var p1 = bitcast<vec2<f32>>(vec2(scene[dd + 3u], scene[dd + 4u]));
                p0 = matrx.xy * p0.x + matrx.zw * p0.y + translate;
                p1 = matrx.xy * p1.x + matrx.zw * p1.y + translate;
                let dxy = p1 - p0;
                let scale = 1.0 / dot(dxy, dxy);
                let line_xy = dxy * scale;
                let line_c = -dot(p0, line_xy);
                info[di + 1u] = bitcast<u32>(line_xy.x);
                info[di + 2u] = bitcast<u32>(line_xy.y);
                info[di + 3u] = bitcast<u32>(line_c);
            }
            // DRAWTAG_FILL_RAD_GRADIENT
            case 0x2dcu: {
                info[di] = bitcast<u32>(linewidth);
                // Two-circle radial gradient: precompute the inverse
                // transform and focal parameters consumed by read_rad_grad
                // in fine rasterization.
                var p0 = bitcast<vec2<f32>>(vec2(scene[dd + 1u], scene[dd + 2u]));
                var p1 = bitcast<vec2<f32>>(vec2(scene[dd + 3u], scene[dd + 4u]));
                let r0 = bitcast<f32>(scene[dd + 5u]);
                let r1 = bitcast<f32>(scene[dd + 6u]);
                let inv_det = 1.0 / (matrx.x * matrx.w - matrx.y * matrx.z);
                let inv_mat = inv_det * vec4<f32>(matrx.w, -matrx.y, -matrx.z, matrx.x);
                var inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
                inv_tr += p0;
                let center1 = p1 - p0;
                let rr = r1 / (r1 - r0);
                let ra_inv = rr / (r1 * r1 - dot(center1, center1));
                let c1 = center1 * ra_inv;
                let ra = rr * ra_inv;
                let roff = rr - 1.0;
                info[di + 1u] = bitcast<u32>(inv_mat.x);
                info[di + 2u] = bitcast<u32>(inv_mat.y);
                info[di + 3u] = bitcast<u32>(inv_mat.z);
                info[di + 4u] = bitcast<u32>(inv_mat.w);
                info[di + 5u] = bitcast<u32>(inv_tr.x);
                info[di + 6u] = bitcast<u32>(inv_tr.y);
                info[di + 7u] = bitcast<u32>(c1.x);
                info[di + 8u] = bitcast<u32>(c1.y);
                info[di + 9u] = bitcast<u32>(ra);
                info[di + 10u] = bitcast<u32>(roff);
            }
            default: {}
        }
    }
    if tag_word == DRAWTAG_BEGIN_CLIP || tag_word == DRAWTAG_END_CLIP {
        // End clips have no path of their own: the bitwise complement of the
        // draw index (negative when reinterpreted as i32) marks them for the
        // downstream clip stages.
        var path_ix = ~ix;
        if tag_word == DRAWTAG_BEGIN_CLIP {
            path_ix = m.path_ix;
        }
        clip_inp[m.clip_ix] = i32(path_ix);
    }
}

View file

@ -0,0 +1,54 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
#import config
#import drawtag
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage, read_write> reduced: array<DrawMonoid>;
let WG_SIZE = 256u;
var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;
// Workgroup-level reduction of draw tag monoids: each thread maps one tag
// word to its monoid contribution, the workgroup combines them, and lane 0
// writes the workgroup total to `reduced` for the later scan stage.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
) {
    let ix = global_id.x;
    let tag_word = scene[config.drawtag_base + ix];
    var agg = map_draw_tag(tag_word);
    sh_scratch[local_id.x] = agg;
    // Tree-style reduction gathering from higher lanes; after
    // firstTrailingBit(WG_SIZE) = 8 rounds, lane 0 holds the full sum.
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        workgroupBarrier();
        if local_id.x + (1u << i) < WG_SIZE {
            let other = sh_scratch[local_id.x + (1u << i)];
            agg = combine_draw_monoid(agg, other);
        }
        workgroupBarrier();
        sh_scratch[local_id.x] = agg;
    }
    if local_id.x == 0u {
        // One output element per workgroup.
        reduced[ix >> firstTrailingBit(WG_SIZE)] = agg;
    }
}

View file

@ -14,6 +14,9 @@
//
// Also licensed under MIT license, at your choice.
// Fine rasterizer. This can run in simple (just path rendering) and full
// modes, controllable by #define.
// This is a cut'n'paste w/ backdrop.
struct Tile {
backdrop: i32,
@ -36,17 +39,64 @@ var<storage> segments: array<Segment>;
@group(0) @binding(3)
var<storage, read_write> output: array<u32>;
#ifdef full
#import blend
#import ptcl
let GRADIENT_WIDTH = 512;
let BLEND_STACK_SPLIT = 4u;
@group(0) @binding(4)
var<storage> ptcl: array<u32>;
@group(0) @binding(5)
var gradients: texture_2d<f32>;
// Decodes a CMD_FILL payload (tile segment index + backdrop) from the ptcl
// stream; the command occupies 3 words including the tag.
fn read_fill(cmd_ix: u32) -> CmdFill {
    let tile_word = ptcl[cmd_ix + 1u];
    let backdrop_word = i32(ptcl[cmd_ix + 2u]);
    return CmdFill(tile_word, backdrop_word);
}
// Decodes a CMD_STROKE payload (tile segment index + half line width) from
// the ptcl stream; the command occupies 3 words including the tag.
fn read_stroke(cmd_ix: u32) -> CmdStroke {
    let tile_word = ptcl[cmd_ix + 1u];
    let width_word = bitcast<f32>(ptcl[cmd_ix + 2u]);
    return CmdStroke(tile_word, width_word);
}
// Decodes a CMD_COLOR payload: a single packed RGBA word follows the tag.
fn read_color(cmd_ix: u32) -> CmdColor {
    let packed_color = ptcl[cmd_ix + 1u];
    return CmdColor(packed_color);
}
// Decodes a CMD_LIN_GRAD payload: gradient ramp index followed by the three
// line-equation coefficients (t = x * line_x + y * line_y + line_c).
fn read_lin_grad(cmd_ix: u32) -> CmdLinGrad {
    let index = ptcl[cmd_ix + 1u];
    let coef_x = bitcast<f32>(ptcl[cmd_ix + 2u]);
    let coef_y = bitcast<f32>(ptcl[cmd_ix + 3u]);
    let coef_c = bitcast<f32>(ptcl[cmd_ix + 4u]);
    return CmdLinGrad(index, coef_x, coef_y, coef_c);
}
// Decodes a CMD_RAD_GRAD payload: ramp index, inverse 2x2 matrix, inverse
// translation, focal center, and the ra/roff scalars (11 payload words).
fn read_rad_grad(cmd_ix: u32) -> CmdRadGrad {
    let index = ptcl[cmd_ix + 1u];
    let matrx = vec4<f32>(
        bitcast<f32>(ptcl[cmd_ix + 2u]),
        bitcast<f32>(ptcl[cmd_ix + 3u]),
        bitcast<f32>(ptcl[cmd_ix + 4u]),
        bitcast<f32>(ptcl[cmd_ix + 5u])
    );
    let xlat = vec2<f32>(
        bitcast<f32>(ptcl[cmd_ix + 6u]),
        bitcast<f32>(ptcl[cmd_ix + 7u])
    );
    let c1 = vec2<f32>(
        bitcast<f32>(ptcl[cmd_ix + 8u]),
        bitcast<f32>(ptcl[cmd_ix + 9u])
    );
    let ra = bitcast<f32>(ptcl[cmd_ix + 10u]);
    let roff = bitcast<f32>(ptcl[cmd_ix + 11u]);
    return CmdRadGrad(index, matrx, xlat, c1, ra, roff);
}
#endif
let PIXELS_PER_THREAD = 4u;
@compute @workgroup_size(4, 16)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
let tile_ix = wg_id.y * config.width_in_tiles + wg_id.x;
let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
let tile = tiles[tile_ix];
fn fill_path(tile: Tile, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
var area: array<f32, PIXELS_PER_THREAD>;
let backdrop_f = f32(tile.backdrop);
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
@ -89,8 +139,168 @@ fn main(
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
area[i] = abs(area[i]);
}
return area;
}
// Computes per-pixel stroke coverage for this thread's run of pixels by
// walking the tile's linked list of line segments (starting at `seg`,
// 0 terminates) and taking the minimum distance to any segment; alpha is
// then half_width + 0.5 - distance, clamped to [0, 1].
fn stroke_path(seg: u32, half_width: f32, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
    var df: array<f32, PIXELS_PER_THREAD>;
    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
        df[i] = 1e9;
    }
    var segment_ix = seg;
    while segment_ix != 0u {
        let segment = segments[segment_ix];
        let delta = segment.delta;
        // Vector from segment origin to the first pixel's center (+0.5).
        let dpos0 = xy + vec2<f32>(0.5, 0.5) - segment.origin;
        let scale = 1.0 / dot(delta, delta);
        for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
            let dpos = vec2<f32>(dpos0.x + f32(i), dpos0.y);
            // Project the pixel onto the segment, clamped to its endpoints.
            let t = clamp(dot(dpos, delta) * scale, 0.0, 1.0);
            // performance idea: hoist sqrt out of loop
            df[i] = min(df[i], length(delta * t - dpos));
        }
        segment_ix = segment.next;
    }
    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
        // reuse array; return alpha rather than distance
        df[i] = clamp(half_width + 0.5 - df[i], 0.0, 1.0);
    }
    return df;
}
// Fine rasterization entry point. Each workgroup renders one tile; each
// thread covers PIXELS_PER_THREAD horizontally adjacent pixels. In "full"
// mode it interprets the per-tile command list (ptcl); otherwise it renders
// plain path coverage.
@compute @workgroup_size(4, 16)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
) {
    let tile_ix = wg_id.y * config.width_in_tiles + wg_id.x;
    let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
#ifdef full
    // Premultiplied RGBA accumulator, one entry per covered pixel.
    var rgba: array<vec4<f32>, PIXELS_PER_THREAD>;
    // In-register clip/blend stack; nesting beyond BLEND_STACK_SPLIT needs
    // the (not yet implemented) memory spill — see TODOs below.
    var blend_stack: array<array<u32, BLEND_STACK_SPLIT>, PIXELS_PER_THREAD>;
    var clip_depth = 0u;
    // Coverage set by fill/stroke/solid commands, consumed by paint commands.
    var area: array<f32, PIXELS_PER_THREAD>;
    var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
    // main interpretation loop
    while true {
        let tag = ptcl[cmd_ix];
        if tag == CMD_END {
            break;
        }
        switch tag {
            // CMD_FILL: compute coverage from the tile's fill segments.
            case 1u: {
                let fill = read_fill(cmd_ix);
                let tile = Tile(fill.backdrop, fill.tile);
                area = fill_path(tile, xy);
                cmd_ix += 3u;
            }
            // CMD_STROKE: compute coverage from stroke distance.
            case 2u: {
                let stroke = read_stroke(cmd_ix);
                area = stroke_path(stroke.tile, stroke.half_width, xy);
                cmd_ix += 3u;
            }
            // CMD_SOLID: tile is fully covered.
            case 3u: {
                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
                    area[i] = 1.0;
                }
                cmd_ix += 1u;
            }
            // CMD_COLOR: composite a solid color with current coverage.
            case 5u: {
                let color = read_color(cmd_ix);
                // Unpack the color and reverse byte order so red lands in x.
                let fg = unpack4x8unorm(color.rgba_color).wzyx;
                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
                    let fg_i = fg * area[i];
                    // Premultiplied source-over.
                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
                }
                cmd_ix += 2u;
            }
            // CMD_LIN_GRAD: sample the gradient ramp texture by the signed
            // line-equation parameter.
            case 6u: {
                let lin = read_lin_grad(cmd_ix);
                let d = lin.line_x * xy.x + lin.line_y * xy.y + lin.line_c;
                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
                    let my_d = d + lin.line_x * f32(i);
                    let x = i32(round(clamp(my_d, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
                    let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(lin.index)), 0);
                    let fg_i = fg_rgba * area[i];
                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
                }
                cmd_ix += 5u;
            }
            // CMD_RAD_GRAD: two-point radial gradient.
            case 7u: {
                let rad = read_rad_grad(cmd_ix);
                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
                    let my_xy = vec2<f32>(xy.x + f32(i), xy.y);
                    // TODO: can hoist y, but for now stick to piet-gpu
                    let xy_xformed = rad.matrx.xz * my_xy.x + rad.matrx.yw * my_xy.y - rad.xlat;
                    let ba = dot(xy_xformed, rad.c1);
                    let ca = rad.ra * dot(xy_xformed, xy_xformed);
                    let t = sqrt(ba * ba + ca) - ba - rad.roff;
                    let x = i32(round(clamp(t, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
                    let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(rad.index)), 0);
                    let fg_i = fg_rgba * area[i];
                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
                }
                cmd_ix += 12u;
            }
            // CMD_BEGIN_CLIP: push current color, restart compositing into a
            // fresh transparent layer.
            case 9u: {
                if clip_depth < BLEND_STACK_SPLIT {
                    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
                        blend_stack[clip_depth][i] = pack4x8unorm(rgba[i]);
                        rgba[i] = vec4<f32>(0.0);
                    }
                } else {
                    // TODO: spill to memory
                }
                clip_depth += 1u;
                cmd_ix += 1u;
            }
            // CMD_END_CLIP: pop the stack and blend the clipped layer,
            // masked by the clip path's coverage, over the background.
            case 10u: {
                let blend = ptcl[cmd_ix + 1u];
                clip_depth -= 1u;
                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
                    var bg_rgba: u32;
                    if clip_depth < BLEND_STACK_SPLIT {
                        bg_rgba = blend_stack[clip_depth][i];
                    } else {
                        // load from memory
                        // NOTE(review): until the spill path exists, bg_rgba
                        // keeps its zero initial value on this branch.
                    }
                    let bg = unpack4x8unorm(bg_rgba);
                    let fg = rgba[i] * area[i];
                    rgba[i] = blend_mix_compose(bg, fg, blend);
                }
                cmd_ix += 2u;
            }
            // CMD_JUMP: follow the link to the next ptcl chunk.
            case 11u: {
                cmd_ix = ptcl[cmd_ix + 1u];
            }
            default: {}
        }
    }
    // Write out un-premultiplied RGBA8; the epsilon guards divide-by-zero
    // for fully transparent pixels.
    let out_ix = global_id.y * (config.width_in_tiles * TILE_WIDTH) + global_id.x * PIXELS_PER_THREAD;
    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
        let fg = rgba[i];
        let a_inv = 1.0 / (fg.a + 1e-6);
        let rgba_sep = vec4<f32>(fg.r * a_inv, fg.g * a_inv, fg.b * a_inv, fg.a);
        let bytes = pack4x8unorm(rgba_sep);
        output[out_ix + i] = bytes;
    }
#else
    // Simple mode: fill coverage only, four alpha bytes packed per word.
    let tile = tiles[tile_ix];
    let area = fill_path(tile, xy);
    let bytes = pack4x8unorm(vec4<f32>(area[0], area[1], area[2], area[3]));
    let out_ix = global_id.y * (config.width_in_tiles * 4u) + global_id.x;
    output[out_ix] = bytes;
#endif
}

View file

@ -14,59 +14,53 @@
//
// Also licensed under MIT license, at your choice.
#import config
#import pathtag
@group(0) @binding(0)
var<storage> path_tags: array<u32>;
var<storage> config: Config;
@group(0) @binding(1)
var<storage> tag_monoids: array<TagMonoid>;
var<storage> scene: array<u32>;
// TODO: should probably have single "scene" binding.
@group(0) @binding(2)
var<storage> path_data: array<u32>;
var<storage> tag_monoids: array<TagMonoid>;
#ifdef cubics_out
@group(0) @binding(3)
var<storage, read_write> output: array<vec2<f32>>;
#else
#import config
struct Tile {
// We don't get this from import as it's the atomic version
struct AtomicTile {
backdrop: atomic<i32>,
segments: atomic<u32>,
}
#import segment
// Should probably be uniform binding
@group(0) @binding(3)
var<storage> config: Config;
var<storage, read_write> tiles: array<AtomicTile>;
@group(0) @binding(4)
var<storage, read_write> tiles: array<Tile>;
@group(0) @binding(5)
var<storage, read_write> segments: array<Segment>;
#endif
var<private> pathdata_base: u32;
fn read_f32_point(ix: u32) -> vec2<f32> {
let x = bitcast<f32>(path_data[ix]);
let y = bitcast<f32>(path_data[ix + 1u]);
let x = bitcast<f32>(scene[pathdata_base + ix]);
let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
return vec2<f32>(x, y);
}
fn read_i16_point(ix: u32) -> vec2<f32> {
let raw = path_data[ix];
let raw = scene[pathdata_base + ix];
let x = f32(i32(raw << 16u) >> 16u);
let y = f32(i32(raw) >> 16u);
return vec2<f32>(x, y);
}
#ifndef cubics_out
let TILE_WIDTH = 16u;
let TILE_HEIGHT = 16u;
struct SubdivResult {
val: f32,
a0: f32,
@ -136,7 +130,8 @@ fn main(
) {
// Obtain exclusive prefix sum of tag monoid
let ix = global_id.x;
let tag_word = path_tags[ix >> 2u];
let tag_word = scene[config.pathtag_base + (ix >> 2u)];
pathdata_base = config.pathdata_base;
let shift = (ix & 3u) * 8u;
var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);

View file

@ -0,0 +1,275 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Path coarse rasterization for the full implementation.
#import config
#import pathtag
#import tile
#import segment
#import cubic
#import bump
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> tag_monoids: array<TagMonoid>;
@group(0) @binding(3)
var<storage> cubics: array<Cubic>;
@group(0) @binding(4)
var<storage> paths: array<Path>;
// We don't get this from import as it's the atomic version
struct AtomicTile {
backdrop: atomic<i32>,
segments: atomic<u32>,
}
@group(0) @binding(5)
var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(6)
var<storage, read_write> tiles: array<AtomicTile>;
@group(0) @binding(7)
var<storage, read_write> segments: array<Segment>;
// Result of analyzing one quadratic for flattening: `val` is the
// (unnormalized) subdivision-count measure, `a0`/`a2` the approximate
// parabola integral values at the quadratic's endpoints, reused when
// emitting subdivision points.
struct SubdivResult {
    val: f32,
    a0: f32,
    a2: f32,
}
// Tuning constant for the approximate parabola arc-length integral.
let D = 0.67;

// Approximation of the integral used to parametrize a parabola by arc
// length (Levien's flattening method).
fn approx_parabola_integral(x: f32) -> f32 {
    let inner = D * D * D * D + 0.25 * x * x;
    let denom_sq = sqrt(1.0 - D + inner);
    return x * inverseSqrt(denom_sq);
}
// Tuning constant for the approximate inverse parabola integral.
let B = 0.39;

// Approximate inverse of approx_parabola_integral; maps arc-length
// parameter back to the parabola parameter.
fn approx_parabola_inv_integral(x: f32) -> f32 {
    let inner = B * B + 0.5 * x * x;
    return x * sqrt(1.0 - B + inner);
}
// Estimates how finely one quadratic Bezier must be subdivided to flatten
// it to tolerance sqrt_tol^2, using the approximate parabola arc-length
// parametrization. Returns the subdivision measure plus the integral values
// at the endpoints for reuse in the emission pass.
fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
    let d01 = p1 - p0;
    let d12 = p2 - p1;
    let dd = d01 - d12;
    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
    let cross_inv = 1.0 / cross;
    // Parabola parameter values at the two endpoints.
    let x0 = dot(d01, dd) * cross_inv;
    let x2 = dot(d12, dd) * cross_inv;
    let scale = abs(cross / (length(dd) * (x2 - x0)));
    let a0 = approx_parabola_integral(x0);
    let a2 = approx_parabola_integral(x2);
    var val = 0.0;
    // Skip numerically degenerate (near-linear) quadratics.
    if scale < 1e9 {
        let da = abs(a2 - a0);
        let sqrt_scale = sqrt(scale);
        if sign(x0) == sign(x2) {
            val = sqrt_scale;
        } else {
            // Endpoints straddle the parabola vertex (cusp-like case).
            let xmin = sqrt_tol / sqrt_scale;
            val = sqrt_tol / approx_parabola_integral(xmin);
        }
        val *= da;
    }
    return SubdivResult(val, a0, a2);
}
// Evaluates a quadratic Bezier at parameter t.
fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
    let omt = 1.0 - t;
    let tail = p1 * (omt * 2.0) + p2 * t;
    return p0 * (omt * omt) + tail * t;
}
// Evaluates a cubic Bezier at parameter t.
fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
    let omt = 1.0 - t;
    let inner = p2 * (omt * 3.0) + p3 * t;
    let mid = p1 * (omt * omt * 3.0) + inner * t;
    return p0 * (omt * omt * omt) + mid * t;
}
// Bump-allocates one segment slot. Index 0 is reserved as the linked-list
// terminator, so returned indices start at 1.
fn alloc_segment() -> u32 {
    let offset = atomicAdd(&bump.segments, 1u);
    return offset + 1u;
}
// Upper bound on quadratics used to approximate one cubic.
let MAX_QUADS = 16u;

// Coarse path rasterization (full pipeline): flattens each cubic into line
// segments and bins them into tiles, building per-tile linked lists of
// segments and accumulating winding-number backdrops via atomics.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
) {
    let ix = global_id.x;
    // Four tag bytes are packed per word; extract this thread's byte.
    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
    let shift = (ix & 3u) * 8u;
    var tag_byte = (tag_word >> shift) & 0xffu;

    if (tag_byte & PATH_TAG_SEG_TYPE) != 0u {
        // Discussion question: it might actually be cheaper to do the path segment
        // decoding & transform again rather than store the result in a buffer;
        // classic memory vs ALU tradeoff.
        let cubic = cubics[global_id.x];
        let path = paths[cubic.path_ix];
        let bbox = vec4<i32>(path.bbox);
        let p0 = cubic.p0;
        let p1 = cubic.p1;
        let p2 = cubic.p2;
        let p3 = cubic.p3;
        // Estimate how many quadratics are needed to approximate the cubic
        // within the accuracy budget.
        let err_v = 3.0 * (p2 - p1) + p0 - p3;
        let err = dot(err_v, err_v);
        let ACCURACY = 0.25;
        let Q_ACCURACY = ACCURACY * 0.1;
        let REM_ACCURACY = (ACCURACY - Q_ACCURACY);
        let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
        var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
        n_quads = min(n_quads, MAX_QUADS);
        // Pass 1: per-quadratic subdivision estimates and their total.
        var keep_params: array<SubdivResult, MAX_QUADS>;
        var val = 0.0;
        var qp0 = p0;
        let step = 1.0 / f32(n_quads);
        for (var i = 0u; i < n_quads; i += 1u) {
            let t = f32(i + 1u) * step;
            let qp2 = eval_cubic(p0, p1, p2, p3, t);
            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
            // Adjust the midpoint so the quadratic interpolates the cubic.
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
            keep_params[i] = params;
            val += params.val;
            qp0 = qp2;
        }
        // Total number of line segments to emit for this cubic.
        let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
        // Pass 2: emit n lines at (approximately) equal arc-length spacing,
        // binning each into the tiles it touches.
        var lp0 = p0;
        qp0 = p0;
        let v_step = val / f32(n);
        var n_out = 1u;
        var val_sum = 0.0;
        for (var i = 0u; i < n_quads; i += 1u) {
            let t = f32(i + 1u) * step;
            let qp2 = eval_cubic(p0, p1, p2, p3, t);
            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            let params = keep_params[i];
            let u0 = approx_parabola_inv_integral(params.a0);
            let u2 = approx_parabola_inv_integral(params.a2);
            let uscale = 1.0 / (u2 - u0);
            var val_target = f32(n_out) * v_step;
            while n_out == n || val_target < val_sum + params.val {
                var lp1: vec2<f32>;
                if n_out == n {
                    // Last segment snaps exactly to the cubic's endpoint.
                    lp1 = p3;
                } else {
                    // Invert the arc-length parametrization to find the point.
                    let u = (val_target - val_sum) / params.val;
                    let a = mix(params.a0, params.a2, u);
                    let au = approx_parabola_inv_integral(a);
                    let t = (au - u0) * uscale;
                    lp1 = eval_quad(qp0, qp1, qp2, t);
                }
                // Output line segment lp0..lp1
                let xymin = min(lp0, lp1);
                let xymax = max(lp0, lp1);
                let dp = lp1 - lp0;
                let recip_dx = 1.0 / dp.x;
                // Guard near-horizontal lines against divide-by-zero.
                let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9);
                let c = 0.5 * abs(invslope);
                let b = invslope;
                let SX = 1.0 / f32(TILE_WIDTH);
                let SY = 1.0 / f32(TILE_HEIGHT);
                // x-intercept (in tile units) of the line at a tile row's center.
                let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX;
                var x0 = i32(floor(xymin.x * SX));
                var x1 = i32(floor(xymax.x * SX) + 1.0);
                var y0 = i32(floor(xymin.y * SY));
                var y1 = i32(floor(xymax.y * SY) + 1.0);
                x0 = clamp(x0, bbox.x, bbox.z);
                x1 = clamp(x1, bbox.x, bbox.z);
                y0 = clamp(y0, bbox.y, bbox.w);
                y1 = clamp(y1, bbox.y, bbox.w);
                var xc = a + b * f32(y0);
                let stride = bbox.z - bbox.x;
                var base = i32(path.tiles) + (y0 - bbox.y) * stride - bbox.x;
                // "xray": tile column where the segment crosses each row's
                // top edge; used for backdrop (winding) bookkeeping.
                var xray = i32(floor(lp0.x * SX));
                var last_xray = i32(floor(lp1.x * SX));
                if dp.y < 0.0 {
                    let tmp = xray;
                    xray = last_xray;
                    last_xray = tmp;
                }
                for (var y = y0; y < y1; y += 1) {
                    let tile_y0 = f32(y) * f32(TILE_HEIGHT);
                    let xbackdrop = max(xray + 1, bbox.x);
                    // Segment crosses this row's top edge: bump the backdrop
                    // of the tile to its right, signed by direction.
                    if xymin.y < tile_y0 && xbackdrop < bbox.z {
                        let backdrop = select(-1, 1, dp.y < 0.0);
                        let tile_ix = base + xbackdrop;
                        atomicAdd(&tiles[tile_ix].backdrop, backdrop);
                    }
                    var next_xray = last_xray;
                    if y + 1 < y1 {
                        let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT);
                        let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope;
                        next_xray = i32(floor(x_edge * SX));
                    }
                    let min_xray = min(xray, next_xray);
                    let max_xray = max(xray, next_xray);
                    var xx0 = min(i32(floor(xc - c)), min_xray);
                    var xx1 = max(i32(ceil(xc + c)), max_xray + 1);
                    xx0 = clamp(xx0, x0, x1);
                    xx1 = clamp(xx1, x0, x1);
                    var tile_seg: Segment;
                    for (var x = xx0; x < xx1; x += 1) {
                        let tile_x0 = f32(x) * f32(TILE_WIDTH);
                        let tile_ix = base + x;
                        // allocate segment, insert linked list
                        let seg_ix = alloc_segment();
                        let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
                        tile_seg.origin = lp0;
                        tile_seg.delta = dp;
                        // Clip the segment to the tile's left edge; y_edge
                        // records where it enters for the fill rule.
                        var y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx);
                        if xymin.x < tile_x0 {
                            let p = vec2<f32>(tile_x0, y_edge);
                            if dp.x < 0.0 {
                                tile_seg.delta = p - lp0;
                            } else {
                                tile_seg.origin = p;
                                tile_seg.delta = lp1 - p;
                            }
                            // Avoid exactly-vertical clipped deltas.
                            if tile_seg.delta.x == 0.0 {
                                tile_seg.delta.x = sign(dp.x) * 1e-9;
                            }
                        }
                        if x <= min_xray || max_xray < x {
                            // Sentinel: no top-edge crossing in this tile.
                            y_edge = 1e9;
                        }
                        tile_seg.y_edge = y_edge;
                        tile_seg.next = old;
                        segments[seg_ix] = tile_seg;
                    }
                    xc += b;
                    base += stride;
                    xray = next_xray;
                }
                n_out += 1u;
                val_target += v_step;
                lp0 = lp1;
            }
            val_sum += params.val;
            qp0 = qp2;
        }
    }
}

View file

@ -0,0 +1,217 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Path segment decoding for the full case.
// In the simple case, path segments are decoded as part of the coarse
// path rendering stage. In the full case, they are separated, as the
// decoding process also generates bounding boxes, and those in turn are
// used for tile allocation and clipping; actual coarse path rasterization
// can't proceed until those are complete.
// There's some duplication of the decoding code but we won't worry about
// that just now. Perhaps it could be factored more nicely later.
#import config
#import pathtag
#import cubic
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> tag_monoids: array<TagMonoid>;
// Atomic counterpart of the shared PathBbox: the integer bounds are grown
// with atomicMin/atomicMax by the many segment threads of one path, while
// linewidth/trans_ix are plain fields written in the PATH_TAG_PATH branch.
struct AtomicPathBbox {
    x0: atomic<i32>,
    y0: atomic<i32>,
    x1: atomic<i32>,
    y1: atomic<i32>,
    linewidth: f32,
    trans_ix: u32,
}
@group(0) @binding(3)
var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
@group(0) @binding(4)
var<storage, read_write> cubics: array<Cubic>;
// Monoid is yagni, for future optimization
// struct BboxMonoid {
// bbox: vec4<f32>,
// flags: u32,
// }
// let FLAG_RESET_BBOX = 1u;
// let FLAG_SET_BBOX = 2u;
// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
// var c: BboxMonoid;
// c.bbox = b.bbox;
// // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
// if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
// c.bbox = a.bbox;
// } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
// (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
// {
// c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
// }
// c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
// c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
// return c;
// }
// fn bbox_monoid_identity() -> BboxMonoid {
// return BboxMonoid();
// }
var<private> pathdata_base: u32;
// Reads a point stored as two consecutive f32 words in the scene's path
// data stream, at word offset ix from pathdata_base.
fn read_f32_point(ix: u32) -> vec2<f32> {
    let word_ix = pathdata_base + ix;
    let px = bitcast<f32>(scene[word_ix]);
    let py = bitcast<f32>(scene[word_ix + 1u]);
    return vec2<f32>(px, py);
}
// Reads a point stored as two signed 16-bit coordinates packed into one
// scene word (x in the low half, y in the high half).
fn read_i16_point(ix: u32) -> vec2<f32> {
    let packed = scene[pathdata_base + ix];
    // Shift-pair sign-extends each 16-bit half.
    let px = i32(packed << 16u) >> 16u;
    let py = i32(packed) >> 16u;
    return vec2<f32>(f32(px), f32(py));
}
// Affine transform (duplicated from draw_leaf; see "possibly dedup" note
// there): `matrx` is the 2x2 linear part, `translate` the offset, applied
// as matrx.xy * p.x + matrx.zw * p.y + translate (see transform_apply).
struct Transform {
    matrx: vec4<f32>,
    translate: vec2<f32>,
}
// Loads the six-f32 affine transform at index ix from the scene buffer.
fn read_transform(transform_base: u32, ix: u32) -> Transform {
    let base = transform_base + ix * 6u;
    var m: vec4<f32>;
    m.x = bitcast<f32>(scene[base]);
    m.y = bitcast<f32>(scene[base + 1u]);
    m.z = bitcast<f32>(scene[base + 2u]);
    m.w = bitcast<f32>(scene[base + 3u]);
    let t = vec2<f32>(bitcast<f32>(scene[base + 4u]), bitcast<f32>(scene[base + 5u]));
    return Transform(m, t);
}
// Applies an affine transform to a point.
fn transform_apply(transform: Transform, p: vec2<f32>) -> vec2<f32> {
    let m = transform.matrx;
    let linear_part = m.xy * p.x + m.zw * p.y;
    return linear_part + transform.translate;
}
// Rounds toward negative infinity; used for bbox min corners.
fn round_down(x: f32) -> i32 {
    return i32(floor(x));
}

// Rounds toward positive infinity; used for bbox max corners.
fn round_up(x: f32) -> i32 {
    return i32(ceil(x));
}
// Path segment decoding: one thread per tag byte. Reads the segment's
// points (f32 or packed i16), applies the transform, degree-raises lines
// and quadratics to cubics, writes the cubic out, and grows the path's
// integer bounding box atomically.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
) {
    let ix = global_id.x;
    // Four tag bytes per word; recover this byte and the exclusive monoid
    // prefix up to it (word-level monoid plus partial in-word reduction).
    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
    pathdata_base = config.pathdata_base;
    let shift = (ix & 3u) * 8u;
    var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
    tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
    var tag_byte = (tag_word >> shift) & 0xffu;

    let out = &path_bboxes[tm.path_ix];
    // NOTE(review): when this byte lacks PATH_TAG_PATH, linewidth keeps its
    // zero initial value, so the stroke bbox expansion below is a no-op —
    // confirm that is the intended behavior for mid-path segments.
    var linewidth: f32;
    if (tag_byte & PATH_TAG_PATH) != 0u {
        linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
        (*out).linewidth = linewidth;
        (*out).trans_ix = tm.trans_ix;
    }
    // Decode path data
    let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
    if seg_type != 0u {
        var p0: vec2<f32>;
        var p1: vec2<f32>;
        var p2: vec2<f32>;
        var p3: vec2<f32>;
        // f32 points occupy two words each; i16 points one word each.
        if (tag_byte & PATH_TAG_F32) != 0u {
            p0 = read_f32_point(tm.pathseg_offset);
            p1 = read_f32_point(tm.pathseg_offset + 2u);
            if seg_type >= PATH_TAG_QUADTO {
                p2 = read_f32_point(tm.pathseg_offset + 4u);
                if seg_type == PATH_TAG_CUBICTO {
                    p3 = read_f32_point(tm.pathseg_offset + 6u);
                }
            }
        } else {
            p0 = read_i16_point(tm.pathseg_offset);
            p1 = read_i16_point(tm.pathseg_offset + 1u);
            if seg_type >= PATH_TAG_QUADTO {
                p2 = read_i16_point(tm.pathseg_offset + 2u);
                if seg_type == PATH_TAG_CUBICTO {
                    p3 = read_i16_point(tm.pathseg_offset + 3u);
                }
            }
        }
        let transform = read_transform(config.transform_base, tm.trans_ix);
        //let transform = Transform(vec4<f32>(1.0, 0.0, 0.0, 1.0), vec2<f32>());
        p0 = transform_apply(transform, p0);
        p1 = transform_apply(transform, p1);
        var bbox = vec4<f32>(min(p0, p1), max(p0, p1));
        // Degree-raise
        if seg_type == PATH_TAG_LINETO {
            // Line -> cubic with control points at the 1/3 and 2/3 marks.
            p3 = p1;
            p2 = mix(p3, p0, 1.0 / 3.0);
            p1 = mix(p0, p3, 1.0 / 3.0);
        } else if seg_type >= PATH_TAG_QUADTO {
            p2 = transform_apply(transform, p2);
            bbox = vec4<f32>(min(bbox.xy, p2), max(bbox.zw, p2));
            if seg_type == PATH_TAG_CUBICTO {
                p3 = transform_apply(transform, p3);
                bbox = vec4<f32>(min(bbox.xy, p3), max(bbox.zw, p3));
            } else {
                // Quadratic -> cubic: standard degree elevation.
                p3 = p2;
                p2 = mix(p1, p2, 1.0 / 3.0);
                p1 = mix(p1, p0, 1.0 / 3.0);
            }
        }
        if linewidth >= 0.0 {
            // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
            // This is the correct bounding box, but we're not handling rendering
            // in the anisotropic case, so it may mismatch.
            // NOTE(review): the comment above originally said "isotropic";
            // stroke scaling elsewhere (draw_leaf) is isotropic, so the
            // mismatch is presumably for anisotropic transforms — confirm.
            let stroke = 0.5 * linewidth * vec2<f32>(length(transform.matrx.xz), length(transform.matrx.yw));
            bbox += vec4<f32>(-stroke, stroke);
        }
        cubics[global_id.x] = Cubic(p0, p1, p2, p3, tm.path_ix, 0u);
        // Update bounding box using atomics only. Computing a monoid is a
        // potential future optimization.
        if bbox.z > bbox.x || bbox.w > bbox.y {
            atomicMin(&(*out).x0, round_down(bbox.x));
            atomicMin(&(*out).y0, round_down(bbox.y));
            atomicMax(&(*out).x1, round_up(bbox.z));
            atomicMax(&(*out).y1, round_up(bbox.w));
        }
    }
}

View file

@ -14,14 +14,16 @@
//
// Also licensed under MIT license, at your choice.
#import config
#import pathtag
// Note: should have a single scene binding, path_tags are a slice
// in that; need a config uniform.
@group(0) @binding(0)
var<storage> path_tags: array<u32>;
var<storage> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage, read_write> reduced: array<TagMonoid>;
let LG_WG_SIZE = 8u;
@ -35,7 +37,7 @@ fn main(
@builtin(local_invocation_id) local_id: vec3<u32>,
) {
let ix = global_id.x;
let tag_word = path_tags[ix];
let tag_word = scene[config.pathtag_base + ix];
var agg = reduce_tag(tag_word);
sh_scratch[local_id.x] = agg;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {

View file

@ -14,15 +14,19 @@
//
// Also licensed under MIT license, at your choice.
#import config
#import pathtag
@group(0) @binding(0)
var<storage> path_tags: array<u32>;
var<storage> config: Config;
@group(0) @binding(1)
var<storage> reduced: array<TagMonoid>;
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> reduced: array<TagMonoid>;
@group(0) @binding(3)
var<storage, read_write> tag_monoids: array<TagMonoid>;
let LG_WG_SIZE = 8u;
@ -39,13 +43,13 @@ fn main(
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
var agg = tag_monoid_identity();
if (local_id.x < wg_id.x) {
if local_id.x < wg_id.x {
agg = reduced[local_id.x];
}
sh_parent[local_id.x] = agg;
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
workgroupBarrier();
if (local_id.x + (1u << i) < WG_SIZE) {
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_parent[local_id.x + (1u << i)];
agg = combine_tag_monoid(agg, other);
}
@ -54,12 +58,12 @@ fn main(
}
let ix = global_id.x;
let tag_word = path_tags[ix];
let tag_word = scene[config.pathtag_base + ix];
agg = reduce_tag(tag_word);
sh_monoid[local_id.x] = agg;
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
workgroupBarrier();
if (local_id.x >= 1u << i) {
if local_id.x >= 1u << i {
let other = sh_monoid[local_id.x - (1u << i)];
agg = combine_tag_monoid(other, agg);
}
@ -68,7 +72,7 @@ fn main(
}
// prefix up to this workgroup
var tm = sh_parent[0];
if (local_id.x > 0u) {
if local_id.x > 0u {
tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
}
// exclusive prefix sum, granularity of 4 tag bytes

View file

@ -0,0 +1,32 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// The annotated bounding box for a path. It has been transformed,
// but contains a link to the active transform, mostly for gradients.
// Coordinates are integer pixels (for the convenience of atomic update)
// but will probably become fixed-point fractions for rectangles.
struct PathBbox {
    // Integer pixel bounds: (x0, y0) min corner, (x1, y1) max corner.
    x0: i32,
    y0: i32,
    x1: i32,
    y1: i32,
    // Stroke width; negative values appear to encode fills (the consumers
    // test `linewidth >= 0.0` for stroke handling) — confirm.
    linewidth: f32,
    // Index of the transform active for this path (mostly for gradients).
    trans_ix: u32,
}
// Intersects two bboxes in (min.xy, max.zw) form: keep the larger min
// corner and the smaller max corner. May yield an empty (inverted) box.
fn bbox_intersect(a: vec4<f32>, b: vec4<f32>) -> vec4<f32> {
    let lo = max(a.xy, b.xy);
    let hi = min(a.zw, b.zw);
    return vec4(lo, hi);
}

View file

@ -0,0 +1,351 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Color mixing modes
let MIX_NORMAL = 0u;
let MIX_MULTIPLY = 1u;
let MIX_SCREEN = 2u;
let MIX_OVERLAY = 3u;
let MIX_DARKEN = 4u;
let MIX_LIGHTEN = 5u;
let MIX_COLOR_DODGE = 6u;
let MIX_COLOR_BURN = 7u;
let MIX_HARD_LIGHT = 8u;
let MIX_SOFT_LIGHT = 9u;
let MIX_DIFFERENCE = 10u;
let MIX_EXCLUSION = 11u;
let MIX_HUE = 12u;
let MIX_SATURATION = 13u;
let MIX_COLOR = 14u;
let MIX_LUMINOSITY = 15u;
let MIX_CLIP = 128u;
// Screen blend mode: the complement of the product of complements,
// written in expanded form.
fn screen(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
    return cb + cs - cb * cs;
}
// Color-dodge blend for a single channel: brightens the backdrop `cb`
// based on the source `cs`. The cb == 0 case is checked first so that
// cb == 0, cs == 1 returns 0 rather than hitting a 0/0 division.
fn color_dodge(cb: f32, cs: f32) -> f32 {
    if cb == 0.0 {
        return 0.0;
    } else if cs == 1.0 {
        return 1.0;
    } else {
        return min(1.0, cb / (1.0 - cs));
    }
}
// Color-burn blend for a single channel: darkens the backdrop `cb`
// based on the source `cs`. The cb == 1 case is checked first so that
// cb == 1, cs == 0 returns 1 rather than hitting a 0/0 division.
fn color_burn(cb: f32, cs: f32) -> f32 {
    if cb == 1.0 {
        return 1.0;
    } else if cs == 0.0 {
        return 0.0;
    } else {
        return 1.0 - min(1.0, (1.0 - cb) / cs);
    }
}
// Hard-light blend: per channel, multiply where the source is dark
// (cs <= 0.5) and screen where it is light. The comparison yields a
// boolean vector, converted to 0.0/1.0 so mix() acts as a per-channel
// selector between the two precomputed results.
fn hard_light(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
    return mix(
        screen(cb, 2.0 * cs - 1.0),
        cb * 2.0 * cs,
        vec3<f32>(cs <= vec3<f32>(0.5))
    );
}
// Soft-light blend. `d` is the D(cb) helper of the soft-light formula:
// a cubic polynomial for dark backdrops (cb <= 0.25) and sqrt otherwise,
// selected per channel via the 0/1 mix trick. The outer mix() then picks
// the darkening or lightening form depending on whether cs <= 0.5.
fn soft_light(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
    let d = mix(
        sqrt(cb),
        ((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb,
        vec3<f32>(cb <= vec3<f32>(0.25))
    );
    return mix(
        cb + (2.0 * cs - vec3(1.0)) * (d - cb),
        cb - (vec3(1.0) - 2.0 * cs) * cb * (vec3(1.0) - cb),
        vec3<f32>(cs <= vec3<f32>(0.5))
    );
}
// Saturation of an RGB color: the spread between its largest and
// smallest channel.
fn sat(c: vec3<f32>) -> f32 {
    let mx = max(c.x, max(c.y, c.z));
    let mn = min(c.x, min(c.y, c.z));
    return mx - mn;
}
// Luminosity of an RGB color, as a weighted sum of channels
// (weights 0.3/0.59/0.11, matching the original's `f` vector).
fn lum(c: vec3<f32>) -> f32 {
    return dot(c, vec3<f32>(0.3, 0.59, 0.11));
}
// Clamp a color into displayable range while preserving its luminosity:
// out-of-range channels are pulled back toward the luminosity value `l`
// by uniformly scaling each channel's offset from `l`.
fn clip_color(c: vec3<f32>) -> vec3<f32> {
    var c = c;
    let l = lum(c);
    // Smallest and largest channels determine how far out of range we are.
    let n = min(c.x, min(c.y, c.z));
    let x = max(c.x, max(c.y, c.z));
    if n < 0.0 {
        c = l + (((c - l) * l) / (l - n));
    }
    if x > 1.0 {
        c = l + (((c - l) * (1.0 - l)) / (x - l));
    }
    return c;
}
// Replace the luminosity of `c` with `l` by shifting all channels
// uniformly, then clip back into displayable range.
fn set_lum(c: vec3<f32>, l: f32) -> vec3<f32> {
    return clip_color(c + (l - lum(c)));
}
// Helper for set_sat. Takes the three channels already ordered
// (*cmin <= *cmid <= *cmax) and rescales them in place so that the
// resulting saturation (cmax - cmin) equals `s`, with cmin at 0.
// An achromatic input (cmax == cmin) collapses to all zeros.
fn set_sat_inner(
    cmin: ptr<function, f32>,
    cmid: ptr<function, f32>,
    cmax: ptr<function, f32>,
    s: f32
) {
    if *cmax > *cmin {
        *cmid = ((*cmid - *cmin) * s) / (*cmax - *cmin);
        *cmax = s;
    } else {
        *cmid = 0.0;
        *cmax = 0.0;
    }
    *cmin = 0.0;
}
// Set the saturation of `c` to `s`. The nested conditionals enumerate the
// six possible channel orderings, passing r/g/b to set_sat_inner in
// (min, mid, max) order so it can rescale them in place.
fn set_sat(c: vec3<f32>, s: f32) -> vec3<f32> {
    var r = c.r;
    var g = c.g;
    var b = c.b;
    if r <= g {
        if g <= b {
            // r <= g <= b
            set_sat_inner(&r, &g, &b, s);
        } else {
            if r <= b {
                // r <= b < g
                set_sat_inner(&r, &b, &g, s);
            } else {
                // b < r <= g
                set_sat_inner(&b, &r, &g, s);
            }
        }
    } else {
        if r <= b {
            // g < r <= b
            set_sat_inner(&g, &r, &b, s);
        } else {
            if g <= b {
                // g <= b < r
                set_sat_inner(&g, &b, &r, s);
            } else {
                // b < g < r
                set_sat_inner(&b, &g, &r, s);
            }
        }
    }
    return vec3<f32>(r, g, b);
}
// Blends two RGB colors together. The colors are assumed to be in sRGB
// color space, and this function does not take alpha into account.
// `mode` is one of the MIX_* constants above; each case computes the
// blended color per the corresponding separable or non-separable mode.
fn blend_mix(cb: vec3<f32>, cs: vec3<f32>, mode: u32) -> vec3<f32> {
    var b = vec3<f32>(0.0);
    switch mode {
        // MIX_MULTIPLY
        case 1u: {
            b = cb * cs;
        }
        // MIX_SCREEN
        case 2u: {
            b = screen(cb, cs);
        }
        // MIX_OVERLAY
        case 3u: {
            // Overlay is hard-light with the operands swapped.
            b = hard_light(cs, cb);
        }
        // MIX_DARKEN
        case 4u: {
            b = min(cb, cs);
        }
        // MIX_LIGHTEN
        case 5u: {
            b = max(cb, cs);
        }
        // MIX_COLOR_DODGE
        case 6u: {
            b = vec3<f32>(color_dodge(cb.x, cs.x), color_dodge(cb.y, cs.y), color_dodge(cb.z, cs.z));
        }
        // MIX_COLOR_BURN
        case 7u: {
            b = vec3<f32>(color_burn(cb.x, cs.x), color_burn(cb.y, cs.y), color_burn(cb.z, cs.z));
        }
        // MIX_HARD_LIGHT
        case 8u: {
            b = hard_light(cb, cs);
        }
        // MIX_SOFT_LIGHT
        case 9u: {
            b = soft_light(cb, cs);
        }
        // MIX_DIFFERENCE
        case 10u: {
            b = abs(cb - cs);
        }
        // MIX_EXCLUSION
        case 11u: {
            b = cb + cs - 2.0 * cb * cs;
        }
        // MIX_HUE
        case 12u: {
            b = set_lum(set_sat(cs, sat(cb)), lum(cb));
        }
        // MIX_SATURATION
        case 13u: {
            b = set_lum(set_sat(cb, sat(cs)), lum(cb));
        }
        // MIX_COLOR
        case 14u: {
            b = set_lum(cs, lum(cb));
        }
        // MIX_LUMINOSITY
        case 15u: {
            b = set_lum(cb, lum(cs));
        }
        // MIX_NORMAL and any unrecognized mode: source passes through.
        default: {
            b = cs;
        }
    }
    return b;
}
// Composition modes
let COMPOSE_CLEAR = 0u;
let COMPOSE_COPY = 1u;
let COMPOSE_DEST = 2u;
let COMPOSE_SRC_OVER = 3u;
let COMPOSE_DEST_OVER = 4u;
let COMPOSE_SRC_IN = 5u;
let COMPOSE_DEST_IN = 6u;
let COMPOSE_SRC_OUT = 7u;
let COMPOSE_DEST_OUT = 8u;
let COMPOSE_SRC_ATOP = 9u;
let COMPOSE_DEST_ATOP = 10u;
let COMPOSE_XOR = 11u;
let COMPOSE_PLUS = 12u;
let COMPOSE_PLUS_LIGHTER = 13u;
// Apply general compositing operation.
// Inputs are separated colors and alpha, output is premultiplied.
// `fa` and `fb` are the Porter-Duff coefficients applied to the source
// and backdrop contributions respectively; each COMPOSE_* case just
// selects its coefficient pair. The default (including COMPOSE_CLEAR,
// mode 0) leaves both at 0, producing transparent black.
fn blend_compose(
    cb: vec3<f32>,
    cs: vec3<f32>,
    ab: f32,
    as_: f32,
    mode: u32
) -> vec4<f32> {
    var fa = 0.0;
    var fb = 0.0;
    switch mode {
        // COMPOSE_COPY
        case 1u: {
            fa = 1.0;
            fb = 0.0;
        }
        // COMPOSE_DEST
        case 2u: {
            fa = 0.0;
            fb = 1.0;
        }
        // COMPOSE_SRC_OVER
        case 3u: {
            fa = 1.0;
            fb = 1.0 - as_;
        }
        // COMPOSE_DEST_OVER
        case 4u: {
            fa = 1.0 - ab;
            fb = 1.0;
        }
        // COMPOSE_SRC_IN
        case 5u: {
            fa = ab;
            fb = 0.0;
        }
        // COMPOSE_DEST_IN
        case 6u: {
            fa = 0.0;
            fb = as_;
        }
        // COMPOSE_SRC_OUT
        case 7u: {
            fa = 1.0 - ab;
            fb = 0.0;
        }
        // COMPOSE_DEST_OUT
        case 8u: {
            fa = 0.0;
            fb = 1.0 - as_;
        }
        // COMPOSE_SRC_ATOP
        case 9u: {
            fa = ab;
            fb = 1.0 - as_;
        }
        // COMPOSE_DEST_ATOP
        case 10u: {
            fa = 1.0 - ab;
            fb = as_;
        }
        // COMPOSE_XOR
        case 11u: {
            fa = 1.0 - ab;
            fb = 1.0 - as_;
        }
        // COMPOSE_PLUS
        case 12u: {
            fa = 1.0;
            fb = 1.0;
        }
        // COMPOSE_PLUS_LIGHTER
        case 13u: {
            // Saturating add of premultiplied colors; returns directly
            // rather than going through the fa/fb path.
            return min(vec4<f32>(1.0), vec4<f32>(as_ * cs + ab * cb, as_ + ab));
        }
        default: {}
    }
    // Weighted sum of the (un-premultiplied) inputs yields the
    // premultiplied output color and its alpha.
    let as_fa = as_ * fa;
    let ab_fb = ab * fb;
    let co = as_fa * cs + ab_fb * cb;
    return vec4<f32>(co, as_fa + ab_fb);
}
// Apply color mixing and composition. Both input and output colors are
// premultiplied RGB.
// `mode` packs the mix mode in bits 8 and up and the compose mode in the
// low byte; bit 15 corresponds to the MIX_CLIP flag (MIX_CLIP << 8u) and
// is stripped by the 0x7fff mask so clips take the fast path too.
fn blend_mix_compose(backdrop: vec4<f32>, src: vec4<f32>, mode: u32) -> vec4<f32> {
    let BLEND_DEFAULT = ((MIX_NORMAL << 8u) | COMPOSE_SRC_OVER);
    // Guards the divisions below against zero alpha.
    let EPSILON = 1e-15;
    if (mode & 0x7fffu) == BLEND_DEFAULT {
        // Both normal+src_over blend and clip case
        return backdrop * (1.0 - src.a) + src;
    }
    // Un-premultiply colors for blending
    let inv_src_a = 1.0 / (src.a + EPSILON);
    var cs = src.rgb * inv_src_a;
    let inv_backdrop_a = 1.0 / (backdrop.a + EPSILON);
    let cb = backdrop.rgb * inv_backdrop_a;
    let mix_mode = mode >> 8u;
    let mixed = blend_mix(cb, cs, mix_mode);
    // Interpolate toward the mixed color by backdrop coverage.
    cs = mix(cs, mixed, backdrop.a);
    let compose_mode = mode & 0xffu;
    if compose_mode == COMPOSE_SRC_OVER {
        // Fast path for the common compose mode.
        let co = mix(backdrop.rgb, cs, src.a);
        return vec4<f32>(co, src.a + backdrop.a * (1.0 - src.a));
    } else {
        return blend_compose(cb, cs, backdrop.a, src.a, compose_mode);
    }
}

View file

@ -0,0 +1,23 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// TODO: robust memory (failure flags)
// Bump allocators for the pipeline's dynamically sized outputs. Each
// field is the allocation high-water mark of one buffer, advanced with
// atomicAdd by the stage that allocates from it (e.g. tile_alloc bumps
// `tile`).
struct BumpAllocators {
    binning: atomic<u32>,
    ptcl: atomic<u32>,
    tile: atomic<u32>,
    segments: atomic<u32>,
}

View file

@ -0,0 +1,14 @@
// Element for the clip matching scan. Presumably `a` counts unmatched
// end-clips and `b` unmatched begin-clips in a range — TODO confirm
// against the clip stages that consume this.
struct Bic {
    a: u32,
    b: u32,
}
// Associative combine: begins carried by `x` cancel against ends in `y`.
fn bic_combine(x: Bic, y: Bic) -> Bic {
    let m = min(x.b, y.a);
    return Bic(x.a + y.a - m, x.b + y.b - m);
}
// One clip stack element: index of the parent clip plus its bounding box.
struct ClipEl {
    parent_ix: u32,
    bbox: vec4<f32>,
}

View file

@ -17,4 +17,29 @@
// Global render configuration, shared by all pipeline stages.
struct Config {
    // Target dimensions, in tiles.
    width_in_tiles: u32,
    height_in_tiles: u32,
    // Element counts in the scene.
    n_drawobj: u32,
    n_path: u32,
    n_clip: u32,
    // offsets within scene buffer (in u32 units)
    // Note: this is a difference from piet-gpu, which is in bytes
    pathtag_base: u32,
    pathdata_base: u32,
    drawtag_base: u32,
    drawdata_base: u32,
    transform_base: u32,
    linewidth_base: u32,
}
// Geometry of tiles and bins
// Tile size in pixels.
let TILE_WIDTH = 16u;
let TILE_HEIGHT = 16u;
// Number of tiles per bin
let N_TILE_X = 16u;
let N_TILE_Y = 16u;
//let N_TILE = N_TILE_X * N_TILE_Y;
// Must stay equal to N_TILE_X * N_TILE_Y; written as a literal (the
// product form above is commented out — presumably rejected by the
// shader compiler; confirm before changing).
let N_TILE = 256u;

View file

@ -0,0 +1,25 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// One cubic Bézier segment (control points p0..p3), already transformed,
// tagged with the path it belongs to.
struct Cubic {
    p0: vec2<f32>,
    p1: vec2<f32>,
    p2: vec2<f32>,
    p3: vec2<f32>,
    path_ix: u32,
    // Needed?
    // NOTE(review): appears to exist only to round out the struct size —
    // confirm against the host-side layout before removing.
    padding: u32,
}

View file

@ -0,0 +1,60 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// The DrawMonoid is computed as a prefix sum to aid in decoding
// the variable-length encoding of draw objects.
struct DrawMonoid {
    // The number of paths preceding this draw object.
    path_ix: u32,
    // The number of clip operations preceding this draw object.
    clip_ix: u32,
    // The offset of the encoded draw object in the scene (u32s).
    scene_offset: u32,
    // The offset of the associated info.
    info_offset: u32,
}
// Each draw object has a 32-bit draw tag, which is a bit-packed
// version of the draw monoid.
// Per map_draw_tag below: bit 0 marks a clip op, bits 2-4 give the
// number of scene words, bits 6-9 the number of info words; a zero tag
// contributes nothing.
let DRAWTAG_NOP = 0u;
let DRAWTAG_FILL_COLOR = 0x44u;
let DRAWTAG_FILL_LIN_GRADIENT = 0x114u;
let DRAWTAG_FILL_RAD_GRADIENT = 0x2dcu;
let DRAWTAG_FILL_IMAGE = 0x48u;
let DRAWTAG_BEGIN_CLIP = 0x05u;
let DRAWTAG_END_CLIP = 0x25u;
// Identity of combine_draw_monoid: the zero-initialized monoid.
fn draw_monoid_identity() -> DrawMonoid {
    return DrawMonoid();
}
// Monoid combine: all four counters add componentwise.
fn combine_draw_monoid(a: DrawMonoid, b: DrawMonoid) -> DrawMonoid {
    return DrawMonoid(
        a.path_ix + b.path_ix,
        a.clip_ix + b.clip_ix,
        a.scene_offset + b.scene_offset,
        a.info_offset + b.info_offset,
    );
}
// Unpack one draw tag word into its monoid contribution.
fn map_draw_tag(tag_word: u32) -> DrawMonoid {
    var c: DrawMonoid;
    // Every tag except NOP introduces one path.
    c.path_ix = u32(tag_word != DRAWTAG_NOP);
    // Bit 0 is set for clip operations (BEGIN_CLIP = 0x05, END_CLIP = 0x25).
    c.clip_ix = tag_word & 1u;
    // Bits 2-4: u32 words of scene data consumed by this object.
    c.scene_offset = (tag_word >> 2u) & 0x07u;
    // Bits 6-9: u32 words of per-object info.
    c.info_offset = (tag_word >> 6u) & 0x0fu;
    return c;
}

View file

@ -16,9 +16,13 @@
struct TagMonoid {
trans_ix: u32,
// TODO: I don't think pathseg_ix is used.
pathseg_ix: u32,
pathseg_offset: u32,
// Note: piet-gpu has linewidth and path, but not needed here
#ifdef full
linewidth_ix: u32,
path_ix: u32,
#endif
}
let PATH_TAG_SEG_TYPE = 3u;
@ -26,15 +30,14 @@ let PATH_TAG_LINETO = 1u;
let PATH_TAG_QUADTO = 2u;
let PATH_TAG_CUBICTO = 3u;
let PATH_TAG_F32 = 8u;
let PATH_TAG_PATH = 0x10u;
let PATH_TAG_TRANSFORM = 0x20u;
#ifdef full
let PATH_TAG_PATH = 0x10u;
let PATH_TAG_LINEWIDTH = 0x40u;
#endif
fn tag_monoid_identity() -> TagMonoid {
var c: TagMonoid;
c.trans_ix = 0u;
c.pathseg_ix = 0u;
c.pathseg_offset = 0u;
return c;
return TagMonoid();
}
fn combine_tag_monoid(a: TagMonoid, b: TagMonoid) -> TagMonoid {
@ -42,6 +45,10 @@ fn combine_tag_monoid(a: TagMonoid, b: TagMonoid) -> TagMonoid {
c.trans_ix = a.trans_ix + b.trans_ix;
c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
#ifdef full
c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
c.path_ix = a.path_ix + b.path_ix;
#endif
return c;
}
@ -55,5 +62,9 @@ fn reduce_tag(tag_word: u32) -> TagMonoid {
a += a >> 8u;
a += a >> 16u;
c.pathseg_offset = a & 0xffu;
#ifdef full
c.path_ix = countOneBits(tag_word & (PATH_TAG_PATH * 0x1010101u));
c.linewidth_ix = countOneBits(tag_word & (PATH_TAG_LINEWIDTH * 0x1010101u));
#endif
return c;
}

View file

@ -0,0 +1,72 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Layout of per-tile command list
// Initial allocation, in u32's.
let PTCL_INITIAL_ALLOC = 64u;
// Size of each continuation chunk allocated when a tile's list overflows.
let PTCL_INCREMENT = 256u;
// Amount of space taken by jump
let PTCL_HEADROOM = 2u;
// Tags for PTCL commands
// CMD_END terminates a tile's list; CMD_JUMP links to a continuation chunk.
let CMD_END = 0u;
let CMD_FILL = 1u;
let CMD_STROKE = 2u;
let CMD_SOLID = 3u;
let CMD_COLOR = 5u;
let CMD_LIN_GRAD = 6u;
let CMD_RAD_GRAD = 7u;
let CMD_BEGIN_CLIP = 9u;
let CMD_END_CLIP = 10u;
let CMD_JUMP = 11u;
// The individual PTCL structs are written here, but read/write is by
// hand in the relevant shaders
// Payload of CMD_FILL: segment list offset and winding backdrop.
struct CmdFill {
    tile: u32,
    backdrop: i32,
}
// Payload of CMD_STROKE.
struct CmdStroke {
    tile: u32,
    half_width: f32,
}
// Payload of CMD_JUMP: index of the continuation chunk.
struct CmdJump {
    new_ix: u32,
}
// Payload of CMD_COLOR: packed RGBA.
struct CmdColor {
    rgba_color: u32,
}
// Payload of CMD_LIN_GRAD: ramp index plus line equation coefficients.
struct CmdLinGrad {
    index: u32,
    line_x: f32,
    line_y: f32,
    line_c: f32,
}
// Payload of CMD_RAD_GRAD: ramp index plus the radial mapping.
struct CmdRadGrad {
    index: u32,
    // NOTE(review): `matrx` looks like a typo for `matrix` (the host-side
    // debug struct spells it `matrix`); harmless since reads/writes are by
    // hand, but worth renaming.
    matrx: vec4<f32>,
    xlat: vec2<f32>,
    c1: vec2<f32>,
    ra: f32,
    roff: f32,
}

View file

@ -0,0 +1,29 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Common datatypes for path and tile intermediate info.
// Per-path tile allocation record, written by tile_alloc.
struct Path {
    // bounding box in tiles
    bbox: vec4<u32>,
    // offset (in u32's) to tile rectangle
    tiles: u32,
}
// Per-tile state for one path.
struct Tile {
    // Signed winding backdrop for the tile; presumably the winding number
    // carried in from the left edge — confirm against the backdrop stage.
    backdrop: i32,
    // Reference into the segments buffer (count or offset — confirm
    // against path_coarse/fine).
    segments: u32,
}

View file

@ -0,0 +1,114 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
// Tile allocation (and zeroing of tiles)
#import config
#import bump
#import drawtag
#import tile
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> draw_bboxes: array<vec4<f32>>;
@group(0) @binding(3)
var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(4)
var<storage, read_write> paths: array<Path>;
@group(0) @binding(5)
var<storage, read_write> tiles: array<Tile>;
let WG_SIZE = 256u;
var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
var<workgroup> sh_tile_offset: u32;
// Allocates the tile rectangle for each draw object (one thread per
// object), then zeroes the freshly allocated tiles cooperatively.
@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
) {
    // scale factors useful for converting coordinates to tiles
    // TODO: make into constants
    let SX = 1.0 / f32(TILE_WIDTH);
    let SY = 1.0 / f32(TILE_HEIGHT);
    let drawobj_ix = global_id.x;
    // Out-of-range threads act as NOP objects (empty bbox, zero tiles).
    var drawtag = DRAWTAG_NOP;
    if drawobj_ix < config.n_drawobj {
        drawtag = scene[config.drawtag_base + drawobj_ix];
    }
    var x0 = 0;
    var y0 = 0;
    var x1 = 0;
    var y1 = 0;
    // END_CLIP draws no geometry of its own, so it gets an empty bbox.
    if drawtag != DRAWTAG_NOP && drawtag != DRAWTAG_END_CLIP {
        let bbox = draw_bboxes[drawobj_ix];
        // Convert the pixel bbox to an inclusive-exclusive tile range.
        x0 = i32(floor(bbox.x * SX));
        y0 = i32(floor(bbox.y * SY));
        x1 = i32(ceil(bbox.z * SX));
        y1 = i32(ceil(bbox.w * SY));
    }
    // Clamp to the viewport; an off-screen bbox collapses to zero tiles.
    let ux0 = u32(clamp(x0, 0, i32(config.width_in_tiles)));
    let uy0 = u32(clamp(y0, 0, i32(config.height_in_tiles)));
    let ux1 = u32(clamp(x1, 0, i32(config.width_in_tiles)));
    let uy1 = u32(clamp(y1, 0, i32(config.height_in_tiles)));
    let tile_count = (ux1 - ux0) * (uy1 - uy0);
    var total_tile_count = tile_count;
    sh_tile_count[local_id.x] = tile_count;
    // Hillis-Steele inclusive prefix sum of tile counts over the workgroup.
    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
        workgroupBarrier();
        if local_id.x >= (1u << i) {
            total_tile_count += sh_tile_count[local_id.x - (1u << i)];
        }
        workgroupBarrier();
        sh_tile_count[local_id.x] = total_tile_count;
    }
    // The last thread bump-allocates tiles for the whole workgroup and
    // stashes the base offset in its own paths slot for broadcast below.
    if local_id.x == WG_SIZE - 1u {
        paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
    }
    // Using storage barriers is a workaround for what appears to be a miscompilation
    // when a normal workgroup-shared variable is used to broadcast the value.
    storageBarrier();
    // drawobj_ix | (WG_SIZE - 1u) is the last lane's index: workgroups are
    // WG_SIZE-aligned, so OR-ing in the low bits reaches that slot.
    let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
    storageBarrier();
    if drawobj_ix < config.n_drawobj {
        // Exclusive prefix for this object = inclusive prefix of previous lane.
        let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
        let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
        let path = Path(bbox, tile_offset + tile_subix);
        paths[drawobj_ix] = path;
    }
    // zero allocated memory
    // Note: if the number of draw objects is small, utilization will be poor.
    // There are two things that can be done to improve that. One would be a
    // separate (indirect) dispatch. Another would be to have each workgroup
    // process fewer draw objects than the number of threads in the wg.
    let total_count = sh_tile_count[WG_SIZE - 1u];
    for (var i = local_id.x; i < total_count; i += WG_SIZE) {
        // Note: could format output buffer as u32 for even better load
        // balancing, as does piet-gpu.
        tiles[tile_offset + i] = Tile(0, 0u);
    }
}

5
piet-wgsl/src/debug.rs Normal file
View file

@ -0,0 +1,5 @@
#![allow(dead_code)]
pub mod clip;
pub mod draw;
pub mod fine;

View file

@ -0,0 +1,13 @@
use bytemuck::{Pod, Zeroable};
/// Host-side mirror of the GPU `ClipEl` struct (shader/shared/clip.wgsl).
#[derive(Copy, Clone, Debug, Zeroable, Pod)]
#[repr(C)]
pub struct ClipEl {
    /// Index of the parent clip in the clip stack.
    pub parent_ix: u32,
    /// Explicit padding; presumably matches the GPU struct's vec4
    /// alignment (bbox at a 16-byte offset) — confirm against WGSL layout.
    pub pad: [u32; 3],
    /// Clip bounding box, (x0, y0, x1, y1).
    pub bbox: [f32; 4],
}
/// Reinterpret a raw byte buffer (e.g. a downloaded GPU buffer) as a
/// vector of clip elements. Panics if the length or alignment doesn't
/// match `ClipEl`.
pub fn parse_clip_els(data: &[u8]) -> Vec<ClipEl> {
    bytemuck::cast_slice(data).to_vec()
}

View file

@ -0,0 +1,14 @@
use bytemuck::{Pod, Zeroable};
/// Host-side mirror of the GPU `DrawMonoid` struct
/// (shader/shared/drawtag.wgsl).
#[derive(Copy, Clone, Debug, Zeroable, Pod)]
#[repr(C)]
pub struct DrawMonoid {
    /// Number of paths preceding this draw object.
    pub path_ix: u32,
    /// Number of clip operations preceding this draw object.
    pub clip_ix: u32,
    /// Offset of the encoded draw object in the scene (u32s).
    pub scene_offset: u32,
    /// Offset of the associated info.
    pub info_offset: u32,
}
/// Reinterpret a raw byte buffer (e.g. a downloaded GPU buffer) as a
/// vector of draw monoids. Panics if the length or alignment doesn't
/// match `DrawMonoid`.
pub fn parse_draw_monoids(data: &[u8]) -> Vec<DrawMonoid> {
    bytemuck::cast_slice(data).to_vec()
}

153
piet-wgsl/src/debug/fine.rs Normal file
View file

@ -0,0 +1,153 @@
/// Decoded payload of CMD_FILL (see shader/shared/ptcl.wgsl).
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct Fill {
    pub tile: u32,
    pub backdrop: i32,
}
/// Decoded payload of CMD_STROKE.
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct Stroke {
    pub tile: u32,
    pub half_width: f32,
}
/// Decoded payload of CMD_COLOR.
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct Color {
    // Packed color bytes as stored in the PTCL word.
    abgr: [u8; 4],
}
/// Decoded payload of CMD_LIN_GRAD: ramp index plus line equation.
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct LinGrad {
    pub index: u32,
    pub line_x: f32,
    pub line_y: f32,
    pub line_c: f32,
}
/// Decoded payload of CMD_RAD_GRAD: ramp index plus the radial mapping.
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct RadGrad {
    pub index: u32,
    pub matrix: [f32; 4],
    pub xlat: [f32; 2],
    pub c1: [f32; 2],
    pub ra: f32,
    pub roff: f32,
}
/// One decoded PTCL command; variants mirror the CMD_* tags in
/// shader/shared/ptcl.wgsl.
#[derive(Copy, Clone, Debug)]
pub enum Command {
    Fill(Fill),
    Stroke(Stroke),
    Solid,
    Color(Color),
    LinGrad(LinGrad),
    RadGrad(RadGrad),
    BeginClip,
    // Payload is the blend word of CMD_END_CLIP.
    EndClip(u32),
    End,
}
// Words per tile slot; must match PTCL_INITIAL_ALLOC in ptcl.wgsl.
const PTCL_INITIAL_ALLOC: usize = 64;
/// The decoded command lists of all non-empty tiles.
#[derive(Debug)]
pub struct CommandList {
    // (tile x, tile y, commands) triples, in row-major tile order.
    pub tiles: Vec<(u32, u32, Vec<Command>)>,
}
impl CommandList {
    /// Decode the per-tile command lists from a raw PTCL buffer.
    ///
    /// `width` and `height` are in pixels; tiles are 16x16 (TILE_WIDTH /
    /// TILE_HEIGHT in the shaders). Tiles whose command list is empty are
    /// omitted from the result.
    pub fn parse(width: usize, height: usize, ptcl: &[u8]) -> Self {
        let width_tiles = width / 16;
        let height_tiles = height / 16;
        let mut tiles = Vec::new();
        // Walk the tile grid in row-major order; each tile's list starts
        // at a fixed slot of PTCL_INITIAL_ALLOC words.
        for tile_ix in 0..width_tiles * height_tiles {
            let x = tile_ix % width_tiles;
            let y = tile_ix / width_tiles;
            let commands = parse_commands(ptcl, tile_ix * PTCL_INITIAL_ALLOC);
            if !commands.is_empty() {
                tiles.push((x as u32, y as u32, commands));
            }
        }
        Self { tiles }
    }
}
/// Decode one tile's PTCL command stream, starting at word index `ix`.
///
/// Tag values mirror the CMD_* constants in shader/shared/ptcl.wgsl.
/// CMD_JUMP (11) is followed transparently into the continuation chunk;
/// CMD_END (0) or the end of the buffer terminates the list.
///
/// NOTE(review): payload words are read with unchecked indexing, so a
/// truncated or corrupt buffer will panic — acceptable for a debug tool.
fn parse_commands(ptcl: &[u8], mut ix: usize) -> Vec<Command> {
    let mut commands = vec![];
    let words: &[u32] = bytemuck::cast_slice(ptcl);
    while ix < words.len() {
        let tag = words[ix];
        ix += 1;
        match tag {
            // CMD_END
            0 => break,
            // CMD_FILL
            1 => {
                commands.push(Command::Fill(Fill {
                    tile: words[ix],
                    backdrop: words[ix + 1] as i32,
                }));
                ix += 2;
            }
            // CMD_STROKE
            2 => {
                commands.push(Command::Stroke(Stroke {
                    tile: words[ix],
                    half_width: bytemuck::cast(words[ix + 1]),
                }));
                ix += 2;
            }
            // CMD_SOLID (no payload)
            3 => {
                commands.push(Command::Solid);
            }
            // CMD_COLOR
            5 => {
                commands.push(Command::Color(Color {
                    abgr: bytemuck::cast(words[ix]),
                }));
                ix += 1;
            }
            // CMD_LIN_GRAD
            6 => {
                commands.push(Command::LinGrad(LinGrad {
                    index: words[ix],
                    line_x: bytemuck::cast(words[ix + 1]),
                    line_y: bytemuck::cast(words[ix + 2]),
                    line_c: bytemuck::cast(words[ix + 3]),
                }));
                ix += 4;
            }
            // CMD_RAD_GRAD (11 payload words; see CmdRadGrad in ptcl.wgsl)
            7 => {
                let matrix = [
                    bytemuck::cast(words[ix + 1]),
                    bytemuck::cast(words[ix + 2]),
                    bytemuck::cast(words[ix + 3]),
                    bytemuck::cast(words[ix + 4]),
                ];
                let xlat = [bytemuck::cast(words[ix + 5]), bytemuck::cast(words[ix + 6])];
                let c1 = [bytemuck::cast(words[ix + 7]), bytemuck::cast(words[ix + 8])];
                commands.push(Command::RadGrad(RadGrad {
                    index: words[ix],
                    matrix,
                    xlat,
                    c1,
                    ra: bytemuck::cast(words[ix + 9]),
                    roff: bytemuck::cast(words[ix + 10]),
                }));
                ix += 11;
            }
            // CMD_BEGIN_CLIP (no payload)
            9 => {
                commands.push(Command::BeginClip);
            }
            // CMD_END_CLIP (payload: blend word)
            10 => {
                commands.push(Command::EndClip(words[ix]));
                ix += 1;
            }
            // CMD_JUMP: continue decoding at the continuation chunk.
            11 => {
                ix = words[ix] as usize;
            }
            // Unknown tags (including unused 4 and 8) are skipped silently.
            _ => {}
        }
    }
    commands
}

View file

@ -17,7 +17,7 @@
use std::{
borrow::Cow,
collections::{hash_map::Entry, HashMap},
num::NonZeroU64,
num::{NonZeroU32, NonZeroU64},
sync::atomic::{AtomicU64, Ordering},
};
@ -25,7 +25,8 @@ use futures_intrusive::channel::shared::GenericOneshotReceiver;
use parking_lot::RawMutex;
use wgpu::{
util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferAsyncError, BufferSlice, BufferView,
ComputePipeline, Device, Queue,
ComputePipeline, Device, Queue, Texture, TextureAspect, TextureFormat, TextureUsages,
TextureView, TextureViewDimension,
};
pub type Error = Box<dyn std::error::Error>;
@ -58,12 +59,27 @@ pub struct BufProxy {
id: Id,
}
#[derive(Clone, Copy)]
pub struct ImageProxy {
width: u32,
height: u32,
// TODO: format
id: Id,
}
#[derive(Clone, Copy)]
pub enum ResourceProxy {
Buf(BufProxy),
Image(ImageProxy),
}
pub enum Command {
Upload(BufProxy, Vec<u8>),
UploadImage(ImageProxy, Vec<u8>),
// Discussion question: third argument is vec of resources?
// Maybe use tricks to make more ergonomic?
// Alternative: provide bufs & images as separate sequences, like piet-gpu.
Dispatch(ShaderId, (u32, u32, u32), Vec<BufProxy>),
Dispatch(ShaderId, (u32, u32, u32), Vec<ResourceProxy>),
Download(BufProxy),
Clear(BufProxy, u64, Option<NonZeroU64>),
}
@ -92,6 +108,7 @@ pub enum BindType {
#[derive(Default)]
struct BindMap {
buf_map: HashMap<Id, Buffer>,
image_map: HashMap<Id, (Texture, TextureView)>,
}
impl Engine {
@ -132,6 +149,16 @@ impl Engine {
},
count: None,
},
BindType::ImageRead => wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: true },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: None,
},
_ => todo!(),
})
.collect::<Vec<_>>();
@ -182,7 +209,60 @@ impl Engine {
});
bind_map.insert_buf(buf_proxy.id, buf);
}
Command::UploadImage(image_proxy, bytes) => {
let buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: None,
contents: &bytes,
usage: wgpu::BufferUsages::COPY_SRC,
});
let texture = device.create_texture(&wgpu::TextureDescriptor {
label: None,
size: wgpu::Extent3d {
width: image_proxy.width,
height: image_proxy.height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
usage: TextureUsages::TEXTURE_BINDING | TextureUsages::COPY_DST,
format: TextureFormat::Rgba8Unorm,
});
let texture_view = texture.create_view(&wgpu::TextureViewDescriptor {
label: None,
dimension: Some(TextureViewDimension::D2),
aspect: TextureAspect::All,
mip_level_count: None,
base_mip_level: 0,
base_array_layer: 0,
array_layer_count: None,
format: Some(TextureFormat::Rgba8Unorm),
});
encoder.copy_buffer_to_texture(
wgpu::ImageCopyBuffer {
buffer: &buf,
layout: wgpu::ImageDataLayout {
offset: 0,
bytes_per_row: NonZeroU32::new(image_proxy.width * 4),
rows_per_image: None,
},
},
wgpu::ImageCopyTexture {
texture: &texture,
mip_level: 0,
origin: wgpu::Origin3d { x: 0, y: 0, z: 0 },
aspect: TextureAspect::All,
},
wgpu::Extent3d {
width: image_proxy.width,
height: image_proxy.height,
depth_or_array_layers: 1,
},
);
bind_map.insert_image(image_proxy.id, texture, texture_view)
}
Command::Dispatch(shader_id, wg_size, bindings) => {
println!("dispatching {:?} with {} bindings", wg_size, bindings.len());
let shader = &self.shaders[shader_id.0];
let bind_group =
bind_map.create_bind_group(device, &shader.bind_group_layout, bindings)?;
@ -225,13 +305,28 @@ impl Recording {
buf_proxy
}
pub fn dispatch(
pub fn upload_image(
&mut self,
shader: ShaderId,
wg_size: (u32, u32, u32),
resources: impl Into<Vec<BufProxy>>,
) {
self.push(Command::Dispatch(shader, wg_size, resources.into()));
width: u32,
height: u32,
data: impl Into<Vec<u8>>,
) -> ImageProxy {
let data = data.into();
let image_proxy = ImageProxy::new(width, height);
self.push(Command::UploadImage(image_proxy, data));
image_proxy
}
pub fn dispatch<R>(&mut self, shader: ShaderId, wg_size: (u32, u32, u32), resources: R)
where
R: IntoIterator,
R::Item: Into<ResourceProxy>,
{
self.push(Command::Dispatch(
shader,
wg_size,
resources.into_iter().map(|r| r.into()).collect(),
));
}
pub fn download(&mut self, buf: BufProxy) {
@ -246,7 +341,53 @@ impl Recording {
impl BufProxy {
pub fn new(size: u64) -> Self {
let id = Id::next();
BufProxy { id, size }
BufProxy {
id,
size: size.max(16),
}
}
}
impl ImageProxy {
pub fn new(width: u32, height: u32) -> Self {
let id = Id::next();
ImageProxy { width, height, id }
}
}
impl ResourceProxy {
pub fn new_buf(size: u64) -> Self {
Self::Buf(BufProxy::new(size))
}
pub fn new_image(width: u32, height: u32) -> Self {
Self::Image(ImageProxy::new(width, height))
}
pub fn as_buf(&self) -> Option<&BufProxy> {
match self {
Self::Buf(proxy) => Some(&proxy),
_ => None,
}
}
pub fn as_image(&self) -> Option<&ImageProxy> {
match self {
Self::Image(proxy) => Some(&proxy),
_ => None,
}
}
}
impl From<BufProxy> for ResourceProxy {
fn from(value: BufProxy) -> Self {
Self::Buf(value)
}
}
impl From<ImageProxy> for ResourceProxy {
fn from(value: ImageProxy) -> Self {
Self::Image(value)
}
}
@ -263,34 +404,79 @@ impl BindMap {
self.buf_map.insert(id, buf);
}
fn insert_image(&mut self, id: Id, image: Texture, image_view: TextureView) {
self.image_map.insert(id, (image, image_view));
}
fn create_bind_group(
&mut self,
device: &Device,
layout: &BindGroupLayout,
bindings: &[BufProxy],
bindings: &[ResourceProxy],
) -> Result<BindGroup, Error> {
for proxy in bindings {
if let Entry::Vacant(v) = self.buf_map.entry(proxy.id) {
let buf = device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: proxy.size,
usage: wgpu::BufferUsages::STORAGE
| wgpu::BufferUsages::COPY_DST
| wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});
v.insert(buf);
match proxy {
ResourceProxy::Buf(proxy) => {
if let Entry::Vacant(v) = self.buf_map.entry(proxy.id) {
let buf = device.create_buffer(&wgpu::BufferDescriptor {
label: None,
size: proxy.size,
usage: wgpu::BufferUsages::STORAGE
| wgpu::BufferUsages::COPY_DST
| wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});
v.insert(buf);
}
}
ResourceProxy::Image(proxy) => {
if let Entry::Vacant(v) = self.image_map.entry(proxy.id) {
let texture = device.create_texture(&wgpu::TextureDescriptor {
label: None,
size: wgpu::Extent3d {
width: proxy.width,
height: proxy.height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
usage: TextureUsages::TEXTURE_BINDING | TextureUsages::COPY_DST,
format: TextureFormat::Rgba8Unorm,
});
let texture_view = texture.create_view(&wgpu::TextureViewDescriptor {
label: None,
dimension: Some(TextureViewDimension::D2),
aspect: TextureAspect::All,
mip_level_count: None,
base_mip_level: 0,
base_array_layer: 0,
array_layer_count: None,
format: Some(TextureFormat::Rgba8Unorm),
});
v.insert((texture, texture_view));
}
}
}
}
let entries = bindings
.iter()
.enumerate()
.map(|(i, proxy)| {
let buf = self.buf_map.get(&proxy.id).unwrap();
Ok(wgpu::BindGroupEntry {
binding: i as u32,
resource: buf.as_entire_binding(),
})
.map(|(i, proxy)| match proxy {
ResourceProxy::Buf(proxy) => {
let buf = self.buf_map.get(&proxy.id).unwrap();
Ok(wgpu::BindGroupEntry {
binding: i as u32,
resource: buf.as_entire_binding(),
})
}
ResourceProxy::Image(proxy) => {
let texture = self.image_map.get(&proxy.id).unwrap();
Ok(wgpu::BindGroupEntry {
binding: i as u32,
resource: wgpu::BindingResource::TextureView(&texture.1),
})
}
})
.collect::<Result<Vec<_>, Error>>()?;
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {

View file

@ -20,11 +20,12 @@ use std::{fs::File, io::BufWriter};
use engine::Engine;
use render::render;
use test_scene::dump_scene_info;
use wgpu::{Device, Queue};
use wgpu::{Device, Limits, Queue};
mod debug;
mod engine;
mod pico_svg;
mod ramp;
mod render;
mod shaders;
mod test_scene;
@ -33,12 +34,14 @@ async fn run() -> Result<(), Box<dyn std::error::Error>> {
let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY);
let adapter = instance.request_adapter(&Default::default()).await.unwrap();
let features = adapter.features();
let mut limits = Limits::default();
limits.max_storage_buffers_per_shader_stage = 16;
let (device, queue) = adapter
.request_device(
&wgpu::DeviceDescriptor {
label: None,
features: features & wgpu::Features::TIMESTAMP_QUERY,
limits: Default::default(),
limits,
},
None,
)
@ -49,25 +52,46 @@ async fn run() -> Result<(), Box<dyn std::error::Error>> {
Ok(())
}
fn dump_buf(buf: &[u32]) {
for (i, val) in buf.iter().enumerate() {
if *val != 0 {
let lo = val & 0x7fff_ffff;
if lo >= 0x3000_0000 && lo < 0x5000_0000 {
println!("{}: {:x} {}", i, val, f32::from_bits(*val));
} else {
println!("{}: {:x}", i, val);
}
}
}
}
async fn do_render(
device: &Device,
queue: &Queue,
engine: &mut Engine,
) -> Result<(), Box<dyn std::error::Error>> {
#[allow(unused)]
let shaders = shaders::init_shaders(device, engine)?;
let full_shaders = shaders::full_shaders(device, engine)?;
let scene = test_scene::gen_test_scene();
dump_scene_info(&scene);
let (recording, buf) = render(&scene, &shaders);
//test_scene::dump_scene_info(&scene);
//let (recording, buf) = render::render(&scene, &shaders);
let (recording, buf) = render::render_full(&scene, &full_shaders);
let downloads = engine.run_recording(&device, &queue, &recording)?;
let mapped = downloads.map();
device.poll(wgpu::Maintain::Wait);
let buf = mapped.get_mapped(buf).await?;
let file = File::create("image.png")?;
let w = BufWriter::new(file);
let encoder = png::Encoder::new(w, 1024, 1024);
let mut writer = encoder.write_header()?;
writer.write_image_data(&buf)?;
if false {
dump_buf(bytemuck::cast_slice(&buf));
} else {
let file = File::create("image.png")?;
let w = BufWriter::new(file);
let mut encoder = png::Encoder::new(w, 1024, 1024);
encoder.set_color(png::ColorType::Rgba);
let mut writer = encoder.write_header()?;
writer.write_image_data(&buf)?;
}
Ok(())
}

140
piet-wgsl/src/pico_svg.rs Normal file
View file

@ -0,0 +1,140 @@
//! A loader for a tiny fragment of SVG
use std::str::FromStr;
use roxmltree::{Document, Node};
use kurbo::{Affine, BezPath};
use piet_scene::Color;
/// A parsed SVG document, reduced to a flat display list of items.
pub struct PicoSvg {
    // Items in document (paint) order.
    pub items: Vec<Item>,
}
/// One drawable item: a filled or a stroked path.
pub enum Item {
    Fill(FillItem),
    Stroke(StrokeItem),
}
/// A stroked path with its stroke width (already scaled) and color.
pub struct StrokeItem {
    pub width: f64,
    pub color: Color,
    pub path: BezPath,
}
/// A filled path with its color.
pub struct FillItem {
    pub color: Color,
    pub path: BezPath,
}
/// Recursive-descent parser state: the output list plus the global scale.
struct Parser<'a> {
    scale: f64,
    items: &'a mut Vec<Item>,
}
impl PicoSvg {
    /// Parses an SVG string into a flat list of fill/stroke items.
    ///
    /// `scale` is applied uniformly to all geometry; a negative value
    /// additionally flips the y axis (see `Parser::rec_parse`).
    ///
    /// # Errors
    ///
    /// Returns an error if the XML is malformed, a path's `d` attribute
    /// is missing or unparseable, or a stroked path lacks `stroke-width`.
    pub fn load(xml_string: &str, scale: f64) -> Result<PicoSvg, Box<dyn std::error::Error>> {
        let doc = Document::parse(xml_string)?;
        let root = doc.root_element();
        let mut items = Vec::new();
        let mut parser = Parser::new(&mut items, scale);
        for node in root.children() {
            parser.rec_parse(node)?;
        }
        Ok(PicoSvg { items })
    }
}
impl<'a> Parser<'a> {
    /// Creates a parser that appends parsed items into `items`.
    fn new(items: &'a mut Vec<Item>, scale: f64) -> Parser<'a> {
        Parser { scale, items }
    }
    /// Recursively parses one XML node, descending into `g` groups and
    /// emitting fill/stroke items for `path` elements. All other element
    /// kinds are silently ignored.
    fn rec_parse(&mut self, node: Node) -> Result<(), Box<dyn std::error::Error>> {
        // NOTE(review): a negative scale flips the y axis and translates by
        // 1536 — presumably tuned for a specific test asset; confirm before
        // relying on it for other documents.
        let transform = if self.scale >= 0.0 {
            Affine::scale(self.scale)
        } else {
            Affine::new([-self.scale, 0.0, 0.0, self.scale, 0.0, 1536.0])
        };
        if node.is_element() {
            match node.tag_name().name() {
                "g" => {
                    // Group: recurse into children. Group-level transforms
                    // and styles are not handled by this tiny loader.
                    for child in node.children() {
                        self.rec_parse(child)?;
                    }
                }
                "path" => {
                    let d = node.attribute("d").ok_or("missing 'd' attribute")?;
                    let bp = BezPath::from_svg(d)?;
                    // Bake the global transform into the path geometry.
                    let path = transform * bp;
                    // TODO: default fill color is black, but this is overridden in tiger to this logic.
                    if let Some(fill_color) = node.attribute("fill") {
                        if fill_color != "none" {
                            let color = parse_color(fill_color);
                            let color = modify_opacity(color, "fill-opacity", node);
                            // Path is cloned because a stroke may also use it.
                            self.items.push(Item::Fill(FillItem {
                                color,
                                path: path.clone(),
                            }));
                        }
                    }
                    if let Some(stroke_color) = node.attribute("stroke") {
                        if stroke_color != "none" {
                            // Stroke width is in user units; scale it like
                            // the geometry (abs: a negative scale only flips).
                            let width = self.scale.abs()
                                * f64::from_str(
                                    node.attribute("stroke-width").ok_or("missing width")?,
                                )?;
                            let color = parse_color(stroke_color);
                            let color = modify_opacity(color, "stroke-opacity", node);
                            self.items
                                .push(Item::Stroke(StrokeItem { width, color, path }));
                        }
                    }
                }
                _ => (),
            }
        }
        Ok(())
    }
}
/// Parses a small subset of SVG/CSS color syntax.
///
/// Supported forms: `#rgb`, `#rrggbb`, and `rgb(r,g,b)` with decimal
/// components. Previously this panicked on empty input, non-hex digits
/// after `#`, and malformed `rgb(...)` strings; all malformed input now
/// falls back to the same loud semi-transparent magenta already used for
/// unrecognized syntax, so bad colors are visible instead of crashing.
fn parse_color(color: &str) -> Color {
    if let Some(hex_str) = color.strip_prefix('#') {
        if let Ok(mut hex) = u32::from_str_radix(hex_str, 16) {
            if color.len() == 4 {
                // Expand #rgb shorthand: each nibble is doubled (0xf -> 0xff).
                hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
            }
            // Append full alpha, then split the packed RGBA into channels.
            let rgba = (hex << 8) + 0xff;
            let (r, g, b, a) = (
                ((rgba >> 24) & 255) as u8,
                ((rgba >> 16) & 255) as u8,
                ((rgba >> 8) & 255) as u8,
                (rgba & 255) as u8,
            );
            return Color::rgba8(r, g, b, a);
        }
    } else if let Some(args) = color
        .strip_prefix("rgb(")
        .and_then(|s| s.strip_suffix(')'))
    {
        let mut iter = args.split(',');
        // Each component is a decimal u8; surrounding whitespace is tolerated.
        let mut channel = || iter.next().and_then(|s| u8::from_str(s.trim()).ok());
        if let (Some(r), Some(g), Some(b)) = (channel(), channel(), channel()) {
            return Color::rgb8(r, g, b);
        }
    }
    // Unknown or malformed color: loud semi-transparent magenta.
    Color::rgba8(255, 0, 255, 0x80)
}
/// Applies an optional opacity attribute (`fill-opacity`/`stroke-opacity`)
/// to `color`. The value may be a plain number in [0, 1] or a percentage
/// ending in `%`; unparseable values fall back to fully opaque. The alpha
/// is clamped to [0, 1] before being scaled to a rounded `u8`.
fn modify_opacity(mut color: Color, attr_name: &str, node: Node) -> Color {
    match node.attribute(attr_name) {
        None => color,
        Some(opacity) => {
            let alpha: f64 = match opacity.strip_suffix('%') {
                // Percentage form: strip the sign and rescale to [0, 1].
                Some(pct) => pct.parse().unwrap_or(100.0) * 0.01,
                None => opacity.parse().unwrap_or(1.0),
            };
            color.a = (alpha.min(1.0).max(0.0) * 255.0).round() as u8;
            color
        }
    }
}

137
piet-wgsl/src/ramp.rs Normal file
View file

@ -0,0 +1,137 @@
use piet_scene::{Color, GradientStop, GradientStops};
use std::collections::HashMap;
// Number of texels in each baked gradient ramp row.
const N_SAMPLES: usize = 512;
// Number of ramps retained across epochs before eviction kicks in.
const RETAINED_COUNT: usize = 64;
/// Cache of baked gradient ramps, deduplicated by stop list.
///
/// Each ramp occupies `N_SAMPLES` premultiplied RGBA8 words in `data`;
/// `map` records, per stop list, the ramp id and the epoch in which it
/// was last used (for eviction in `advance`/`add`).
#[derive(Default)]
pub struct RampCache {
    epoch: u64,
    map: HashMap<GradientStops, (u32, u64)>,
    data: Vec<u32>,
}
impl RampCache {
    /// Starts a new epoch (typically once per frame) and evicts ramps
    /// beyond the retained set.
    ///
    /// NOTE(review): eviction retains entries with id < RETAINED_COUNT and
    /// truncates the data to match — this assumes ids above the threshold
    /// were allocated most recently; confirm against the allocation order
    /// in `add`.
    pub fn advance(&mut self) {
        self.epoch += 1;
        if self.map.len() > RETAINED_COUNT {
            self.map
                .retain(|_key, value| value.0 < RETAINED_COUNT as u32);
            self.data.truncate(RETAINED_COUNT * N_SAMPLES);
        }
    }
    /// Returns the ramp id for `stops`, baking and inserting it if absent.
    ///
    /// Cache hits refresh the entry's epoch. Beyond `RETAINED_COUNT`
    /// entries, a slot unused for at least two epochs is rewritten in
    /// place; otherwise the data array grows.
    pub fn add(&mut self, stops: &[GradientStop]) -> u32 {
        if let Some(entry) = self.map.get_mut(stops) {
            // Hit: mark as used this epoch.
            entry.1 = self.epoch;
            entry.0
        } else if self.map.len() < RETAINED_COUNT {
            // Miss with room in the retained set: append a new ramp.
            let id = (self.data.len() / N_SAMPLES) as u32;
            self.data.extend(make_ramp(stops));
            self.map.insert(stops.into(), (id, self.epoch));
            id
        } else {
            // Miss with the retained set full: look for a stale slot
            // (unused for >= 2 epochs) to rewrite in place.
            let mut reuse = None;
            for (stops, (id, epoch)) in &self.map {
                if *epoch + 2 < self.epoch {
                    reuse = Some((stops.to_owned(), *id));
                    break;
                }
            }
            if let Some((old_stops, id)) = reuse {
                self.map.remove(&old_stops);
                let start = id as usize * N_SAMPLES;
                // Overwrite the old ramp's texels with the new ramp.
                for (dst, src) in self.data[start..start + N_SAMPLES]
                    .iter_mut()
                    .zip(make_ramp(stops))
                {
                    *dst = src;
                }
                self.map.insert(stops.into(), (id, self.epoch));
                id
            } else {
                // Nothing stale: grow the data array anyway.
                let id = (self.data.len() / N_SAMPLES) as u32;
                self.data.extend(make_ramp(stops));
                self.map.insert(stops.into(), (id, self.epoch));
                id
            }
        }
    }
    /// Raw texel data for all ramps, row-major (one ramp per row).
    pub fn data(&self) -> &[u32] {
        &self.data
    }
    /// Width in texels of the ramp texture (samples per ramp).
    pub fn width(&self) -> u32 {
        N_SAMPLES as u32
    }
    /// Height in texels of the ramp texture (number of ramps).
    pub fn height(&self) -> u32 {
        (self.data.len() / N_SAMPLES) as u32
    }
}
/// Bakes a gradient stop list into `N_SAMPLES` premultiplied RGBA8 texels.
///
/// Samples u uniformly over [0, 1] while walking the stop list once,
/// linearly interpolating between the two stops that bracket each sample.
/// NOTE(review): offsets below `stops[0].offset` are treated as if the
/// first stop sat at 0, and samples past the last stop hold its color —
/// confirm this matches the intended gradient semantics. Panics if
/// `stops` is empty.
fn make_ramp<'a>(stops: &'a [GradientStop]) -> impl Iterator<Item = u32> + 'a {
    let mut last_u = 0.0;
    let mut last_c = ColorF64::from_color(stops[0].color);
    let mut this_u = last_u;
    let mut this_c = last_c;
    let mut j = 0;
    (0..N_SAMPLES).map(move |i| {
        let u = (i as f64) / (N_SAMPLES - 1) as f64;
        // Advance the bracketing pair until this_u >= u, or run off the
        // end of the stop list (in which case the last color is held).
        while u > this_u {
            last_u = this_u;
            last_c = this_c;
            if let Some(s) = stops.get(j + 1) {
                this_u = s.offset as f64;
                this_c = ColorF64::from_color(s.color);
                j += 1;
            } else {
                break;
            }
        }
        let du = this_u - last_u;
        // Guard against zero-width intervals (coincident stops).
        let c = if du < 1e-9 {
            this_c
        } else {
            last_c.lerp(&this_c, (u - last_u) / du)
        };
        c.to_premul_u32()
    })
}
/// A color held as four f64 components in [0, 1] (r, g, b, a), used for
/// precise ramp interpolation before quantizing back to 8 bits.
#[derive(Copy, Clone, Debug)]
struct ColorF64([f64; 4]);
impl ColorF64 {
    /// Converts an 8-bit-per-channel color to normalized f64 components.
    fn from_color(color: Color) -> Self {
        let bytes = [color.r, color.g, color.b, color.a];
        let mut channels = [0.0; 4];
        for i in 0..4 {
            channels[i] = bytes[i] as f64 / 255.0;
        }
        Self(channels)
    }
    /// Componentwise linear interpolation: `self` at a = 0, `other` at a = 1.
    fn lerp(&self, other: &Self, a: f64) -> Self {
        let mut mixed = [0.0; 4];
        for i in 0..4 {
            mixed[i] = self.0[i] * (1.0 - a) + other.0[i] * a;
        }
        Self(mixed)
    }
    /// Packs as premultiplied RGBA8, r in the low byte. Channels are
    /// clamped to [0, 1] and truncated (not rounded) when scaled to u8,
    /// matching the original conversion.
    fn to_premul_u32(&self) -> u32 {
        let alpha = self.0[3].min(1.0).max(0.0);
        let premul = |c: f64| ((c * alpha).min(1.0).max(0.0) * 255.0) as u32;
        let r = premul(self.0[0]);
        let g = premul(self.0[1]);
        let b = premul(self.0[2]);
        let a = (alpha * 255.0) as u32;
        r | (g << 8) | (b << 16) | (a << 24)
    }
}

View file

@ -4,17 +4,39 @@ use bytemuck::{Pod, Zeroable};
use piet_scene::Scene;
use crate::{
engine::{BufProxy, Recording},
shaders::{self, Shaders},
engine::{BufProxy, Recording, ResourceProxy},
shaders::{self, FullShaders, Shaders},
};
const TAG_MONOID_SIZE: u64 = 12;
const TAG_MONOID_FULL_SIZE: u64 = 20;
const PATH_BBOX_SIZE: u64 = 24;
const CUBIC_SIZE: u64 = 40;
const DRAWMONOID_SIZE: u64 = 16;
const MAX_DRAWINFO_SIZE: u64 = 44;
const CLIP_BIC_SIZE: u64 = 8;
const CLIP_EL_SIZE: u64 = 32;
const CLIP_INP_SIZE: u64 = 4;
const CLIP_BBOX_SIZE: u64 = 16;
const PATH_SIZE: u64 = 32;
const DRAW_BBOX_SIZE: u64 = 16;
const BUMP_SIZE: u64 = 16;
const BIN_HEADER_SIZE: u64 = 8;
#[repr(C)]
#[derive(Clone, Copy, Zeroable, Pod)]
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
struct Config {
width_in_tiles: u32,
height_in_tiles: u32,
n_drawobj: u32,
n_path: u32,
n_clip: u32,
pathtag_base: u32,
pathdata_base: u32,
drawtag_base: u32,
drawdata_base: u32,
transform_base: u32,
linewidth_base: u32,
}
#[repr(C)]
@ -26,22 +48,39 @@ pub struct PathSegment {
next: u32,
}
/// Converts a byte length into a count of u32 words (any trailing
/// partial word is truncated); scene stream base offsets are in words.
fn size_to_words(byte_size: usize) -> u32 {
    let word = std::mem::size_of::<u32>();
    (byte_size / word) as u32
}
pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
let mut recording = Recording::default();
let data = scene.data();
let n_pathtag = data.tag_stream.len();
let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
let mut tag_data: Vec<u8> = Vec::with_capacity(pathtag_padded);
tag_data.extend(&data.tag_stream);
tag_data.resize(pathtag_padded, 0);
let pathtag_buf = recording.upload(tag_data);
let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
let pathtag_base = size_to_words(scene.len());
scene.extend(&data.tag_stream);
scene.resize(pathtag_padded, 0);
let pathdata_base = size_to_words(scene.len());
scene.extend(&data.pathseg_stream);
let config = Config {
width_in_tiles: 64,
height_in_tiles: 64,
pathtag_base,
pathdata_base,
..Default::default()
};
let scene_buf = recording.upload(scene);
let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_SIZE);
// TODO: really only need pathtag_wgs - 1
recording.dispatch(
shaders.pathtag_reduce,
(pathtag_wgs as u32, 1, 1),
[pathtag_buf, reduced_buf],
[config_buf, scene_buf, reduced_buf],
);
let tagmonoid_buf =
@ -49,19 +88,11 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
recording.dispatch(
shaders.pathtag_scan,
(pathtag_wgs as u32, 1, 1),
[pathtag_buf, reduced_buf, tagmonoid_buf],
[config_buf, scene_buf, reduced_buf, tagmonoid_buf],
);
let path_coarse_wgs = (data.n_pathseg + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
// The clone here is kinda BS, think about reducing copies
// Of course, we'll probably end up concatenating into a single scene binding.
let pathdata_buf = recording.upload(data.pathseg_stream.clone());
//let cubics_buf = BufProxy::new(data.n_pathseg as u64 * 32);
let config = Config {
width_in_tiles: 64,
height_in_tiles: 64,
};
let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
let path_coarse_wgs =
(n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
// TODO: more principled size calc
let tiles_buf = BufProxy::new(4097 * 8);
let segments_buf = BufProxy::new(256 * 24);
@ -70,10 +101,9 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
shaders.path_coarse,
(path_coarse_wgs, 1, 1),
[
pathtag_buf,
tagmonoid_buf,
pathdata_buf,
config_buf,
scene_buf,
tagmonoid_buf,
tiles_buf,
segments_buf,
],
@ -95,6 +125,280 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
(recording, out_buf)
}
/// Records the full rendering pipeline for `scene`, returning the
/// recording and a proxy for the final RGBA8 output buffer (which is
/// also scheduled for download).
///
/// Stage order: gradient ramp resolution, scene stream packing, path tag
/// reduce/scan, bbox clear, path segment processing, draw object
/// reduce/leaf, clip reduce/leaf, binning, tile allocation, coarse path
/// rasterization, backdrop, coarse command list, and fine rasterization.
/// Buffer sizes and binding orders must match the corresponding shaders.
pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy) {
    let mut recording = Recording::default();
    // Resolve gradient resource patches into ramp ids; the drawdata
    // stream is patched with these ids while being packed below.
    let mut ramps = crate::ramp::RampCache::default();
    let mut drawdata_patches: Vec<(usize, u32)> = vec![];
    let data = scene.data();
    let stop_data = &data.resources.stops;
    for patch in &data.resources.patches {
        use piet_scene::ResourcePatch;
        match patch {
            ResourcePatch::Ramp { offset, stops } => {
                let ramp_id = ramps.add(&stop_data[stops.clone()]);
                drawdata_patches.push((*offset, ramp_id));
            }
        }
    }
    let gradient_image = if drawdata_patches.is_empty() {
        // Dummy 1x1 image so the fine shader's image binding is always valid.
        ResourceProxy::new_image(1, 1)
    } else {
        let data = ramps.data();
        let width = ramps.width();
        let height = ramps.height();
        let data: &[u8] = bytemuck::cast_slice(data);
        println!(
            "gradient image: {}x{} ({} bytes)",
            width,
            height,
            data.len()
        );
        ResourceProxy::Image(recording.upload_image(width, height, data))
    };
    // Pack all scene streams into one buffer, recording each stream's
    // base offset in u32 words. Path tags are zero-padded to a whole
    // number of reduce workgroups.
    let n_pathtag = data.tag_stream.len();
    let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
    // TODO: can compute size accurately, avoid reallocation
    let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
    let pathtag_base = size_to_words(scene.len());
    scene.extend(&data.tag_stream);
    scene.resize(pathtag_padded, 0);
    let pathdata_base = size_to_words(scene.len());
    scene.extend(&data.pathseg_stream);
    let drawtag_base = size_to_words(scene.len());
    scene.extend(bytemuck::cast_slice(&data.drawtag_stream));
    let drawdata_base = size_to_words(scene.len());
    if !drawdata_patches.is_empty() {
        // Splice each 4-byte ramp id into the drawdata stream at its
        // patch offset (patches are assumed to be in ascending order).
        let mut pos = 0;
        for patch in drawdata_patches {
            let offset = patch.0;
            let value = patch.1;
            if pos < offset {
                scene.extend_from_slice(&data.drawdata_stream[pos..offset]);
            }
            scene.extend_from_slice(bytemuck::bytes_of(&value));
            pos = offset + 4;
        }
        if pos < data.drawdata_stream.len() {
            scene.extend_from_slice(&data.drawdata_stream[pos..])
        }
    } else {
        scene.extend(&data.drawdata_stream);
    }
    let transform_base = size_to_words(scene.len());
    scene.extend(bytemuck::cast_slice(&data.transform_stream));
    let linewidth_base = size_to_words(scene.len());
    scene.extend(bytemuck::cast_slice(&data.linewidth_stream));
    let n_path = data.n_path;
    // TODO: calculate for real when we do rectangles
    let n_drawobj = n_path;
    let n_clip = data.n_clip;
    let config = Config {
        width_in_tiles: 64,
        height_in_tiles: 64,
        n_drawobj,
        n_path,
        n_clip,
        pathtag_base,
        pathdata_base,
        drawtag_base,
        drawdata_base,
        transform_base,
        linewidth_base,
    };
    println!("{:?}", config);
    let scene_buf = ResourceProxy::Buf(recording.upload(scene));
    let config_buf = ResourceProxy::Buf(recording.upload(bytemuck::bytes_of(&config).to_owned()));
    // Path tag monoid reduce + scan over the padded tag stream.
    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
    let reduced_buf = ResourceProxy::new_buf(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
    // TODO: really only need pathtag_wgs - 1
    recording.dispatch(
        shaders.pathtag_reduce,
        (pathtag_wgs as u32, 1, 1),
        [config_buf, scene_buf, reduced_buf],
    );
    let tagmonoid_buf = ResourceProxy::new_buf(
        pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE,
    );
    recording.dispatch(
        shaders.pathtag_scan,
        (pathtag_wgs as u32, 1, 1),
        [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
    );
    // Clear per-path bounding boxes, then flatten path segments into
    // cubics while accumulating bboxes.
    let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
    let path_bbox_buf = ResourceProxy::new_buf(n_path as u64 * PATH_BBOX_SIZE);
    recording.dispatch(
        shaders.bbox_clear,
        (drawobj_wgs, 1, 1),
        [config_buf, path_bbox_buf],
    );
    let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE);
    let path_coarse_wgs =
        (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
    recording.dispatch(
        shaders.pathseg,
        (path_coarse_wgs, 1, 1),
        [
            config_buf,
            scene_buf,
            tagmonoid_buf,
            path_bbox_buf,
            cubic_buf,
        ],
    );
    // Draw object monoid reduce + leaf.
    let draw_reduced_buf = ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE);
    recording.dispatch(
        shaders.draw_reduce,
        (drawobj_wgs, 1, 1),
        [config_buf, scene_buf, draw_reduced_buf],
    );
    let draw_monoid_buf = ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE);
    let info_buf = ResourceProxy::new_buf(n_drawobj as u64 * MAX_DRAWINFO_SIZE);
    let clip_inp_buf = ResourceProxy::new_buf(data.n_clip as u64 * CLIP_INP_SIZE);
    recording.dispatch(
        shaders.draw_leaf,
        (drawobj_wgs, 1, 1),
        [
            config_buf,
            scene_buf,
            draw_reduced_buf,
            path_bbox_buf,
            draw_monoid_buf,
            info_buf,
            clip_inp_buf,
        ],
    );
    // Clip stack reduce + leaf; both dispatches are skipped entirely
    // when the scene has no (or too few) clips.
    let clip_el_buf = ResourceProxy::new_buf(data.n_clip as u64 * CLIP_EL_SIZE);
    let clip_bic_buf =
        ResourceProxy::new_buf((n_clip / shaders::CLIP_REDUCE_WG) as u64 * CLIP_BIC_SIZE);
    let clip_wg_reduce = n_clip.saturating_sub(1) / shaders::CLIP_REDUCE_WG;
    if clip_wg_reduce > 0 {
        recording.dispatch(
            shaders.clip_reduce,
            (clip_wg_reduce, 1, 1),
            [
                config_buf,
                clip_inp_buf,
                path_bbox_buf,
                clip_bic_buf,
                clip_el_buf,
            ],
        );
    }
    let clip_wg = (n_clip + shaders::CLIP_REDUCE_WG - 1) / shaders::CLIP_REDUCE_WG;
    let clip_bbox_buf = ResourceProxy::new_buf(n_clip as u64 * CLIP_BBOX_SIZE);
    if clip_wg > 0 {
        recording.dispatch(
            shaders.clip_leaf,
            (clip_wg, 1, 1),
            [
                config_buf,
                clip_inp_buf,
                path_bbox_buf,
                clip_bic_buf,
                clip_el_buf,
                draw_monoid_buf,
                clip_bbox_buf,
            ],
        );
    }
    // Binning: assign draw objects to 16x16-tile bins. The bump buffer
    // holds shared atomic allocation counters and must start zeroed.
    let draw_bbox_buf = ResourceProxy::new_buf(n_path as u64 * DRAW_BBOX_SIZE);
    let bump_buf = BufProxy::new(BUMP_SIZE);
    let bin_data_buf = ResourceProxy::new_buf(1 << 20);
    let width_in_bins = (config.width_in_tiles + 15) / 16;
    let height_in_bins = (config.height_in_tiles + 15) / 16;
    let n_bins = width_in_bins * height_in_bins;
    let bin_header_buf = ResourceProxy::new_buf((n_bins * drawobj_wgs) as u64 * BIN_HEADER_SIZE);
    recording.clear_all(bump_buf);
    let bump_buf = ResourceProxy::Buf(bump_buf);
    recording.dispatch(
        shaders.binning,
        (drawobj_wgs, 1, 1),
        [
            config_buf,
            draw_monoid_buf,
            path_bbox_buf,
            clip_bbox_buf,
            draw_bbox_buf,
            bump_buf,
            bin_data_buf,
            bin_header_buf,
        ],
    );
    // Tile allocation, coarse path rasterization, and backdrop.
    // NOTE(review): the 1 << 20 / 1 << 24 buffer sizes are fixed guesses
    // (see "more principled size calc" TODOs elsewhere) — large scenes
    // could presumably overflow them.
    let path_buf = ResourceProxy::new_buf(n_path as u64 * PATH_SIZE);
    let tile_buf = ResourceProxy::new_buf(1 << 20);
    let path_wgs = (n_path + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
    recording.dispatch(
        shaders.tile_alloc,
        (path_wgs, 1, 1),
        [
            config_buf,
            scene_buf,
            draw_bbox_buf,
            bump_buf,
            path_buf,
            tile_buf,
        ],
    );
    let segments_buf = ResourceProxy::new_buf(1 << 24);
    recording.dispatch(
        shaders.path_coarse,
        (path_coarse_wgs, 1, 1),
        [
            config_buf,
            scene_buf,
            tagmonoid_buf,
            cubic_buf,
            path_buf,
            bump_buf,
            tile_buf,
            segments_buf,
        ],
    );
    recording.dispatch(
        shaders.backdrop,
        (path_wgs, 1, 1),
        [config_buf, path_buf, tile_buf],
    );
    // Coarse rasterization produces the per-tile command list (ptcl),
    // consumed by the fine rasterizer, one workgroup per tile.
    let ptcl_buf = ResourceProxy::new_buf(1 << 24);
    recording.dispatch(
        shaders.coarse,
        (width_in_bins, height_in_bins, 1),
        [
            config_buf,
            scene_buf,
            draw_monoid_buf,
            bin_header_buf,
            bin_data_buf,
            path_buf,
            tile_buf,
            info_buf,
            bump_buf,
            ptcl_buf,
        ],
    );
    // 1024 = bytes per 16x16 tile of RGBA8 pixels.
    let out_buf_size = config.width_in_tiles * config.height_in_tiles * 1024;
    let out_buf = BufProxy::new(out_buf_size as u64);
    recording.dispatch(
        shaders.fine,
        (config.width_in_tiles, config.height_in_tiles, 1),
        [
            config_buf,
            tile_buf,
            segments_buf,
            ResourceProxy::Buf(out_buf),
            ptcl_buf,
            gradient_image,
        ],
    );
    let download_buf = out_buf;
    recording.download(download_buf);
    (recording, download_buf)
}
/// Rounds `len` up to the next multiple of `alignment`.
///
/// `alignment` must be a nonzero power of two — the mask trick below
/// relies on it (now checked in debug builds). Values that are already
/// aligned, including 0, are returned unchanged.
pub fn align_up(len: usize, alignment: u32) -> usize {
    debug_assert!(
        alignment.is_power_of_two(),
        "alignment must be a power of two"
    );
    let mask = alignment as usize - 1;
    // (-len) mod alignment is the padding needed to reach the next multiple.
    len + (len.wrapping_neg() & mask)
}

View file

@ -25,7 +25,10 @@ use wgpu::Device;
use crate::engine::{BindType, Engine, Error, ShaderId};
// Compute workgroup sizes used when computing dispatch counts.
// NOTE(review): these presumably must match the `@workgroup_size`
// declarations in the corresponding WGSL shaders — confirm when changing.
pub const PATHTAG_REDUCE_WG: u32 = 256;
pub const PATH_BBOX_WG: u32 = 256;
pub const PATH_COARSE_WG: u32 = 256;
pub const PATH_DRAWOBJ_WG: u32 = 256;
pub const CLIP_REDUCE_WG: u32 = 256;
pub struct Shaders {
pub pathtag_reduce: ShaderId,
@ -35,6 +38,24 @@ pub struct Shaders {
pub fine: ShaderId,
}
/// Shader handles for the full rendering pipeline, listed in the order
/// they are dispatched by `render::render_full`.
pub struct FullShaders {
    pub pathtag_reduce: ShaderId,
    pub pathtag_scan: ShaderId,
    pub bbox_clear: ShaderId,
    pub pathseg: ShaderId,
    pub draw_reduce: ShaderId,
    pub draw_leaf: ShaderId,
    pub clip_reduce: ShaderId,
    pub clip_leaf: ShaderId,
    pub binning: ShaderId,
    pub tile_alloc: ShaderId,
    pub path_coarse: ShaderId,
    pub backdrop: ShaderId,
    pub coarse: ShaderId,
    pub fine: ShaderId,
}
pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Error> {
let shader_dir = Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/shader"));
let imports = preprocess::get_imports(shader_dir);
@ -44,12 +65,17 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
let pathtag_reduce = engine.add_shader(
device,
preprocess::preprocess(&read_shader("pathtag_reduce"), &empty, &imports).into(),
&[BindType::BufReadOnly, BindType::Buffer],
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
)?;
let pathtag_scan = engine.add_shader(
device,
preprocess::preprocess(&read_shader("pathtag_scan"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -65,7 +91,6 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
BindType::Buffer,
],
@ -93,3 +118,188 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
fine,
})
}
/// Compiles and registers all shaders of the full pipeline.
///
/// Shaders are preprocessed with the `full` shader-def enabled where the
/// source differs between the test and full pipelines. Each `BindType`
/// slice lists the shader's bindings in binding-index order and must
/// match both the WGSL declarations and the resource arrays passed to
/// `Recording::dispatch` in `render::render_full`.
///
/// # Errors
///
/// Returns an error if the engine fails to create any shader.
pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders, Error> {
    let shader_dir = Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/shader"));
    let imports = preprocess::get_imports(shader_dir);
    let read_shader =
        |path: &str| fs::read_to_string(shader_dir.join(path.to_string() + ".wgsl")).unwrap();
    let empty = HashSet::new();
    // Shader-def set enabling the `full` pipeline variants.
    let mut full_config = HashSet::new();
    full_config.insert("full".into());
    let pathtag_reduce = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("pathtag_reduce"), &full_config, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
        ],
    )?;
    let pathtag_scan = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("pathtag_scan"), &full_config, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
        ],
    )?;
    let bbox_clear = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("bbox_clear"), &empty, &imports).into(),
        &[BindType::BufReadOnly, BindType::Buffer],
    )?;
    let pathseg = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("pathseg"), &full_config, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    let draw_reduce = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("draw_reduce"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
        ],
    )?;
    let draw_leaf = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("draw_leaf"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    let clip_reduce = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("clip_reduce"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    let clip_leaf = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("clip_leaf"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    let binning = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("binning"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    let tile_alloc = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("tile_alloc"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    // Note: uses the `path_coarse_full` source, unlike the test pipeline.
    let path_coarse = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("path_coarse_full"), &full_config, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    let backdrop = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("backdrop_dyn"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
        ],
    )?;
    let coarse = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("coarse"), &empty, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::Buffer,
        ],
    )?;
    // Fine rasterization additionally samples the gradient ramp image.
    let fine = engine.add_shader(
        device,
        preprocess::preprocess(&read_shader("fine"), &full_config, &imports).into(),
        &[
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::BufReadOnly,
            BindType::Buffer,
            BindType::BufReadOnly,
            BindType::ImageRead,
        ],
    )?;
    Ok(FullShaders {
        pathtag_reduce,
        pathtag_scan,
        bbox_clear,
        pathseg,
        draw_reduce,
        draw_leaf,
        clip_reduce,
        clip_leaf,
        binning,
        tile_alloc,
        path_coarse,
        backdrop,
        coarse,
        fine,
    })
}

View file

@ -14,24 +14,54 @@
//
// Also licensed under MIT license, at your choice.
use piet_scene::{Affine, Brush, Color, Fill, PathElement, Point, Scene, SceneBuilder};
use kurbo::BezPath;
use piet_scene::{
Affine, BlendMode, Brush, Color, Compose, ExtendMode, Fill, GradientStop, LinearGradient, Mix,
PathElement, Point, RadialGradient, Rect, Scene, SceneBuilder, SceneFragment, Stroke,
};
use crate::pico_svg::PicoSvg;
pub fn gen_test_scene() -> Scene {
let mut scene = Scene::default();
let mut builder = SceneBuilder::for_scene(&mut scene);
let path = [
PathElement::MoveTo(Point::new(100.0, 100.0)),
PathElement::LineTo(Point::new(500.0, 120.0)),
PathElement::LineTo(Point::new(300.0, 150.0)),
PathElement::LineTo(Point::new(200.0, 260.0)),
PathElement::LineTo(Point::new(150.0, 210.0)),
PathElement::Close,
];
let brush = Brush::Solid(Color::rgb8(0x80, 0x80, 0x80));
builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
let scene_ix = 1;
match scene_ix {
0 => {
let path = [
PathElement::MoveTo(Point::new(100.0, 100.0)),
PathElement::LineTo(Point::new(500.0, 120.0)),
PathElement::LineTo(Point::new(300.0, 150.0)),
PathElement::LineTo(Point::new(200.0, 260.0)),
PathElement::LineTo(Point::new(150.0, 210.0)),
PathElement::Close,
];
let brush = Brush::Solid(Color::rgb8(0x40, 0x40, 0xff));
builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
let transform = Affine::translate(50.0, 50.0);
let brush = Brush::Solid(Color::rgba8(0xff, 0xff, 0x00, 0x80));
builder.fill(Fill::NonZero, transform, &brush, None, &path);
let transform = Affine::translate(100.0, 100.0);
let style = simple_stroke(1.0);
let brush = Brush::Solid(Color::rgb8(0xa0, 0x00, 0x00));
builder.stroke(&style, transform, &brush, None, &path);
}
1 => {
render_blend_grid(&mut builder);
}
_ => {
let xml_str =
std::str::from_utf8(include_bytes!("../../piet-gpu/Ghostscript_Tiger.svg"))
.unwrap();
let svg = PicoSvg::load(xml_str, 6.0).unwrap();
render_svg(&mut builder, &svg, false);
}
}
builder.finish();
scene
}
#[allow(unused)]
pub fn dump_scene_info(scene: &Scene) {
let data = scene.data();
println!("tags {:?}", data.tag_stream);
@ -40,3 +70,212 @@ pub fn dump_scene_info(scene: &Scene) {
bytemuck::cast_slice::<u8, f32>(&data.pathseg_stream)
);
}
pub fn render_svg(sb: &mut SceneBuilder, svg: &PicoSvg, print_stats: bool) {
use crate::pico_svg::*;
let start = std::time::Instant::now();
for item in svg.items.iter() {
match item {
Item::Fill(fill) => {
sb.fill(
Fill::NonZero,
Affine::IDENTITY,
&fill.color.into(),
None,
convert_bez_path(&fill.path),
);
}
Item::Stroke(stroke) => {
sb.stroke(
&simple_stroke(stroke.width as f32),
Affine::IDENTITY,
&stroke.color.into(),
None,
convert_bez_path(&stroke.path),
);
}
}
}
if print_stats {
println!("flattening and encoding time: {:?}", start.elapsed());
}
}
/// Adapts a kurbo path to the cloneable iterator of `PathElement`s the
/// scene builder consumes, converting each element lazily.
fn convert_bez_path<'a>(path: &'a BezPath) -> impl Iterator<Item = PathElement> + 'a + Clone {
    path.elements().iter().copied().map(PathElement::from_kurbo)
}
/// Builds a round-capped, round-joined stroke of the given width with no
/// dashing (the `[f32; 0]` type parameter is the empty dash pattern).
// NOTE(review): `scale: true` presumably means the stroke width is
// transformed along with the geometry — confirm against piet-scene.
fn simple_stroke(width: f32) -> Stroke<[f32; 0]> {
    Stroke {
        width,
        join: piet_scene::Join::Round,
        miter_limit: 1.4,
        start_cap: piet_scene::Cap::Round,
        end_cap: piet_scene::Cap::Round,
        dash_pattern: [],
        dash_offset: 0.0,
        scale: true,
    }
}
/// Renders a 4x4 grid of blend-test squares, one per supported `Mix`
/// blend mode, as a visual regression scene for blending.
#[allow(unused)]
pub fn render_blend_grid(sb: &mut SceneBuilder) {
    const BLEND_MODES: &[Mix] = &[
        Mix::Normal,
        Mix::Multiply,
        Mix::Darken,
        Mix::Screen,
        Mix::Lighten,
        Mix::Overlay,
        Mix::ColorDodge,
        Mix::ColorBurn,
        Mix::HardLight,
        Mix::SoftLight,
        Mix::Difference,
        Mix::Exclusion,
        Mix::Hue,
        Mix::Saturation,
        Mix::Color,
        Mix::Luminosity,
    ];
    for (ix, &blend) in BLEND_MODES.iter().enumerate() {
        // Lay the 16 modes out 4-wide, each cell 225 units apart
        // (200-unit squares plus a 25-unit gutter).
        let i = ix % 4;
        let j = ix / 4;
        let transform = Affine::translate(i as f32 * 225., j as f32 * 225.);
        let square = blend_square(blend.into());
        sb.append(&square, Some(transform));
    }
}
/// Renders one 200x200 blend-test square under `transform`: a horizontal
/// grayscale gradient background, three overlapping radial color glows,
/// and three rotated gradient ellipses composited with `blend` inside an
/// isolating layer.
#[allow(unused)]
fn render_blend_square(sb: &mut SceneBuilder, blend: BlendMode, transform: Affine) {
    // Inspired by https://developer.mozilla.org/en-US/docs/Web/CSS/mix-blend-mode
    let rect = Rect::from_origin_size(Point::new(0., 0.), 200., 200.);
    // Background: black-to-white horizontal linear gradient.
    let stops = &[
        GradientStop {
            color: Color::rgb8(0, 0, 0),
            offset: 0.0,
        },
        GradientStop {
            color: Color::rgb8(255, 255, 255),
            offset: 1.0,
        },
    ][..];
    let linear = Brush::LinearGradient(LinearGradient {
        start: Point::new(0.0, 0.0),
        end: Point::new(200.0, 0.0),
        stops: stops.into(),
        extend: ExtendMode::Pad,
    });
    sb.fill(Fill::NonZero, transform, &linear, None, rect.elements());
    // Three radial "glows" fading to transparent at their edges.
    const GRADIENTS: &[(f32, f32, Color)] = &[
        (150., 0., Color::rgb8(255, 240, 64)),
        (175., 100., Color::rgb8(255, 96, 240)),
        (125., 200., Color::rgb8(64, 192, 255)),
    ];
    for (x, y, c) in GRADIENTS {
        // Same hue at the rim, but fully transparent.
        let mut color2 = c.clone();
        color2.a = 0;
        let stops = &[
            GradientStop {
                color: c.clone(),
                offset: 0.0,
            },
            GradientStop {
                color: color2,
                offset: 1.0,
            },
        ][..];
        let rad = Brush::RadialGradient(RadialGradient {
            center0: Point::new(*x, *y),
            center1: Point::new(*x, *y),
            radius0: 0.0,
            radius1: 100.0,
            stops: stops.into(),
            extend: ExtendMode::Pad,
        });
        sb.fill(Fill::NonZero, transform, &rad, None, rect.elements());
    }
    const COLORS: &[Color] = &[
        Color::rgb8(255, 0, 0),
        Color::rgb8(0, 255, 0),
        Color::rgb8(0, 0, 255),
    ];
    // Outer layer isolates the blended ellipses from the background.
    sb.push_layer(Mix::Normal.into(), transform, rect.elements());
    for (i, c) in COLORS.iter().enumerate() {
        let stops = &[
            GradientStop {
                color: Color::rgb8(255, 255, 255),
                offset: 0.0,
            },
            GradientStop {
                color: c.clone(),
                offset: 1.0,
            },
        ][..];
        let linear = Brush::LinearGradient(LinearGradient {
            start: Point::new(0.0, 0.0),
            end: Point::new(0.0, 200.0),
            stops: stops.into(),
            extend: ExtendMode::Pad,
        });
        // Inner layer applies the blend mode under test.
        sb.push_layer(blend, transform, rect.elements());
        // squash the ellipse
        let a = transform
            * Affine::translate(100., 100.)
            * Affine::rotate(std::f32::consts::FRAC_PI_3 * (i * 2 + 1) as f32)
            * Affine::scale(1.0, 0.357)
            * Affine::translate(-100., -100.);
        sb.fill(
            Fill::NonZero,
            a,
            &linear,
            None,
            make_ellipse(100., 100., 90., 90.),
        );
        sb.pop_layer();
    }
    sb.pop_layer();
}
/// Builds a standalone scene fragment containing one blend-test square
/// (rendered at the identity transform), ready for `SceneBuilder::append`.
#[allow(unused)]
fn blend_square(blend: BlendMode) -> SceneFragment {
    let mut fragment = SceneFragment::default();
    let mut builder = SceneBuilder::for_fragment(&mut fragment);
    render_blend_square(&mut builder, blend, Affine::IDENTITY);
    builder.finish();
    fragment
}
/// Approximates an axis-aligned ellipse centered at `(cx, cy)` with radii
/// `rx`/`ry` as four cubic Bezier quarter-arcs, returned as a cloneable
/// path-element iterator.
fn make_ellipse(cx: f32, cy: f32, rx: f32, ry: f32) -> impl Iterator<Item = PathElement> + Clone {
    // Classic control-point offset for a cubic quarter-circle approximation.
    let kappa = 0.551915024494;
    let kx = kappa * rx;
    let ky = kappa * ry;
    let elements = [
        PathElement::MoveTo(Point::new(cx + rx, cy)),
        PathElement::CurveTo(
            Point::new(cx + rx, cy + ky),
            Point::new(cx + kx, cy + ry),
            Point::new(cx, cy + ry),
        ),
        PathElement::CurveTo(
            Point::new(cx - kx, cy + ry),
            Point::new(cx - rx, cy + ky),
            Point::new(cx - rx, cy),
        ),
        PathElement::CurveTo(
            Point::new(cx - rx, cy - ky),
            Point::new(cx - kx, cy - ry),
            Point::new(cx, cy - ry),
        ),
        PathElement::CurveTo(
            Point::new(cx + kx, cy - ry),
            Point::new(cx + rx, cy - ky),
            Point::new(cx + rx, cy),
        ),
        PathElement::Close,
    ];
    // Yield by index so the returned iterator is `Clone` without relying
    // on array-by-value `IntoIterator` semantics.
    (0..elements.len()).map(move |i| elements[i])
}