From b6da6d958ba90dcb0847e93d2d13b4e394f0df96 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Wed, 26 Oct 2022 13:55:45 -0700
Subject: [PATCH 01/17] Write more shaders

This is super WIP, but represents partially written shaders for more of
the piet-gpu pipeline. Checkpointing as other work is incoming.
---
 .vscode/settings.json                |   2 +
 piet-wgsl/shader/binning.wgsl        | 173 +++++++++++++++++++++++++++
 piet-wgsl/shader/draw_leaf.wgsl      | 145 ++++++++++++++++++++++
 piet-wgsl/shader/draw_reduce.wgsl    |  54 +++++++++
 piet-wgsl/shader/shared/bbox.wgsl    |  30 +++++
 piet-wgsl/shader/shared/config.wgsl  |   7 ++
 piet-wgsl/shader/shared/drawtag.wgsl |  60 ++++++++++
 piet-wgsl/shader/shared/pathtag.wgsl |   6 +-
 8 files changed, 472 insertions(+), 5 deletions(-)
 create mode 100644 piet-wgsl/shader/binning.wgsl
 create mode 100644 piet-wgsl/shader/draw_leaf.wgsl
 create mode 100644 piet-wgsl/shader/draw_reduce.wgsl
 create mode 100644 piet-wgsl/shader/shared/bbox.wgsl
 create mode 100644 piet-wgsl/shader/shared/drawtag.wgsl
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 323cd81..3a9bfe5 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,8 @@
 {
   "wgsl-analyzer.customImports": {
+    "bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
     "config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
+    "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
     "segment": "${workspaceFolder}/piet-wgsl/shader/shared/segment.wgsl",
     "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl"
   },
diff --git a/piet-wgsl/shader/binning.wgsl b/piet-wgsl/shader/binning.wgsl
new file mode 100644
index 0000000..1282164
--- /dev/null
+++ b/piet-wgsl/shader/binning.wgsl
@@ -0,0 +1,173 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// The binning stage
+
+#import config
+#import drawtag
+#import bbox
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> draw_monoids: array<DrawMonoid>;
+
+
+@group(0) @binding(2)
+var<storage> path_bbox_buf: array<PathBBox>;
+
+@group(0) @binding(3)
+var<storage> clip_bbox_buf: array<vec4<f32>>;
+
+@group(0) @binding(4)
+var<storage, read_write> intersected_bbox: array<vec4<f32>>;
+
+// TODO: put into shared include
+// TODO: robust memory (failure flags)
+struct BumpAllocators {
+    binning: atomic<u32>,
+}
+
+@group(0) @binding(5)
+var<storate, read_write> bump: BumpAllocators;
+
+@group(0) @binding(6)
+var<storage, read_write> bin_data: array<u32>;
+
+struct BinHeader {
+    element_count: u32,
+    chunk_offset: u32,
+}
+
+@group(0) @binding(7)
+var<storage, read_write> bin_header: array<BinHeader>;
+
+// These should probably be in a common block.
+let TILE_WIDTH = 16u;
+let TILE_HEIGHT = 16u;
+// Number of tiles per bin
+let N_TILE_X = 16u;
+let N_TILE_Y = 16u;
+let N_TILE = N_TILE_X * N_TILE_Y;
+
+// conversion factors from coordinates to bin
+let SX = 1.0 / f32(N_TILE_X * TILE_WIDTH);
+let SY = 1.0 / f32(N_TILE_Y * TILE_HEIGHT);
+
+let WG_SIZE = 256u;
+let N_SLICE = WG_SIZE / 32u;
+
+var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
+var<workgroup> sh_count: array<array<u32, N_TILE>, N_SLICE>;
+var<workgroup> sh_chunk_offset: array<u32, N_TILE>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    for (var i = 0u; i < N_SLICE; i += 1u) {
+        atomicStore(&sh_bitmaps[i][local_id.x], 0u);
+    }
+
+    // Read inputs and determine coverage of bins
+    let element_ix = global_id.x;
+    var x0 = 0;
+    var y0 = 0;
+    var x1 = 0;
+    var y1 = 0;
+    if element_ix < config.n_drawobj {
+        let draw_monoid = draw_monoids[element_ix];
+        var clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
+        if draw_monoid.clip_ix > 0u {
+            clip_bbox = clip_bbox_buf[draw_monoid.clip_ix - 1u];
+        }
+        // For clip elements, clip_box is the bbox of the clip path,
+        // intersected with enclosing clips.
+        // For other elements, it is the bbox of the enclosing clips.
+        // TODO check this is true
+
+        let path_bbox = path_bbox_buf[draw_monoid.path_ix];
+        let pb = vec4(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
+        let bbox = bbox_intersect(clip_bbox, pb);
+
+        bbox.zw = max(bbox.xy, bbox.zw);
+        intersected_bbox[element_ix] = bbox;
+        x0 = i32(floor(bbox.x * SX));
+        y0 = i32(floor(bbox.y * SY));
+        x1 = i32(ceil(bbox.z * SX));
+        y1 = i32(ceil(bbox.w * SY));
+    }
+    let width_in_bins = i32((config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X);
+    let height_in_bins = i32((config.height_in_tiles + N_TILE_Y - 1u) / N_TILE_Y);
+    x0 = clamp(x0, 0, width_in_bins);
+    y0 = clamp(y0, 0, height_in_bins);
+    x1 = clamp(x1, 0, width_in_bins);
+    y1 = clamp(y1, 0, height_in_bins);
+    if x0 == x1 {
+        y1 = y0;
+    }
+    var x = x0;
+    var y = y0;
+    let my_slice = local_id.x / 32u;
+    let my_mask = 1u << (local_id.x & 31u);
+    while y < y1 {
+        atomicOr(&sh_bitmaps[my_slice][y * width_in_bins + x], my_mask);
+        x += 1u;
+        if x == x1 {
+            x = x0;
+            y += 1;
+        }
+    }
+
+    workgroupBarrier();
+    // Allocate output segments
+    var element_count = 0u;
+    for (var i = 0u; i < N_SLICE; i += 1u) {
+        elementCount += countOneBits(atomicLoad(&sh_bitmaps[i][local_id.x]));
+        sh_count[i][id_ix] = element_count;
+    }
+    // element_count is the number of draw objects covering this thread's bin
+    let chunk_offset = atomicAdd(&bump.binning, element_count);
+    sh_chunk_offset[local_id.x] = chunk_offset;
+    bin_header[global_id.x].element_count = element_count;
+    bin_header[global_id.x].chunk_offset = chunk_offset;
+    workgroupBarrier();
+
+    // loop over bbox of bins touched by this draw object
+    x = x0;
+    y = y0;
+    while y < y1 {
+        let bin_ix = y * width_in_bins + x;
+        let out_mask = atomicLoad(&sh_bitmaps[my_slice][bin_ix]);
+        // I think this predicate will always be true...
+        if (out_mask & my_mask) != 0 {
+            var idx = countOneBits(out_mask & (my_mask - 1u));
+            if my_slice > 0 {
+                idx += sh_count[my_slice - 1u][bin_ix];
+            }
+            let offset = sh_chunk_offset[bin_ix];
+            bin_data[offset + idx] = element_ix;
+        }
+        x += 1u;
+        if x == x1 {
+            x = x0;
+            y += 1u;
+        }
+    }
+}
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
new file mode 100644
index 0000000..2169dd4
--- /dev/null
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -0,0 +1,145 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Finish prefix sum of drawtags, decode draw objects.
+
+#import config
+#import drawtag
+#import bbox
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
+var<storage> reduced: array<DrawMonoid>;
+
+@group(0) @binding(3)
+var<storage, read_write> draw_monoid: array<DrawMonoid>;
+
+@group(0) @binding(4)
+var<storage> path_bbox: array<PathBbox>;
+
+@group(0) @binding(5)
+var<storage, read_write> info: array<u32>;
+
+let WG_SIZE = 256;
+
+var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    let ix = global_id.x;
+    let tag_word = scene[config.drawtag_base + ix];
+    let agg = map_draw_tag(tag_word);
+    sh_scratch[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_scratch[local_id.x + (1u << i)];
+            agg = combine_draw_monoid(agg, other);
+        }
+        workgroupBarrier();
+        sh_scratch[local_id.x] = agg;
+    }
+    workgroupBarrier();
+    var m = draw_monoid_identity();
+    if wg_id.x > 0u {
+        m = parent[wg_id.x - 1u];
+    }
+    if local_id.x > 0u {
+        m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
+    }
+    // m now contains exclusive prefix sum of draw monoid
+    draw_monoid[ix] = m;
+    let dd = config.drawdata_base + m.scene_offset;
+    let di = m.info_offset;
+    if tag_word == DRAWTAG_FILL_COLOR || tag_word == DRAWTAG_FILL_LIN_GRADIENT ||
+        tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE ||
+        tag_word == DRAWTAG_BEGIN_CLIP
+    {
+        let bbox = path_bbox[m.path_ix];
+        let x0 = f32(bbox.x0) - 32768.0;
+        let y0 = f32(bbox.y0) - 32768.0;
+        let x1 = f32(bbox.x1) - 32768.0;
+        let y1 = f32(bbox.y1) - 32768.0;
+        let bbox_f = vec4(x0, y0, x1, y1);
+        let fill_mode = u32(bbox.linewidth >= 0.0);
+        var mat: vec4<f32>;
+        var translate: vec2<f32>;
+        var linewidth = bbox.linewidth;
+        if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT {
+            // TODO: retrieve transform from scene. Packed?
+        }
+        if linewidth >= 0.0 {
+            // Note: doesn't deal with anisotropic case
+            linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z)); 
+        }
+        switch tag_word {
+            case DRAWTAG_FILL_COLOR, DRAWTAG_FILL_IMAGE: {
+                info[di] = bitcast<u32>(linewidth);
+            }
+            case DRAWTAG_FILL_LIN_GRADIENT: {
+                info[di] = bitcast<u32>(linewidth);
+                var p0 = bitcast<vec2<f32>>(vec2(scene[dd + 1u], scene[dd + 2u]));
+                var p1 = bitcast<vec2<f32>>(vec2(scene[dd + 3u], scene[dd + 4u]));
+                p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
+                p1 = mat.xy * p1.x + mat.zw * p1.y + translate;
+                let dxy = p1 - p0;
+                let scale = 1.0 / dot(dxy, dxy);
+                let line_xy = dxy * scale;
+                let line_c = -dot(p0, line_xy);
+                info[di + 1u] = bitcast<u32>(line_xy.x);
+                info[di + 2u] = bitcast<u32>(line_xy.y);
+                info[di + 3u] = bitcast<u32>(line_c);
+            }
+            case DRAWTAG_FILL_RAD_GRADIENT: {
+                info[di] = bitcast<u32>(linewidth);
+                var p0 = bitcast<vec2<f32>>(vec2(scene[dd + 1u], scene[dd + 2u]));
+                var p1 = bitcast<vec2<f32>>(vec2(scene[dd + 3u], scene[dd + 4u]));
+                let r0 = bitcast<f32>(scene[dd + 5u]);
+                let r1 = bitcast<f32>(scene[dd + 6u]);
+                let inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
+                let inv_mat = inv_det * vec4(mat.w, -mat.y, -mat.z, mat.x);
+                var inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
+                inv_tr += p0;
+                let center1 = p1 - p0;
+                let rr = r1 / (r1 - r0);
+                let ra_inv = rr / (r1 * r1 - dot(center1, center1));
+                let c1 = center1 * ra_inv;
+                let ra = rr * ra_inv;
+                let roff = rr - 1.0;
+                info[di + 1u] = bitcast<u32>(inv_mat.x);
+                info[di + 2u] = bitcast<u32>(inv_mat.y);
+                info[di + 3u] = bitcast<u32>(inv_mat.z);
+                info[di + 4u] = bitcast<u32>(inv_mat.w);
+                info[di + 5u] = bitcast<u32>(inv_tr.x);
+                info[di + 6u] = bitcast<u32>(inv_tr.y);
+                info[di + 7u] = bitcast<u32>(c1.x);
+                info[di + 8u] = bitcast<u32>(c1.y);
+                info[di + 9u] = bitcast<u32>(ra);
+                info[di + 10u] = bitcast<u32>(roff);
+            }
+            default: {}
+        }
+    }
+}
\ No newline at end of file
diff --git a/piet-wgsl/shader/draw_reduce.wgsl b/piet-wgsl/shader/draw_reduce.wgsl
new file mode 100644
index 0000000..9efae7c
--- /dev/null
+++ b/piet-wgsl/shader/draw_reduce.wgsl
@@ -0,0 +1,54 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+#import config
+#import drawtag
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
+var<storage, read_write> reduced: array<DrawMonoid>;
+
+let WG_SIZE = 256;
+
+var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    let ix = global_id.x;
+    let tag_word = scene[config.drawtag_base + ix];
+    let agg = map_draw_tag(tag_word);
+    sh_scratch[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_scratch[local_id.x + (1u << i)];
+            agg = combine_draw_monoid(agg, other);
+        }
+        workgroupBarrier();
+        sh_scratch[local_id.x] = agg;
+    }
+    if local_id.x == 0u {
+        reduced[ix >> LG_WG_SIZE] = agg;
+    }
+}
diff --git a/piet-wgsl/shader/shared/bbox.wgsl b/piet-wgsl/shader/shared/bbox.wgsl
new file mode 100644
index 0000000..c0eeb1c
--- /dev/null
+++ b/piet-wgsl/shader/shared/bbox.wgsl
@@ -0,0 +1,30 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// The annotated bounding box for a path. It has been transformed,
+// but contains a link to the active transform, mostly for gradients.
+struct PathBbox {
+    x0: u32,
+    y0: u32,
+    x1: u32,
+    y1: u32,
+    linewidth: f32,
+    trans_ix: u32,
+}
+
+fn bbox_intersect(a: vec4<f32>, b: vec4<f32>) -> f32 {
+    return vec4(max(a.xy, b.xy), min(a.zyw, b.zw));
+}
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index 704a608..cdb0abc 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -17,4 +17,11 @@
 struct Config {
     width_in_tiles: u32,
     height_in_tiles: u32,
+
+    n_drawobj: u32,
+
+    // offsets within config file (in u32 units)
+    // Note: this is a difference from piet-gpu, which is in bytes
+    drawtag_base: u32,
+    drawdata_base: u32,
 }
diff --git a/piet-wgsl/shader/shared/drawtag.wgsl b/piet-wgsl/shader/shared/drawtag.wgsl
new file mode 100644
index 0000000..c318e53
--- /dev/null
+++ b/piet-wgsl/shader/shared/drawtag.wgsl
@@ -0,0 +1,60 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// The DrawMonoid is computed as a prefix sum to aid in decoding
+// the variable-length encoding of draw objects.
+struct DrawMonoid {
+    // The number of paths preceding this draw object.
+    path_ix: u32,
+    // The number of clip operations preceding this draw object.
+    clip_ix: u32,
+    // The offset of the encoded draw object in the scene (u32s).
+    scene_offset: u32,
+    // The offset of the associated info.
+    info_offset: u32,
+}
+
+// Each draw object has a 32-bit draw tag, which is a bit-packed
+// version of the draw monoid.
+let DRAWTAG_NOP = 0u;
+let DRAWTAG_FILL_COLOR = 0x44u;
+let DRAWTAG_FILL_LIN_GRADIENT = 0x114;
+let DRAWTAG_FILL_RAD_GRADIENT = 0x2dc;
+let DRAWTAG_FILL_IMAGE = 0x48;
+let DRAWTAG_BEGIN_CLIP = 0x05;
+let DRAWTAG_END_CLIP = 0x25;
+
+fn draw_monoid_identity() -> DrawMonoid {
+    return DrawMonoid();
+}
+
+fn combine_draw_monoid(a: DrawMonoid, b: DrawMonoid) {
+    var c: DrawMonoid;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    c.scene_offset = a.scene_offset + b.scene_offset;
+    c.info_offset = a.info_offset + b.info_offset;
+}
+
+fn map_draw_tag(tag_word: u32) -> DawMonoid {
+    let has_path = ;
+    var c: DrawMonoid;
+    c.path_ix = u32(tag_word != DRAWTAG_NOP);
+    c.clip_ix = tag_word & 1u;
+    c.scene_offset = (tag_word >> 2u) & 0x07u;
+    c.info_offset = (tag_word >> 6u) & 0x0fu;
+    return c;
+}
diff --git a/piet-wgsl/shader/shared/pathtag.wgsl b/piet-wgsl/shader/shared/pathtag.wgsl
index e4cfda3..2424d2a 100644
--- a/piet-wgsl/shader/shared/pathtag.wgsl
+++ b/piet-wgsl/shader/shared/pathtag.wgsl
@@ -30,11 +30,7 @@ let PATH_TAG_PATH = 0x10u;
 let PATH_TAG_TRANSFORM = 0x20u;
 
 fn tag_monoid_identity() -> TagMonoid {
-    var c: TagMonoid;
-    c.trans_ix = 0u;
-    c.pathseg_ix = 0u;
-    c.pathseg_offset = 0u;
-    return c;
+    return TagMonoid();
 }
 
 fn combine_tag_monoid(a: TagMonoid, b: TagMonoid) -> TagMonoid {

From 06ec395b6889271df410f8d441ec214e8d8bc8a3 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Fri, 28 Oct 2022 12:01:15 -0700
Subject: [PATCH 02/17] Checkpoint coarse rasterization

The bones of coarse rasterization are in place (so far, fills only). Still not suitable for end-to-end (need to generate bounding boxes, among other things), but getting closer.
---
 .vscode/settings.json                |   4 +-
 piet-wgsl/shader/binning.wgsl        |  15 +-
 piet-wgsl/shader/coarse.wgsl         | 327 +++++++++++++++++++++++++++
 piet-wgsl/shader/draw_leaf.wgsl      |   4 +-
 piet-wgsl/shader/shared/bbox.wgsl    |   4 +-
 piet-wgsl/shader/shared/bump.wgsl    |  21 ++
 piet-wgsl/shader/shared/config.wgsl  |  22 +-
 piet-wgsl/shader/shared/drawtag.wgsl |  13 +-
 piet-wgsl/shader/shared/ptcl.wgsl    |  38 ++++
 9 files changed, 422 insertions(+), 26 deletions(-)
 create mode 100644 piet-wgsl/shader/coarse.wgsl
 create mode 100644 piet-wgsl/shader/shared/bump.wgsl
 create mode 100644 piet-wgsl/shader/shared/ptcl.wgsl

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 3a9bfe5..43d84c0 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,10 +1,12 @@
 {
   "wgsl-analyzer.customImports": {
     "bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
+    "bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl",
     "config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
     "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
     "segment": "${workspaceFolder}/piet-wgsl/shader/shared/segment.wgsl",
-    "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl"
+    "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl",
+    "ptcl": "${workspaceFolder}/piet-wgsl/shader/shared/ptcl.wgsl"
   },
   "wgsl-analyzer.diagnostics.nagaVersion": "main"
 }
diff --git a/piet-wgsl/shader/binning.wgsl b/piet-wgsl/shader/binning.wgsl
index 1282164..d16a620 100644
--- a/piet-wgsl/shader/binning.wgsl
+++ b/piet-wgsl/shader/binning.wgsl
@@ -19,6 +19,7 @@
 #import config
 #import drawtag
 #import bbox
+#import bump
 
 @group(0) @binding(0)
 var<storage> config: Config;
@@ -26,7 +27,6 @@ var<storage> config: Config;
 @group(0) @binding(1)
 var<storage> draw_monoids: array<DrawMonoid>;
 
-
 @group(0) @binding(2)
 var<storage> path_bbox_buf: array<PathBBox>;
 
@@ -37,10 +37,6 @@ var<storage> clip_bbox_buf: array<vec4<f32>>;
 var<storage, read_write> intersected_bbox: array<vec4<f32>>;
 
 // TODO: put into shared include
-// TODO: robust memory (failure flags)
-struct BumpAllocators {
-    binning: atomic<u32>,
-}
 
 @group(0) @binding(5)
 var<storate, read_write> bump: BumpAllocators;
@@ -48,6 +44,7 @@ var<storate, read_write> bump: BumpAllocators;
 @group(0) @binding(6)
 var<storage, read_write> bin_data: array<u32>;
 
+// TODO: put in common place
 struct BinHeader {
     element_count: u32,
     chunk_offset: u32,
@@ -56,14 +53,6 @@ struct BinHeader {
 @group(0) @binding(7)
 var<storage, read_write> bin_header: array<BinHeader>;
 
-// These should probably be in a common block.
-let TILE_WIDTH = 16u;
-let TILE_HEIGHT = 16u;
-// Number of tiles per bin
-let N_TILE_X = 16u;
-let N_TILE_Y = 16u;
-let N_TILE = N_TILE_X * N_TILE_Y;
-
 // conversion factors from coordinates to bin
 let SX = 1.0 / f32(N_TILE_X * TILE_WIDTH);
 let SY = 1.0 / f32(N_TILE_Y * TILE_HEIGHT);
diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
new file mode 100644
index 0000000..159a3d7
--- /dev/null
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -0,0 +1,327 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// The coarse rasterization stage.
+
+#import config
+#import bump
+#import drawtag
+#import ptcl
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
+var<storage> draw_monoids: array<DrawMonoid>;
+
+// TODO: dedup
+struct BinHeader {
+    element_count: u32,
+    chunk_offset: u32,
+}
+
+@group(0) @binding(3)
+var<storage> bin_headers: array<BinHeader>;
+
+@group(0) @binding(4)
+var<storage, read_write> bin_data: array<u32>;
+
+@group(0) @binding(5)
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(6)
+var<storage, read_write> ptcl: array<u32>;
+
+// TODO: put this in the right place
+struct Path {
+    // bounding box in pixels
+    bbox: vec4<u32>,
+    // offset (in u32's) to tile rectangle
+    tiles: u32,
+}
+
+struct Tile {
+    backdrop: i32,
+    segments: u32,
+}
+
+@group(0) @binding(7)
+var<storage> paths: array<Path>;
+
+@group(0) @binding(8)
+var<storage> tiles: array<Tile>;
+
+
+// Much of this code assumes WG_SIZE == N_TILE. If these diverge, then
+// a fair amount of fixup is needed.
+let WG_SIZE = 256u;
+let N_SLICE = WG_SIZE / 32u;
+
+var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
+var<workgroup> sh_part_count: array<u32, WG_SIZE>;
+var<workgroup> sh_part_offsets: array<u32, WG_SIZE>;
+var<workgroup> sh_drawobj_ix: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_stride: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_width: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_x0: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_y0: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_base: array<u32, WG_SIZE>;
+
+// helper functions for writing ptcl
+
+var<private> cmd_offset: u32;
+var<private> cmd_limit: u32;
+
+// Make sure there is space for a command of given size, plus a jump if needed
+fn alloc_cmd(size: u32) {
+    if cmd_offset + size >= cmd_limit {
+        let new_cmd = atomicAdd(&bump.ptcl, PTCL_INCREMENT);
+        // TODO: robust memory
+        ptcl[cmd_offset] = CMD_JUMP;
+        ptcl[cmd_offset + 1u] = new_cmd;
+        cmd_offset = new_cmd;
+        cmd_limit = cmd_offset + (PTCL_INCREMENT - PTCL_HEADROOM);
+    }
+}
+
+fn write_path(tile: Tile, linewidth: f32) {
+    // TODO: take flags
+    // TODO: handle stroke
+    alloc_cmd(3u);
+    if tile.segments != 0u {
+        let fill = CmdFill(tile.segments, tile.backdrop);
+        ptcl[cmd_offset] = CMD_FILL;
+        ptcl[cmd_offset + 1u] = fill.tile;
+        ptcl[cmd_offset + 2u] = u32(fill.backdrop);
+        cmd_offset += 3u;
+    } else {
+        ptcl[cmd_offset] = CMD_SOLID;
+        cmd_offset += 1u;
+    }
+}
+
+fn write_color(color: CmdColor) {
+    alloc_cmd(2u);
+    ptcl[cmd_offset] = CMD_FILL;
+    ptcl[cmd_offset + 1u] = color.rgba_color;
+    cmd_offset += 2u;
+
+}
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
+    let bin_ix = width_in_bins * wg_id.y + wg_id.x;
+    let n_partitions = (config.n_drawobj + N_TILE - 1u) / N_TILE;
+
+    // Coordinates of the top left of this bin, in tiles.
+    let bin_tile_x = N_TILE_X * wg_id.x;
+    let bin_tile_y = N_TILE_Y * wg_id.y;
+
+    let tile_x = local_id.x % N_TILE_X;
+    let tile_y = local_id.y % N_TILE_Y;
+    let this_tile_ix = (bin_tile_y + tile_y) * config.width_in_tiles + bin_tile_x + tile_x;
+    cmd_offset = this_tile_ix * PTCL_INITIAL_ALLOC;
+    cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM);
+    // TODO: clip state
+    let clip_zero_depth = 0u;
+
+    var partition_ix = 0u;
+    var rd_ix = 0u;
+    var wr_ix = 0u;
+    var part_start_ix = 0u;
+    var ready_ix = 0u;
+    // TODO: blend state
+    
+    while true {
+        for (var i = 0u; i < N_SLICE; i += 1u) {
+            atomicStore(&sh_bitmaps[i][local_id.x], 0u);
+        }
+
+        while true {
+            if ready_ix == wr_ix && partition_ix < n_partitions {
+                part_start_ix = ready_ix;
+                var count = 0u;
+                if partition_ix + local_id.x < n_partitions {
+                    let in_ix = (partition_ix + local_id.x) * N_TILE + bin_ix;
+                    let bin_header = bin_headers[in_ix];
+                    count = bin_header.element_count;
+                    sh_part_offsets[local_id.x] = bin_header.chunk_offset;
+                }
+                // prefix sum the element counts
+                for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+                    sh_part_count[local_id.x] = count;
+                    workgroupBarrier();
+                    if local_id.x >= (1u << i) {
+                        count += sh_part_count[local_id - (1u << i)];
+                    }
+                    workgroupBarrier();
+                }
+                sh_part_count[local_id.x] = part_start_ix + count;
+                workgroupBarrier();
+                ready_ix = sh_part_count[WG_SIZE - 1u];
+                partition_ix += WG_SIZE;
+            }
+            // use binary search to find draw object to read
+            var ix = rd_ix + local_id.x;
+            if ix >= wr_ix && ix < ready_ix {
+                var part_ix = 0u;
+                for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+                    let probe = part_ix + ((N_TILE / 2u) >> i);
+                    if ix >= sh_part_count[probe - 1u] {
+                        part_ix = probe;
+                    }
+                }
+                ix -= select(part_start_ix, sh_part_count[part_ix - 1u], part_ix > 0u);
+                let offset = sh_part_offsets[part_ix];
+                sh_drawobj_ix[local_id.x] = bin_data[offset + ix];
+            }
+            wr_ix = min(rd_ix + N_TILE, ready_ix);
+            if wr_ix - rd_ix >= N_TILE || (wr_ix >= ready_ix && partition_ix >= n_partitions) {
+                break;
+            }
+        }
+        // At this point, sh_drawobj_ix[0.. wr_ix - rd_ix] contains merged binning results.
+        var tag = DRAWTAG_NOP;
+        var drawobj_ix: u32;
+        if local_id.x + rd_ix < wr_ix {
+            drawobj_ix = sh_drawobj_ix[local_id.x];
+            tag = scene[config.drawtag_base + drawobj_ix];
+        }
+
+        var tile_count = 0u;
+        // I think this predicate is the same as the last, maybe they can be combined
+        if tag != DRAWTAG_NOP {
+            let path_ix = draw_monoids[drawobj_ix].path_ix;
+            let path = paths[path_ix];
+            let stride = path.bbox.z - path.bbox.x;
+            sh_tile_stride[local_id.x] = stride;
+            let dx = i32(path.bbox.x) - i32(bin_tile_x);
+            let dy = i32(path.bbox.y) - i32(bin_tile_y);
+            let x0 = clamp(dx, 0, i32(N_TILE_X));
+            let y0 = clamp(dy, 0, i32(N_TILE_Y));
+            let x1 = clamp(i32(path.bbox.z) - i32(bin_tile_x), 0, i32(N_TILE_X));
+            let y1 = clamp(i32(path.bbox.w) - i32(bin_tile_y), 0, i32(N_TILE_Y));
+            sh_tile_width[local_id.x] = u32(x1 - x0);
+            sh_tile_x0[local_id.x] = u32(x0);
+            sh_tile_y0[local_id.x] = u32(y0);
+            tile_count = u32(x1 - x0) * u32(y1 - y0);
+            // base relative to bin
+            let base = path.tiles - u32(dy * i32(stride) + dx);
+            sh_tile_base[local_id.x] = base;
+            // TODO: there's a write_tile_alloc here in the source, not sure what it's supposed to do
+        }
+
+        // Prefix sum of tile counts
+        sh_tile_count[local_id.x] = tile_count;
+        for (var i = 0; i < firstTrailingBit(N_TILE); i += 1u) {
+            workgroupBarrier();
+            if local_id.x >= (1u << i) {
+                tile_count += sh_tile_count[local_id.x - (1u << i)];
+            }
+            workgroupBarrier();
+            sh_tile_count[local_id.x] = tile_count;
+        }
+        workgroupBarrier();
+        let total_tile_count = sh_tile_count[N_TILE - 1u];
+        // Parallel iteration over all tiles
+        for (var ix = local_id.x; ix < total_tile_count; ix += N_TILE) {
+            // Binary search to find draw object which contains this tile
+            var el_ix = 0u;
+            for (var i = 0u; i < firstTrailingBit(N_TILE); i += 1u) {
+                let probe = el_ix + ((N_TILE / 2u) >> i);
+                if ix >= sh_tile_count[probe - 1u] {
+                    el_ix = probe;
+                }
+            }
+            drawobj_ix = sh_drawobj_ix[el_ix];
+            tag = scene[config.drawtag_base + drawobj_ix];
+            // TODO: clip logic
+            let seq_ix = ix - select(0u, sh_tile_count[el_ix - 1u], el_ix > 0u);
+            let width = sh_tile_width[el_ix];
+            let x = sh_tile_x0[el_ix] + seq_ix % width;
+            let y = sh_tile_y0[el_ix] + seq_ix / width;
+            let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * y + x;
+            let tile = tiles[tile_ix];
+            // TODO: this predicate becomes more interesting with clip
+            let include_tile = tile.segments != 0u || tile.backdrop != 0;
+            if include_tile {
+                let el_slice = el_ix / 32u;
+                let el_mask = 1u << (el_ix & 31u);
+                atomicOr(&sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
+            }
+        }
+        workgroupBarrier();
+        // At this point bit drawobj % 32 is set in sh_bitmaps[drawobj / 32][y * N_TILE_X + x]
+        // if drawobj touches tile (x, y).
+
+        // Write per-tile command list for this tile
+        var slice_ix = 0u;
+        var bitmap = atomicLoad(&sh_bitmaps[0u][local_id.x]);
+        while true {
+            if bitmap == 0u {
+                slice_ix += 1u;
+                // potential optimization: make iteration limit dynamic
+                if slice_ix == N_SLICE {
+                    break;
+                }
+                bitmap = atomicLoad(&sh_bitmaps[slice_ix][local_id.x]);
+                if bitmap == 0u {
+                    continue;
+                }
+            }
+            let el_ix = slice_ix * 32u + firstTrailingBit(bitmap);
+            drawobj_ix = sh_drawobj_ix[el_ix];
+            // clear LSB of bitmap, using bit magic
+            bitmap &= bitmap - 1u;
+            let drawtag = scene[config.drawtag_base + drawobj_ix];
+            let dm = draw_monoids[drawobj_ix];
+            let dd = config.drawdata_base + dm.scene_offset;
+            if clip_zero_depth == 0u {
+                let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
+                let tile = tiles[tile_ix];
+                switch drawtag {
+                    case DRAWTAG_FILL_COLOR: {
+                        // TODO: get linewidth from draw object
+                        let linewidth = -1.0;
+                        let rgba_color = scene[dd];
+                        write_path(tile, linewidth);
+                        write_color(CmdColor(rgba_color));
+                    }
+                    default: {}
+                }
+            }
+        }
+
+        rd_ix += N_TILE;
+        if rd_ix >= ready_ix && partition_ix >= n_partitions {
+            break;
+        }
+        workgroupBarrier();
+    }
+    if bin_tile_x < config.width_in_tiles && bin_tile_y < config.height_in_tiles {
+        ptcl[cmd_offset] = CMD_END;
+        // TODO: blend stack allocation
+    }
+}
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index 2169dd4..21edcfe 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -38,7 +38,7 @@ var<storage> path_bbox: array<PathBbox>;
 @group(0) @binding(5)
 var<storage, read_write> info: array<u32>;
 
-let WG_SIZE = 256;
+let WG_SIZE = 256u;
 
 var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;
 
@@ -119,7 +119,7 @@ fn main(
                 let r0 = bitcast<f32>(scene[dd + 5u]);
                 let r1 = bitcast<f32>(scene[dd + 6u]);
                 let inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
-                let inv_mat = inv_det * vec4(mat.w, -mat.y, -mat.z, mat.x);
+                let inv_mat = inv_det * vec4<f32>(mat.w, -mat.y, -mat.z, mat.x);
                 var inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
                 inv_tr += p0;
                 let center1 = p1 - p0;
diff --git a/piet-wgsl/shader/shared/bbox.wgsl b/piet-wgsl/shader/shared/bbox.wgsl
index c0eeb1c..fb7728b 100644
--- a/piet-wgsl/shader/shared/bbox.wgsl
+++ b/piet-wgsl/shader/shared/bbox.wgsl
@@ -25,6 +25,6 @@ struct PathBbox {
     trans_ix: u32,
 }
 
-fn bbox_intersect(a: vec4<f32>, b: vec4<f32>) -> f32 {
-    return vec4(max(a.xy, b.xy), min(a.zyw, b.zw));
+fn bbox_intersect(a: vec4<f32>, b: vec4<f32>) -> vec4<f32> {
+    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
 }
diff --git a/piet-wgsl/shader/shared/bump.wgsl b/piet-wgsl/shader/shared/bump.wgsl
new file mode 100644
index 0000000..6d9a225
--- /dev/null
+++ b/piet-wgsl/shader/shared/bump.wgsl
@@ -0,0 +1,21 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// TODO: robust memory (failure flags)
+struct BumpAllocators {
+    binning: atomic<u32>,
+    ptcl: atomic<u32>,
+}
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index cdb0abc..0a91782 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -20,8 +20,28 @@ struct Config {
 
     n_drawobj: u32,
 
-    // offsets within config file (in u32 units)
+    // offsets within scene buffer (in u32 units)
     // Note: this is a difference from piet-gpu, which is in bytes
     drawtag_base: u32,
     drawdata_base: u32,
 }
+
+// Geometry of tiles and bins
+
+
+let TILE_WIDTH = 16u;
+let TILE_HEIGHT = 16u;
+// Number of tiles per bin
+let N_TILE_X = 16u;
+let N_TILE_Y = 16u;
+let N_TILE = N_TILE_X * N_TILE_Y;
+
+// Should ptcl stuff move to a separate import?
+
+// Layout of per-tile command list
+// Initial allocation, in u32's.
+let PTCL_INITIAL_ALLOC = 64u;
+let PTCL_INCREMENT = 256u;
+
+// Amount of space taken by jump
+let PTCL_HEADROOM = 2u;
diff --git a/piet-wgsl/shader/shared/drawtag.wgsl b/piet-wgsl/shader/shared/drawtag.wgsl
index c318e53..25608bd 100644
--- a/piet-wgsl/shader/shared/drawtag.wgsl
+++ b/piet-wgsl/shader/shared/drawtag.wgsl
@@ -31,11 +31,11 @@ struct DrawMonoid {
 // version of the draw monoid.
 let DRAWTAG_NOP = 0u;
 let DRAWTAG_FILL_COLOR = 0x44u;
-let DRAWTAG_FILL_LIN_GRADIENT = 0x114;
-let DRAWTAG_FILL_RAD_GRADIENT = 0x2dc;
-let DRAWTAG_FILL_IMAGE = 0x48;
-let DRAWTAG_BEGIN_CLIP = 0x05;
-let DRAWTAG_END_CLIP = 0x25;
+let DRAWTAG_FILL_LIN_GRADIENT = 0x114u;
+let DRAWTAG_FILL_RAD_GRADIENT = 0x2dcu;
+let DRAWTAG_FILL_IMAGE = 0x48u;
+let DRAWTAG_BEGIN_CLIP = 0x05u;
+let DRAWTAG_END_CLIP = 0x25u;
 
 fn draw_monoid_identity() -> DrawMonoid {
     return DrawMonoid();
@@ -49,8 +49,7 @@ fn combine_draw_monoid(a: DrawMonoid, b: DrawMonoid) {
     c.info_offset = a.info_offset + b.info_offset;
 }
 
-fn map_draw_tag(tag_word: u32) -> DawMonoid {
-    let has_path = ;
+fn map_draw_tag(tag_word: u32) -> DrawMonoid {
     var c: DrawMonoid;
     c.path_ix = u32(tag_word != DRAWTAG_NOP);
     c.clip_ix = tag_word & 1u;
diff --git a/piet-wgsl/shader/shared/ptcl.wgsl b/piet-wgsl/shader/shared/ptcl.wgsl
new file mode 100644
index 0000000..516189d
--- /dev/null
+++ b/piet-wgsl/shader/shared/ptcl.wgsl
@@ -0,0 +1,38 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Tags for PTCL commands
+let CMD_END = 0u;
+let CMD_FILL = 1u;
+let CMD_SOLID = 3u;
+let CMD_COLOR = 5u;
+let CMD_JUMP = 11u;
+
+// The individual PTCL structs are written here, but read/write is by
+// hand in the relevant shaders
+
+struct CmdFill {
+    tile: u32,
+    backdrop: i32,
+}
+
+struct CmdJump {
+    target: u32,
+}
+
+struct CmdColor {
+    rgba_color: u32,
+}

From 40416fd2eae7676c77b1fd19af745ff1206dd4dd Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Sun, 30 Oct 2022 08:08:22 -0700
Subject: [PATCH 03/17] Another checkpoint

---
 piet-wgsl/shader/coarse.wgsl         | 11 +++++++----
 piet-wgsl/shader/draw_leaf.wgsl      | 25 ++++++++++++++-----------
 piet-wgsl/shader/path_coarse.wgsl    |  3 ---
 piet-wgsl/shader/shared/config.wgsl  | 14 ++------------
 piet-wgsl/shader/shared/drawtag.wgsl |  3 ++-
 piet-wgsl/shader/shared/ptcl.wgsl    | 10 +++++++++-
 6 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index 159a3d7..4ccb0c2 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -71,7 +71,8 @@ var<storage> tiles: array<Tile>;
 // Much of this code assumes WG_SIZE == N_TILE. If these diverge, then
 // a fair amount of fixup is needed.
 let WG_SIZE = 256u;
-let N_SLICE = WG_SIZE / 32u;
+//let N_SLICE = WG_SIZE / 32u;
+let N_SLICE = 8u;
 
 var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
 var<workgroup> sh_part_count: array<u32, WG_SIZE>;
@@ -174,7 +175,7 @@ fn main(
                     sh_part_count[local_id.x] = count;
                     workgroupBarrier();
                     if local_id.x >= (1u << i) {
-                        count += sh_part_count[local_id - (1u << i)];
+                        count += sh_part_count[local_id.x - (1u << i)];
                     }
                     workgroupBarrier();
                 }
@@ -235,7 +236,7 @@ fn main(
 
         // Prefix sum of tile counts
         sh_tile_count[local_id.x] = tile_count;
-        for (var i = 0; i < firstTrailingBit(N_TILE); i += 1u) {
+        for (var i = 0u; i < firstTrailingBit(N_TILE); i += 1u) {
             workgroupBarrier();
             if local_id.x >= (1u << i) {
                 tile_count += sh_tile_count[local_id.x - (1u << i)];
@@ -298,11 +299,13 @@ fn main(
             let drawtag = scene[config.drawtag_base + drawobj_ix];
             let dm = draw_monoids[drawobj_ix];
             let dd = config.drawdata_base + dm.scene_offset;
+            // TODO: set up draw info from monoid
             if clip_zero_depth == 0u {
                 let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
                 let tile = tiles[tile_ix];
                 switch drawtag {
-                    case DRAWTAG_FILL_COLOR: {
+                    // DRAWTAG_FILL_COLOR
+                    case 0x44u: {
                         // TODO: get linewidth from draw object
                         let linewidth = -1.0;
                         let rgba_color = scene[dd];
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index 21edcfe..b962bf2 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -50,7 +50,7 @@ fn main(
 ) {
     let ix = global_id.x;
     let tag_word = scene[config.drawtag_base + ix];
-    let agg = map_draw_tag(tag_word);
+    var agg = map_draw_tag(tag_word);
     sh_scratch[local_id.x] = agg;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
         workgroupBarrier();
@@ -64,7 +64,7 @@ fn main(
     workgroupBarrier();
     var m = draw_monoid_identity();
     if wg_id.x > 0u {
-        m = parent[wg_id.x - 1u];
+        m = reduced[wg_id.x - 1u];
     }
     if local_id.x > 0u {
         m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
@@ -84,7 +84,7 @@ fn main(
         let y1 = f32(bbox.y1) - 32768.0;
         let bbox_f = vec4(x0, y0, x1, y1);
         let fill_mode = u32(bbox.linewidth >= 0.0);
-        var mat: vec4<f32>;
+        var matrx: vec4<f32>;
         var translate: vec2<f32>;
         var linewidth = bbox.linewidth;
         if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT {
@@ -92,18 +92,20 @@ fn main(
         }
         if linewidth >= 0.0 {
             // Note: doesn't deal with anisotropic case
-            linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z)); 
+            linewidth *= sqrt(abs(matrx.x * matrx.w - matrx.y * matrx.z)); 
         }
         switch tag_word {
-            case DRAWTAG_FILL_COLOR, DRAWTAG_FILL_IMAGE: {
+            // DRAWTAG_FILL_COLOR, DRAWTAG_FILL_IMAGE
+            case 0x44u, 0x48u: {
                 info[di] = bitcast<u32>(linewidth);
             }
-            case DRAWTAG_FILL_LIN_GRADIENT: {
+            // DRAWTAG_FILL_LIN_GRADIENT
+            case 0x114u: {
                 info[di] = bitcast<u32>(linewidth);
                 var p0 = bitcast<vec2<f32>>(vec2(scene[dd + 1u], scene[dd + 2u]));
                 var p1 = bitcast<vec2<f32>>(vec2(scene[dd + 3u], scene[dd + 4u]));
-                p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
-                p1 = mat.xy * p1.x + mat.zw * p1.y + translate;
+                p0 = matrx.xy * p0.x + matrx.zw * p0.y + translate;
+                p1 = matrx.xy * p1.x + matrx.zw * p1.y + translate;
                 let dxy = p1 - p0;
                 let scale = 1.0 / dot(dxy, dxy);
                 let line_xy = dxy * scale;
@@ -112,14 +114,15 @@ fn main(
                 info[di + 2u] = bitcast<u32>(line_xy.y);
                 info[di + 3u] = bitcast<u32>(line_c);
             }
-            case DRAWTAG_FILL_RAD_GRADIENT: {
+            // DRAWTAG_FILL_RAD_GRADIENT
+            case 0x2dcu: {
                 info[di] = bitcast<u32>(linewidth);
                 var p0 = bitcast<vec2<f32>>(vec2(scene[dd + 1u], scene[dd + 2u]));
                 var p1 = bitcast<vec2<f32>>(vec2(scene[dd + 3u], scene[dd + 4u]));
                 let r0 = bitcast<f32>(scene[dd + 5u]);
                 let r1 = bitcast<f32>(scene[dd + 6u]);
-                let inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
-                let inv_mat = inv_det * vec4<f32>(mat.w, -mat.y, -mat.z, mat.x);
+                let inv_det = 1.0 / (matrx.x * matrx.w - matrx.y * matrx.z);
+                let inv_mat = inv_det * vec4<f32>(matrx.w, -matrx.y, -matrx.z, matrx.x);
                 var inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
                 inv_tr += p0;
                 let center1 = p1 - p0;
diff --git a/piet-wgsl/shader/path_coarse.wgsl b/piet-wgsl/shader/path_coarse.wgsl
index 23590d5..0a5139d 100644
--- a/piet-wgsl/shader/path_coarse.wgsl
+++ b/piet-wgsl/shader/path_coarse.wgsl
@@ -64,9 +64,6 @@ fn read_i16_point(ix: u32) -> vec2<f32> {
 }
 
 #ifndef cubics_out
-let TILE_WIDTH = 16u;
-let TILE_HEIGHT = 16u;
-
 struct SubdivResult {
     val: f32,
     a0: f32,
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index 0a91782..b31f915 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -28,20 +28,10 @@ struct Config {
 
 // Geometry of tiles and bins
 
-
 let TILE_WIDTH = 16u;
 let TILE_HEIGHT = 16u;
 // Number of tiles per bin
 let N_TILE_X = 16u;
 let N_TILE_Y = 16u;
-let N_TILE = N_TILE_X * N_TILE_Y;
-
-// Should ptcl stuff move to a separate import?
-
-// Layout of per-tile command list
-// Initial allocation, in u32's.
-let PTCL_INITIAL_ALLOC = 64u;
-let PTCL_INCREMENT = 256u;
-
-// Amount of space taken by jump
-let PTCL_HEADROOM = 2u;
+//let N_TILE = N_TILE_X * N_TILE_Y;
+let N_TILE = 256u;
diff --git a/piet-wgsl/shader/shared/drawtag.wgsl b/piet-wgsl/shader/shared/drawtag.wgsl
index 25608bd..749b211 100644
--- a/piet-wgsl/shader/shared/drawtag.wgsl
+++ b/piet-wgsl/shader/shared/drawtag.wgsl
@@ -41,12 +41,13 @@ fn draw_monoid_identity() -> DrawMonoid {
     return DrawMonoid();
 }
 
-fn combine_draw_monoid(a: DrawMonoid, b: DrawMonoid) {
+fn combine_draw_monoid(a: DrawMonoid, b: DrawMonoid) -> DrawMonoid {
     var c: DrawMonoid;
     c.path_ix = a.path_ix + b.path_ix;
     c.clip_ix = a.clip_ix + b.clip_ix;
     c.scene_offset = a.scene_offset + b.scene_offset;
     c.info_offset = a.info_offset + b.info_offset;
+    return c;
 }
 
 fn map_draw_tag(tag_word: u32) -> DrawMonoid {
diff --git a/piet-wgsl/shader/shared/ptcl.wgsl b/piet-wgsl/shader/shared/ptcl.wgsl
index 516189d..7121f84 100644
--- a/piet-wgsl/shader/shared/ptcl.wgsl
+++ b/piet-wgsl/shader/shared/ptcl.wgsl
@@ -14,6 +14,14 @@
 //
 // Also licensed under MIT license, at your choice.
 
+// Layout of per-tile command list
+// Initial allocation, in u32's.
+let PTCL_INITIAL_ALLOC = 64u;
+let PTCL_INCREMENT = 256u;
+
+// Amount of space taken by jump
+let PTCL_HEADROOM = 2u;
+
 // Tags for PTCL commands
 let CMD_END = 0u;
 let CMD_FILL = 1u;
@@ -30,7 +38,7 @@ struct CmdFill {
 }
 
 struct CmdJump {
-    target: u32,
+    new_ix: u32,
 }
 
 struct CmdColor {

From 5c6ec1efa3f7573d5ed51bddf1c4b8ac431c5242 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Tue, 1 Nov 2022 16:20:15 -0700
Subject: [PATCH 04/17] Checkpoint

Many shader stages written.
---
 piet-wgsl/shader/backdrop_dyn.wgsl |  95 ++++++++++++++++++++++
 piet-wgsl/shader/fine.wgsl         |  33 +++++---
 piet-wgsl/shader/shared/bump.wgsl  |   1 +
 piet-wgsl/shader/tile_alloc.wgsl   | 122 +++++++++++++++++++++++++++++
 piet-wgsl/src/render.rs            |   6 +-
 5 files changed, 247 insertions(+), 10 deletions(-)
 create mode 100644 piet-wgsl/shader/backdrop_dyn.wgsl
 create mode 100644 piet-wgsl/shader/tile_alloc.wgsl

diff --git a/piet-wgsl/shader/backdrop_dyn.wgsl b/piet-wgsl/shader/backdrop_dyn.wgsl
new file mode 100644
index 0000000..c6c7c8d
--- /dev/null
+++ b/piet-wgsl/shader/backdrop_dyn.wgsl
@@ -0,0 +1,95 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Prefix sum for dynamically allocated backdrops
+
+#import config
+
+// TODO: dedup & put this in the right place
+struct Path {
+    // bounding box in pixels
+    bbox: vec4<u32>,
+    // offset (in u32's) to tile rectangle
+    tiles: u32,
+}
+
+// TODO: -> shared
+struct Tile {
+    backdrop: i32,
+    segments: u32,
+}
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> paths: array<Path>;
+
+@group(0) @binding(2)
+var<storage, read_write> tiles: array<Tile>;
+
+let WG_SIZE = 256u;
+
+var<workgroup> sh_row_width: array<u32, WG_SIZE>;
+var<workgroup> sh_row_count: array<u32, WG_SIZE>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    let drawobj_ix = global_id.x;
+    var row_count = 0u;
+    if drawobj_ix < config.n_drawobj {
+        // TODO: when rectangles, path and draw obj are not the same
+        let path = paths[drawobj_ix];
+        sh_row_width[local_id.x] = path.bbox.z - path.bbox.x;
+        row_count = path.bbox.w - path.bbox.y;
+        sh_row_count[local_id.x] = row_count;
+    }
+
+    // Prefix sum of row counts
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x >= (1u << i) {
+            row_count += sh_row_count[local_id.x - (1u << i)];
+        }
+        workgroupBarrier();
+        sh_row_count[local_id.x] = row_count;
+    }
+    workgroupBarrier();
+    let total_rows = sh_row_count[WG_SIZE - 1u];
+    for (var row = local_id.x; row < total_rows; row += WG_SIZE) {
+        var el_ix = 0u;
+        for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+            let probe = el_ix + ((WG_SIZE / 2u) >> i);
+            if row >= sh_row_count[probe - 1u] {
+                el_ix = probe;
+            }
+        }
+        let width = sh_row_width[el_ix];
+        if width > 0u {
+            var seq_ix = row - select(0u, sh_row_count[el_ix - 1u], el_ix > 0u);
+            var tile_ix = seq_ix * width;
+            var sum = tiles[tile_ix].backdrop;
+            for (var x = 1u; x < width; x += 1u) {
+                tile_ix += 1u;
+                sum += tiles[tile_ix].backdrop;
+                tiles[tile_ix].backdrop = sum;
+            }
+        }
+    }
+}
diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index 9d62793..9174a21 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -14,6 +14,9 @@
 //
 // Also licensed under MIT license, at your choice.
 
+// Fine rasterizer. This can run in simple (just path rendering) and full
+// modes, controllable by #define.
+
 // This is a cut'n'paste w/ backdrop.
 struct Tile {
     backdrop: i32,
@@ -36,17 +39,16 @@ var<storage> segments: array<Segment>;
 @group(0) @binding(3)
 var<storage, read_write> output: array<u32>;
 
+#ifdef full
+#import ptcl
+
+@group(0) @binding(4)
+var<storage> ptcl: array<u32>;
+#endif
+
 let PIXELS_PER_THREAD = 4u;
 
-@compute @workgroup_size(4, 16)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-    @builtin(local_invocation_id) local_id: vec3<u32>,
-    @builtin(workgroup_id) wg_id: vec3<u32>,
-) {
-    let tile_ix = wg_id.y * config.width_in_tiles + wg_id.x;
-    let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
-    let tile = tiles[tile_ix];
+fn fill_path(tile: Tile, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
     var area: array<f32, PIXELS_PER_THREAD>;
     let backdrop_f = f32(tile.backdrop);
     for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
@@ -89,6 +91,19 @@ fn main(
     for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
         area[i] = abs(area[i]);
     }
+    return area;
+}
+
+@compute @workgroup_size(4, 16)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    let tile_ix = wg_id.y * config.width_in_tiles + wg_id.x;
+    let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
+    let tile = tiles[tile_ix];
+    let area = fill_path(tile, xy);
 
     let bytes = pack4x8unorm(vec4<f32>(area[0], area[1], area[2], area[3]));
     let out_ix = global_id.y * (config.width_in_tiles * 4u) + global_id.x;
diff --git a/piet-wgsl/shader/shared/bump.wgsl b/piet-wgsl/shader/shared/bump.wgsl
index 6d9a225..7f33ba2 100644
--- a/piet-wgsl/shader/shared/bump.wgsl
+++ b/piet-wgsl/shader/shared/bump.wgsl
@@ -18,4 +18,5 @@
 struct BumpAllocators {
     binning: atomic<u32>,
     ptcl: atomic<u32>,
+    tile: atomic<u32>,
 }
diff --git a/piet-wgsl/shader/tile_alloc.wgsl b/piet-wgsl/shader/tile_alloc.wgsl
new file mode 100644
index 0000000..867db7e
--- /dev/null
+++ b/piet-wgsl/shader/tile_alloc.wgsl
@@ -0,0 +1,122 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Tile allocation (and zeroing of tiles)
+
+#import config
+#import bump
+#import drawtag
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
+var<storage> draw_bboxes: array<vec4<f32>>;
+
+@group(0) @binding(3)
+var<storage, read_write> bump: BumpAllocators;
+
+// TODO: put this in the right place, dedup
+struct Path {
+    // bounding box in pixels
+    bbox: vec4<u32>,
+    // offset (in u32's) to tile rectangle
+    tiles: u32,
+}
+
+struct Tile {
+    backdrop: i32,
+    segments: u32,
+}
+
+@group(0) @binding(4)
+var<storage, read_write> paths: array<Path>;
+
+@group(0) @binding(5)
+var<storage, read_write> tiles: array<Tile>;
+
+let WG_SIZE = 256u;
+
+var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
+var<workgroup> sh_tile_offset: u32;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    // scale factors useful for converting coordinates to tiles
+    // TODO: make into constants
+    let SX = 1.0 / f32(TILE_WIDTH);
+    let SY = 1.0 / f32(TILE_HEIGHT);
+
+    let drawobj_ix = global_id.x;
+    var drawtag = DRAWTAG_NOP;
+    if drawobj_ix < config.n_drawobj {
+        drawtag = scene[config.drawtag_base + drawobj_ix];
+    }
+    var x0 = 0;
+    var y0 = 0;
+    var x1 = 0;
+    var y1 = 0;
+    if drawtag != DRAWTAG_NOP && drawtag != DRAWTAG_END_CLIP {
+        let bbox = draw_bboxes[drawobj_ix];
+        x0 = i32(floor(bbox.x * SX));
+        y0 = i32(floor(bbox.y * SY));
+        x1 = i32(ceil(bbox.z * SX));
+        y1 = i32(ceil(bbox.w * SY));
+    }
+    let ux0 = u32(clamp(x0, 0, i32(config.width_in_tiles)));
+    let uy0 = u32(clamp(y0, 0, i32(config.height_in_tiles)));
+    let ux1 = u32(clamp(x1, 0, i32(config.width_in_tiles)));
+    let uy1 = u32(clamp(y1, 0, i32(config.height_in_tiles)));
+    let tile_count = (ux1 - ux0) * (uy1 - uy0);
+    var total_tile_count = tile_count;
+    sh_tile_count[local_id.x] = tile_count;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x < (1u << i) {
+            total_tile_count += sh_tile_count[local_id.x - (1u << i)];
+        }
+        workgroupBarrier();
+        sh_tile_count[local_id.x] = total_tile_count;
+    }
+    if local_id.x == WG_SIZE - 1u {
+        sh_tile_offset = atomicAdd(&bump.tile, total_tile_count);
+    }
+    workgroupBarrier();
+    let tile_offset = sh_tile_offset;
+    if drawobj_ix < config.n_drawobj {
+        let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
+        let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
+        let path = Path(bbox, tile_offset + tile_subix);
+    }
+
+    // zero allocated memory
+    // Note: if the number of draw objects is small, utilization will be poor.
+    // There are two things that can be done to improve that. One would be a
+    // separate (indirect) dispatch. Another would be to have each workgroup
+    // process fewer draw objects than the number of threads in the wg.
+    let total_count = sh_tile_count[WG_SIZE - 1u];
+    for (var i = local_id.x; i < total_count; i += WG_SIZE) {
+        // Note: could format output buffer as u32 for even better load
+        // balancing, as does piet-gpu.
+        tiles[tile_offset + i] = Tile(0, 0u);
+    }
+}
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 3eec030..7b79236 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -11,10 +11,13 @@ use crate::{
 const TAG_MONOID_SIZE: u64 = 12;
 
 #[repr(C)]
-#[derive(Clone, Copy, Zeroable, Pod)]
+#[derive(Clone, Copy, Default, Zeroable, Pod)]
 struct Config {
     width_in_tiles: u32,
     height_in_tiles: u32,
+    n_drawobj: u32,
+    drawtag_base: u32,
+    drawdata_base: u32,
 }
 
 #[repr(C)]
@@ -60,6 +63,7 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
     let config = Config {
         width_in_tiles: 64,
         height_in_tiles: 64,
+        ..Default::default()
     };
     let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
     // TODO: more principled size calc

From 7ac327c68414abf06ed4c8dbfe6a30fb1ca5d05b Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Wed, 2 Nov 2022 18:07:32 -0700
Subject: [PATCH 05/17] Unify scene buffer

All streams of the scene are combined into a single buffer. This is very much like existing piet-gpu, however the various outputs from the compute stages (whether computed on CPU or GPU) will retain their separate bindings, which is more native to WGSL.

There's a touch of ergonomics loss, in particular when we do transforms we'll need to unmarshal them by hand, but I think overall not too bad.
---
 .vscode/settings.json                |  3 +-
 piet-wgsl/shader/fine.wgsl           | 57 ++++++++++++++++++++++++++++
 piet-wgsl/shader/path_coarse.wgsl    | 27 ++++++-------
 piet-wgsl/shader/pathtag_reduce.wgsl | 10 +++--
 piet-wgsl/shader/pathtag_scan.wgsl   | 10 +++--
 piet-wgsl/shader/shared/config.wgsl  |  3 ++
 piet-wgsl/src/render.rs              | 45 +++++++++++++---------
 piet-wgsl/src/shaders.rs             |  4 +-
 8 files changed, 116 insertions(+), 43 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 43d84c0..4d315c9 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -8,5 +8,6 @@
     "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl",
     "ptcl": "${workspaceFolder}/piet-wgsl/shader/shared/ptcl.wgsl"
   },
-  "wgsl-analyzer.diagnostics.nagaVersion": "main"
+  "wgsl-analyzer.diagnostics.nagaVersion": "main",
+  "wgsl-analyzer.preprocessor.shaderDefs": ["full"]
 }
diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index 9174a21..7b66d04 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -44,6 +44,17 @@ var<storage, read_write> output: array<u32>;
 
 @group(0) @binding(4)
 var<storage> ptcl: array<u32>;
+
+fn read_fill(cmd_ix: u32) -> CmdFill {
+    let tile = ptcl[cmd_ix + 1u];
+    let backdrop = i32(ptcl[cmd_ix + 2u]);
+    return CmdFill(tile, backdrop);
+}
+
+fn read_color(cmd_ix: u32) -> CmdColor {
+    let rgba_color = ptcl[cmd_ix + 1u];
+    return CmdColor(rgba_color);
+}
 #endif
 
 let PIXELS_PER_THREAD = 4u;
@@ -103,7 +114,53 @@ fn main(
     let tile_ix = wg_id.y * config.width_in_tiles + wg_id.x;
     let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
     let tile = tiles[tile_ix];
+#ifdef full
+    var rgba: array<vec4<f32>, PIXELS_PER_THREAD>;
+    var area: array<f32, PIXELS_PER_THREAD>;
+    var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
+
+    // main interpretation loop
+    while true {
+        let tag = ptcl[cmd_ix];
+        if tag == CMD_END {
+            break;
+        }
+        switch tag {
+            // CMD_FILL
+            case 1u: {
+                let fill = read_fill(cmd_ix);
+                let tile = Tile(fill.backdrop, fill.tile);
+                area = fill_path(tile, xy);
+                cmd_ix += 3u;
+            }
+            // CMD_SOLID
+            case 3u: {
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    area[i] = 1.0;
+                }
+                cmd_ix += 1u;
+            }
+            // CMD_COLOR
+            case 5u: {
+                let color = read_color(cmd_ix);
+                let fg = unpack4x8unorm(color.rgba_color);
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    let fg_i = fg * area[i];
+                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
+                }
+                cmd_ix += 1u;
+            }
+            // CMD_JUMP
+            case 11u: {
+                cmd_ix = ptcl[cmd_ix + 1u];
+            }
+            default: {}
+        }
+    }
+
+#else
     let area = fill_path(tile, xy);
+#endif
 
     let bytes = pack4x8unorm(vec4<f32>(area[0], area[1], area[2], area[3]));
     let out_ix = global_id.y * (config.width_in_tiles * 4u) + global_id.x;
diff --git a/piet-wgsl/shader/path_coarse.wgsl b/piet-wgsl/shader/path_coarse.wgsl
index 0a5139d..534963f 100644
--- a/piet-wgsl/shader/path_coarse.wgsl
+++ b/piet-wgsl/shader/path_coarse.wgsl
@@ -14,24 +14,22 @@
 //
 // Also licensed under MIT license, at your choice.
 
+#import config
 #import pathtag
 
 @group(0) @binding(0)
-var<storage> path_tags: array<u32>;
+var<storage> config: Config;
 
 @group(0) @binding(1)
-var<storage> tag_monoids: array<TagMonoid>;
+var<storage> scene: array<u32>;
 
-// TODO: should probably have single "scene" binding.
 @group(0) @binding(2)
-var<storage> path_data: array<u32>;
+var<storage> tag_monoids: array<TagMonoid>;
 
 #ifdef cubics_out
 @group(0) @binding(3)
 var<storage, read_write> output: array<vec2<f32>>;
 #else
-#import config
-
 struct Tile {
     backdrop: atomic<i32>,
     segments: atomic<u32>,
@@ -39,25 +37,23 @@ struct Tile {
 
 #import segment
 
-// Should probably be uniform binding
 @group(0) @binding(3)
-var<storage> config: Config;
-
-@group(0) @binding(4)
 var<storage, read_write> tiles: array<Tile>;
 
-@group(0) @binding(5)
+@group(0) @binding(4)
 var<storage, read_write> segments: array<Segment>;
 #endif
 
+var<private> pathdata_base: u32;
+
 fn read_f32_point(ix: u32) -> vec2<f32> {
-    let x = bitcast<f32>(path_data[ix]);
-    let y = bitcast<f32>(path_data[ix + 1u]);
+    let x = bitcast<f32>(scene[pathdata_base + ix]);
+    let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
     return vec2<f32>(x, y);
 }
 
 fn read_i16_point(ix: u32) -> vec2<f32> {
-    let raw = path_data[ix];
+    let raw = scene[pathdata_base + ix];
     let x = f32(i32(raw << 16u) >> 16u);
     let y = f32(i32(raw) >> 16u);
     return vec2<f32>(x, y);
@@ -133,7 +129,8 @@ fn main(
 ) {
     // Obtain exclusive prefix sum of tag monoid
     let ix = global_id.x;
-    let tag_word = path_tags[ix >> 2u];
+    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
+    pathdata_base = config.pathdata_base;
     let shift = (ix & 3u) * 8u;
     var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
     tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
diff --git a/piet-wgsl/shader/pathtag_reduce.wgsl b/piet-wgsl/shader/pathtag_reduce.wgsl
index 92bf20d..bbf8528 100644
--- a/piet-wgsl/shader/pathtag_reduce.wgsl
+++ b/piet-wgsl/shader/pathtag_reduce.wgsl
@@ -14,14 +14,16 @@
 //
 // Also licensed under MIT license, at your choice.
 
+#import config
 #import pathtag
 
-// Note: should have a single scene binding, path_tags are a slice
-// in that; need a config uniform.
 @group(0) @binding(0)
-var<storage> path_tags: array<u32>;
+var<storage> config: Config;
 
 @group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
 var<storage, read_write> reduced: array<TagMonoid>;
 
 let LG_WG_SIZE = 8u;
@@ -35,7 +37,7 @@ fn main(
     @builtin(local_invocation_id) local_id: vec3<u32>,
 ) {
     let ix = global_id.x;
-    let tag_word = path_tags[ix];
+    let tag_word = scene[config.pathtag_base + ix];
     var agg = reduce_tag(tag_word);
     sh_scratch[local_id.x] = agg;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
diff --git a/piet-wgsl/shader/pathtag_scan.wgsl b/piet-wgsl/shader/pathtag_scan.wgsl
index 058ce04..d18d872 100644
--- a/piet-wgsl/shader/pathtag_scan.wgsl
+++ b/piet-wgsl/shader/pathtag_scan.wgsl
@@ -14,15 +14,19 @@
 //
 // Also licensed under MIT license, at your choice.
 
+#import config
 #import pathtag
 
 @group(0) @binding(0)
-var<storage> path_tags: array<u32>;
+var<storage> config: Config;
 
 @group(0) @binding(1)
-var<storage> reduced: array<TagMonoid>;
+var<storage> scene: array<u32>;
 
 @group(0) @binding(2)
+var<storage> reduced: array<TagMonoid>;
+
+@group(0) @binding(3)
 var<storage, read_write> tag_monoids: array<TagMonoid>;
 
 let LG_WG_SIZE = 8u;
@@ -54,7 +58,7 @@ fn main(
     }
 
     let ix = global_id.x;
-    let tag_word = path_tags[ix];
+    let tag_word = scene[config.pathtag_base + ix];
     agg = reduce_tag(tag_word);
     sh_monoid[local_id.x] = agg;
     for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index b31f915..9058d5c 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -22,6 +22,9 @@ struct Config {
 
     // offsets within scene buffer (in u32 units)
     // Note: this is a difference from piet-gpu, which is in bytes
+    pathtag_base: u32,
+    pathdata_base: u32,
+
     drawtag_base: u32,
     drawdata_base: u32,
 }
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 7b79236..c3c30f1 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -16,6 +16,8 @@ struct Config {
     width_in_tiles: u32,
     height_in_tiles: u32,
     n_drawobj: u32,
+    pathtag_base: u32,
+    pathdata_base: u32,
     drawtag_base: u32,
     drawdata_base: u32,
 }
@@ -29,22 +31,39 @@ pub struct PathSegment {
     next: u32,
 }
 
+fn size_to_words(byte_size: usize) -> u32 {
+    (byte_size / std::mem::size_of::<u32>()) as u32
+}
+
 pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
     let mut recording = Recording::default();
     let data = scene.data();
     let n_pathtag = data.tag_stream.len();
     let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
     let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
-    let mut tag_data: Vec<u8> = Vec::with_capacity(pathtag_padded);
-    tag_data.extend(&data.tag_stream);
-    tag_data.resize(pathtag_padded, 0);
-    let pathtag_buf = recording.upload(tag_data);
+    let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
+    let pathtag_base = size_to_words(scene.len());
+    scene.extend(&data.tag_stream);
+    scene.resize(pathtag_padded, 0);
+    let pathdata_base = size_to_words(scene.len());
+    scene.extend(&data.pathseg_stream);
+
+    let config = Config {
+        width_in_tiles: 64,
+        height_in_tiles: 64,
+        pathtag_base,
+        pathdata_base,
+        ..Default::default()
+    };
+    let scene_buf = recording.upload(scene);
+    let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
+
     let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_SIZE);
     // TODO: really only need pathtag_wgs - 1
     recording.dispatch(
         shaders.pathtag_reduce,
         (pathtag_wgs as u32, 1, 1),
-        [pathtag_buf, reduced_buf],
+        [config_buf, scene_buf, reduced_buf],
     );
 
     let tagmonoid_buf =
@@ -52,20 +71,11 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
     recording.dispatch(
         shaders.pathtag_scan,
         (pathtag_wgs as u32, 1, 1),
-        [pathtag_buf, reduced_buf, tagmonoid_buf],
+        [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
     );
 
     let path_coarse_wgs = (data.n_pathseg + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
-    // The clone here is kinda BS, think about reducing copies
-    // Of course, we'll probably end up concatenating into a single scene binding.
-    let pathdata_buf = recording.upload(data.pathseg_stream.clone());
     //let cubics_buf = BufProxy::new(data.n_pathseg as u64 * 32);
-    let config = Config {
-        width_in_tiles: 64,
-        height_in_tiles: 64,
-        ..Default::default()
-    };
-    let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
     // TODO: more principled size calc
     let tiles_buf = BufProxy::new(4097 * 8);
     let segments_buf = BufProxy::new(256 * 24);
@@ -74,10 +84,9 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
         shaders.path_coarse,
         (path_coarse_wgs, 1, 1),
         [
-            pathtag_buf,
-            tagmonoid_buf,
-            pathdata_buf,
             config_buf,
+            scene_buf,
+            tagmonoid_buf,
             tiles_buf,
             segments_buf,
         ],
diff --git a/piet-wgsl/src/shaders.rs b/piet-wgsl/src/shaders.rs
index 64e022d..d1e8847 100644
--- a/piet-wgsl/src/shaders.rs
+++ b/piet-wgsl/src/shaders.rs
@@ -44,12 +44,13 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
     let pathtag_reduce = engine.add_shader(
         device,
         preprocess::preprocess(&read_shader("pathtag_reduce"), &empty, &imports).into(),
-        &[BindType::BufReadOnly, BindType::Buffer],
+        &[BindType::BufReadOnly, BindType::BufReadOnly, BindType::Buffer],
     )?;
     let pathtag_scan = engine.add_shader(
         device,
         preprocess::preprocess(&read_shader("pathtag_scan"), &empty, &imports).into(),
         &[
+            BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::Buffer,
@@ -65,7 +66,6 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
             BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
-            BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
         ],

From 5851ef141780d7e58494c76a8a7493fd7b0f0875 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Thu, 3 Nov 2022 16:53:34 -0700
Subject: [PATCH 06/17] Shaders loaded

This checkpoint loads the shaders for full rendering, but there's a bunch of stuff still needing to be done.
---
 .vscode/settings.json                  |   3 +-
 piet-wgsl/shader/bbox_clear.wgsl       |  45 ++++
 piet-wgsl/shader/binning.wgsl          |  36 ++--
 piet-wgsl/shader/coarse.wgsl           |  35 ++--
 piet-wgsl/shader/draw_leaf.wgsl        |   6 +-
 piet-wgsl/shader/draw_reduce.wgsl      |   6 +-
 piet-wgsl/shader/path_coarse.wgsl      |   5 +-
 piet-wgsl/shader/path_coarse_full.wgsl | 276 +++++++++++++++++++++++++
 piet-wgsl/shader/pathseg.wgsl          | 215 +++++++++++++++++++
 piet-wgsl/shader/shared/bbox.wgsl      |  10 +-
 piet-wgsl/shader/shared/config.wgsl    |   1 +
 piet-wgsl/shader/shared/pathtag.wgsl   |  18 +-
 piet-wgsl/shader/shared/tile.wgsl      |  29 +++
 piet-wgsl/shader/tile_alloc.wgsl       |  14 +-
 piet-wgsl/src/main.rs                  |   7 +-
 piet-wgsl/src/render.rs                |   1 +
 piet-wgsl/src/shaders.rs               | 166 +++++++++++++++
 17 files changed, 805 insertions(+), 68 deletions(-)
 create mode 100644 piet-wgsl/shader/bbox_clear.wgsl
 create mode 100644 piet-wgsl/shader/path_coarse_full.wgsl
 create mode 100644 piet-wgsl/shader/pathseg.wgsl
 create mode 100644 piet-wgsl/shader/shared/tile.wgsl

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 4d315c9..bdb0028 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -6,7 +6,8 @@
     "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
     "segment": "${workspaceFolder}/piet-wgsl/shader/shared/segment.wgsl",
     "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl",
-    "ptcl": "${workspaceFolder}/piet-wgsl/shader/shared/ptcl.wgsl"
+    "ptcl": "${workspaceFolder}/piet-wgsl/shader/shared/ptcl.wgsl",
+    "tile": "${workspaceFolder}/piet-wgsl/shader/shared/tile.wgsl"
   },
   "wgsl-analyzer.diagnostics.nagaVersion": "main",
   "wgsl-analyzer.preprocessor.shaderDefs": ["full"]
diff --git a/piet-wgsl/shader/bbox_clear.wgsl b/piet-wgsl/shader/bbox_clear.wgsl
new file mode 100644
index 0000000..69a8b63
--- /dev/null
+++ b/piet-wgsl/shader/bbox_clear.wgsl
@@ -0,0 +1,45 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+#import config
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+struct PathBbox {
+    x0: i32,
+    y0: i32,
+    x1: i32,
+    y1: i32,
+    linewidth: f32,
+    trans_ix: u32,
+}
+
+@group(0) @binding(1)
+var<storage, read_write> path_bboxes: array<PathBbox>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+) {
+    let ix = global_id.x;
+    if ix < config.n_path {
+        path_bboxes[ix].x0 = 0x7fffffff;
+        path_bboxes[ix].y0 = 0x7fffffff;
+        path_bboxes[ix].x1 = -0x80000000;
+        path_bboxes[ix].y1 = -0x80000000;
+    }
+}
diff --git a/piet-wgsl/shader/binning.wgsl b/piet-wgsl/shader/binning.wgsl
index d16a620..3d5a053 100644
--- a/piet-wgsl/shader/binning.wgsl
+++ b/piet-wgsl/shader/binning.wgsl
@@ -28,7 +28,7 @@ var<storage> config: Config;
 var<storage> draw_monoids: array<DrawMonoid>;
 
 @group(0) @binding(2)
-var<storage> path_bbox_buf: array<PathBBox>;
+var<storage> path_bbox_buf: array<PathBbox>;
 
 @group(0) @binding(3)
 var<storage> clip_bbox_buf: array<vec4<f32>>;
@@ -39,7 +39,7 @@ var<storage, read_write> intersected_bbox: array<vec4<f32>>;
 // TODO: put into shared include
 
 @group(0) @binding(5)
-var<storate, read_write> bump: BumpAllocators;
+var<storage, read_write> bump: BumpAllocators;
 
 @group(0) @binding(6)
 var<storage, read_write> bin_data: array<u32>;
@@ -54,11 +54,14 @@ struct BinHeader {
 var<storage, read_write> bin_header: array<BinHeader>;
 
 // conversion factors from coordinates to bin
-let SX = 1.0 / f32(N_TILE_X * TILE_WIDTH);
-let SY = 1.0 / f32(N_TILE_Y * TILE_HEIGHT);
+let SX = 0.00390625;
+let SY = 0.00390625;
+//let SX = 1.0 / f32(N_TILE_X * TILE_WIDTH);
+//let SY = 1.0 / f32(N_TILE_Y * TILE_HEIGHT);
 
 let WG_SIZE = 256u;
-let N_SLICE = WG_SIZE / 32u;
+let N_SLICE = 8u;
+//let N_SLICE = WG_SIZE / 32u;
 
 var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
 var<workgroup> sh_count: array<array<u32, N_TILE>, N_SLICE>;
@@ -82,7 +85,7 @@ fn main(
     var y1 = 0;
     if element_ix < config.n_drawobj {
         let draw_monoid = draw_monoids[element_ix];
-        var clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
+        var clip_bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
         if draw_monoid.clip_ix > 0u {
             clip_bbox = clip_bbox_buf[draw_monoid.clip_ix - 1u];
         }
@@ -92,10 +95,11 @@ fn main(
         // TODO check this is true
 
         let path_bbox = path_bbox_buf[draw_monoid.path_ix];
-        let pb = vec4(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
-        let bbox = bbox_intersect(clip_bbox, pb);
+        let pb = vec4<f32>(vec4<i32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1));
+        let bbox_raw = bbox_intersect(clip_bbox, pb);
+        // TODO(naga): clunky expression a workaround for broken lhs swizzle
+        let bbox = vec4<f32>(bbox_raw.xy, max(bbox_raw.xy, bbox_raw.zw));
 
-        bbox.zw = max(bbox.xy, bbox.zw);
         intersected_bbox[element_ix] = bbox;
         x0 = i32(floor(bbox.x * SX));
         y0 = i32(floor(bbox.y * SY));
@@ -117,7 +121,7 @@ fn main(
     let my_mask = 1u << (local_id.x & 31u);
     while y < y1 {
         atomicOr(&sh_bitmaps[my_slice][y * width_in_bins + x], my_mask);
-        x += 1u;
+        x += 1;
         if x == x1 {
             x = x0;
             y += 1;
@@ -128,8 +132,8 @@ fn main(
     // Allocate output segments
     var element_count = 0u;
     for (var i = 0u; i < N_SLICE; i += 1u) {
-        elementCount += countOneBits(atomicLoad(&sh_bitmaps[i][local_id.x]));
-        sh_count[i][id_ix] = element_count;
+        element_count += countOneBits(atomicLoad(&sh_bitmaps[i][local_id.x]));
+        sh_count[i][local_id.x] = element_count;
     }
     // element_count is the number of draw objects covering this thread's bin
     let chunk_offset = atomicAdd(&bump.binning, element_count);
@@ -145,18 +149,18 @@ fn main(
         let bin_ix = y * width_in_bins + x;
         let out_mask = atomicLoad(&sh_bitmaps[my_slice][bin_ix]);
         // I think this predicate will always be true...
-        if (out_mask & my_mask) != 0 {
+        if (out_mask & my_mask) != 0u {
             var idx = countOneBits(out_mask & (my_mask - 1u));
-            if my_slice > 0 {
+            if my_slice > 0u {
                 idx += sh_count[my_slice - 1u][bin_ix];
             }
             let offset = sh_chunk_offset[bin_ix];
             bin_data[offset + idx] = element_ix;
         }
-        x += 1u;
+        x += 1;
         if x == x1 {
             x = x0;
-            y += 1u;
+            y += 1;
         }
     }
 }
diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index 4ccb0c2..8c59ba6 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -20,6 +20,7 @@
 #import bump
 #import drawtag
 #import ptcl
+#import tile
 
 @group(0) @binding(0)
 var<storage> config: Config;
@@ -40,33 +41,21 @@ struct BinHeader {
 var<storage> bin_headers: array<BinHeader>;
 
 @group(0) @binding(4)
-var<storage, read_write> bin_data: array<u32>;
-
-@group(0) @binding(5)
-var<storage, read_write> bump: BumpAllocators;
-
-@group(0) @binding(6)
-var<storage, read_write> ptcl: array<u32>;
-
-// TODO: put this in the right place
-struct Path {
-    // bounding box in pixels
-    bbox: vec4<u32>,
-    // offset (in u32's) to tile rectangle
-    tiles: u32,
-}
-
-struct Tile {
-    backdrop: i32,
-    segments: u32,
-}
-
-@group(0) @binding(7)
 var<storage> paths: array<Path>;
 
-@group(0) @binding(8)
+@group(0) @binding(5)
 var<storage> tiles: array<Tile>;
 
+@group(0) @binding(6)
+var<storage> bin_data: array<u32>;
+
+@group(0) @binding(7)
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(8)
+var<storage, read_write> ptcl: array<u32>;
+
+
 
 // Much of this code assumes WG_SIZE == N_TILE. If these diverge, then
 // a fair amount of fixup is needed.
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index b962bf2..6782542 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -30,10 +30,10 @@ var<storage> scene: array<u32>;
 var<storage> reduced: array<DrawMonoid>;
 
 @group(0) @binding(3)
-var<storage, read_write> draw_monoid: array<DrawMonoid>;
+var<storage> path_bbox: array<PathBbox>;
 
 @group(0) @binding(4)
-var<storage> path_bbox: array<PathBbox>;
+var<storage, read_write> draw_monoid: array<DrawMonoid>;
 
 @group(0) @binding(5)
 var<storage, read_write> info: array<u32>;
@@ -64,6 +64,8 @@ fn main(
     workgroupBarrier();
     var m = draw_monoid_identity();
     if wg_id.x > 0u {
+        // TODO: separate dispatch to scan these, or integrate into this one?
+        // In the meantime, will be limited to 2 * WG draw objs.
         m = reduced[wg_id.x - 1u];
     }
     if local_id.x > 0u {
diff --git a/piet-wgsl/shader/draw_reduce.wgsl b/piet-wgsl/shader/draw_reduce.wgsl
index 9efae7c..8ff1a44 100644
--- a/piet-wgsl/shader/draw_reduce.wgsl
+++ b/piet-wgsl/shader/draw_reduce.wgsl
@@ -26,7 +26,7 @@ var<storage> scene: array<u32>;
 @group(0) @binding(2)
 var<storage, read_write> reduced: array<DrawMonoid>;
 
-let WG_SIZE = 256;
+let WG_SIZE = 256u;
 
 var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;
 
@@ -37,7 +37,7 @@ fn main(
 ) {
     let ix = global_id.x;
     let tag_word = scene[config.drawtag_base + ix];
-    let agg = map_draw_tag(tag_word);
+    var agg = map_draw_tag(tag_word);
     sh_scratch[local_id.x] = agg;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
         workgroupBarrier();
@@ -49,6 +49,6 @@ fn main(
         sh_scratch[local_id.x] = agg;
     }
     if local_id.x == 0u {
-        reduced[ix >> LG_WG_SIZE] = agg;
+        reduced[ix >> firstTrailingBit(WG_SIZE)] = agg;
     }
 }
diff --git a/piet-wgsl/shader/path_coarse.wgsl b/piet-wgsl/shader/path_coarse.wgsl
index 534963f..ffdd8cf 100644
--- a/piet-wgsl/shader/path_coarse.wgsl
+++ b/piet-wgsl/shader/path_coarse.wgsl
@@ -30,7 +30,8 @@ var<storage> tag_monoids: array<TagMonoid>;
 @group(0) @binding(3)
 var<storage, read_write> output: array<vec2<f32>>;
 #else
-struct Tile {
+// We don't get this from import as it's the atomic version
+struct AtomicTile {
     backdrop: atomic<i32>,
     segments: atomic<u32>,
 }
@@ -38,7 +39,7 @@ struct Tile {
 #import segment
 
 @group(0) @binding(3)
-var<storage, read_write> tiles: array<Tile>;
+var<storage, read_write> tiles: array<AtomicTile>;
 
 @group(0) @binding(4)
 var<storage, read_write> segments: array<Segment>;
diff --git a/piet-wgsl/shader/path_coarse_full.wgsl b/piet-wgsl/shader/path_coarse_full.wgsl
new file mode 100644
index 0000000..5641642
--- /dev/null
+++ b/piet-wgsl/shader/path_coarse_full.wgsl
@@ -0,0 +1,276 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Path coarse rasterization for the full implementation.
+
+#import config
+#import pathtag
+#import tile
+#import segment
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+// Maybe dedup?
+struct Cubic {
+    p0: vec2<f32>,
+    p1: vec2<f32>,
+    p2: vec2<f32>,
+    p3: vec2<f32>,
+}
+
+@group(0) @binding(2)
+var<storage> tag_monoids: array<TagMonoid>;
+
+@group(0) @binding(3)
+var<storage> cubics: array<Cubic>;
+
+@group(0) @binding(4)
+var<storage> paths: array<Path>;
+
+// We don't get this from import as it's the atomic version
+struct AtomicTile {
+    backdrop: atomic<i32>,
+    segments: atomic<u32>,
+}
+
+@group(0) @binding(5)
+var<storage, read_write> tiles: array<AtomicTile>;
+
+@group(0) @binding(6)
+var<storage, read_write> segments: array<Segment>;
+
+struct SubdivResult {
+    val: f32,
+    a0: f32,
+    a2: f32,
+}
+
+let D = 0.67;
+fn approx_parabola_integral(x: f32) -> f32 {
+    return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
+}
+
+let B = 0.39;
+fn approx_parabola_inv_integral(x: f32) -> f32 {
+    return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
+}
+
+fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
+    let d01 = p1 - p0;
+    let d12 = p2 - p1;
+    let dd = d01 - d12;
+    let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    let cross_inv = 1.0 / cross;
+    let x0 = dot(d01, dd) * cross_inv;
+    let x2 = dot(d12, dd) * cross_inv;
+    let scale = abs(cross / (length(dd) * (x2 - x0)));
+
+    let a0 = approx_parabola_integral(x0);
+    let a2 = approx_parabola_integral(x2);
+    var val = 0.0;
+    if scale < 1e9 {
+        let da = abs(a2 - a0);
+        let sqrt_scale = sqrt(scale);
+        if sign(x0) == sign(x2) {
+            val = sqrt_scale;
+        } else {
+            let xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol / approx_parabola_integral(xmin);
+        }
+        val *= da;
+    }
+    return SubdivResult(val, a0, a2);
+}
+
+fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
+    let mt = 1.0 - t;
+    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
+}
+
+fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
+    let mt = 1.0 - t;
+    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
+}
+
+fn alloc_segment() -> u32 {
+    // Use 0-index segment (address is sentinel) as counter
+    // TODO: separate small buffer binding for this?
+    return atomicAdd(&tiles[4096].segments, 1u) + 1u;
+}
+
+let MAX_QUADS = 16u;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+) {
+    let ix = global_id.x;
+    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
+    let shift = (ix & 3u) * 8u;
+    var tag_byte = (tag_word >> shift) & 0xffu;
+
+    // Reconstruct path_ix from monoid or store in cubic?
+    if (tag_byte & PATH_TAG_SEG_TYPE) != 0u {
+        let path_ix = 42u; // BIG GIANT TODO
+        let path = paths[path_ix];
+        let bbox = vec4<i32>(path.bbox);
+        let cubic = cubics[global_id.x];
+        let p0 = cubic.p0;
+        let p1 = cubic.p1;
+        let p2 = cubic.p2;
+        let p3 = cubic.p3;
+        let err_v = 3.0 * (p2 - p1) + p0 - p3;
+        let err = dot(err_v, err_v);
+        let ACCURACY = 0.25;
+        let Q_ACCURACY = ACCURACY * 0.1;
+        let REM_ACCURACY = (ACCURACY - Q_ACCURACY);
+        let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
+        var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
+        n_quads = min(n_quads, MAX_QUADS);
+        var keep_params: array<SubdivResult, MAX_QUADS>;
+        var val = 0.0;
+        var qp0 = p0;
+        let step = 1.0 / f32(n_quads);
+        for (var i = 0u; i < n_quads; i += 1u) {
+            let t = f32(i + 1u) * step;
+            let qp2 = eval_cubic(p0, p1, p2, p3, t);
+            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+            let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
+            keep_params[i] = params;
+            val += params.val;
+            qp0 = qp2;
+        }
+        let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
+        var lp0 = p0;
+        qp0 = p0;
+        let v_step = val / f32(n);
+        var n_out = 1u;
+        var val_sum = 0.0;
+        for (var i = 0u; i < n_quads; i += 1u) {
+            let t = f32(i + 1u) * step;
+            let qp2 = eval_cubic(p0, p1, p2, p3, t);
+            var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
+            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+            let params = keep_params[i];
+            let u0 = approx_parabola_inv_integral(params.a0);
+            let u2 = approx_parabola_inv_integral(params.a2);
+            let uscale = 1.0 / (u2 - u0);
+            var val_target = f32(n_out) * v_step;
+            while n_out == n || val_target < val_sum + params.val {
+                var lp1: vec2<f32>;
+                if n_out == n {
+                    lp1 = p3;
+                } else {
+                    let u = (val_target - val_sum) / params.val;
+                    let a = mix(params.a0, params.a2, u);
+                    let au = approx_parabola_inv_integral(a);
+                    let t = (au - u0) * uscale;
+                    lp1 = eval_quad(qp0, qp1, qp2, t);
+                }
+
+                // Output line segment lp0..lp1
+                let xymin = min(lp0, lp1);
+                let xymax = max(lp0, lp1);
+                let dp = lp1 - lp0;
+                let recip_dx = 1.0 / dp.x;
+                let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9);
+                let c = 0.5 * abs(invslope);
+                let b = invslope;
+                let SX = 1.0 / f32(TILE_WIDTH);
+                let SY = 1.0 / f32(TILE_HEIGHT);
+                let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX;
+                var x0 = i32(floor(xymin.x * SX));
+                var x1 = i32(floor(xymax.x * SX) + 1.0);
+                var y0 = i32(floor(xymin.y * SY));
+                var y1 = i32(floor(xymax.y * SY) + 1.0);
+                x0 = clamp(x0, 0, i32(config.width_in_tiles));
+                x1 = clamp(x1, 0, i32(config.width_in_tiles));
+                y0 = clamp(y0, 0, i32(config.height_in_tiles));
+                y1 = clamp(y1, 0, i32(config.height_in_tiles));
+                var xc = a + b * f32(y0);
+                var xray = i32(floor(lp0.x * SX));
+                var last_xray = i32(floor(lp1.x * SX));
+                if dp.y < 0.0 {
+                    let tmp = xray;
+                    xray = last_xray;
+                    last_xray = tmp;
+                }
+                for (var y = y0; y < y1; y += 1) {
+                    let tile_y0 = f32(y) * f32(TILE_HEIGHT);
+                    let xbackdrop = max(xray + 1, 0);
+                    if xymin.y < tile_y0 && xbackdrop < i32(config.width_in_tiles) {
+                        let backdrop = select(-1, 1, dp.y < 0.0);
+                        let tile_ix = y * i32(config.width_in_tiles) + xbackdrop;
+                        atomicAdd(&tiles[tile_ix].backdrop, backdrop);
+                    }
+                    var next_xray = last_xray;
+                    if y + 1 < y1 {
+                        let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT);
+                        let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope;
+                        next_xray = i32(floor(x_edge * SX));
+                    }
+                    let min_xray = min(xray, next_xray);
+                    let max_xray = max(xray, next_xray);
+                    var xx0 = min(i32(floor(xc - c)), min_xray);
+                    var xx1 = max(i32(ceil(xc + c)), max_xray + 1);
+                    xx0 = clamp(xx0, x0, x1);
+                    xx1 = clamp(xx1, x0, x1);
+                    var tile_seg: Segment;
+                    for (var x = xx0; x < xx1; x += 1) {
+                        let tile_x0 = f32(x) * f32(TILE_WIDTH);
+                        let tile_ix = y * i32(config.width_in_tiles) + x;
+                        // allocate segment, insert linked list
+                        let seg_ix = alloc_segment();
+                        let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
+                        tile_seg.origin = lp0;
+                        tile_seg.delta = dp;
+                        var y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx);
+                        if xymin.x < tile_x0 {
+                            let p = vec2<f32>(tile_x0, y_edge);
+                            if dp.x < 0.0 {
+                                tile_seg.delta = p - lp0;
+                            } else {
+                                tile_seg.origin = p;
+                                tile_seg.delta = lp1 - p;
+                            }
+                            if tile_seg.delta.x == 0.0 {
+                                tile_seg.delta.x = sign(dp.x) * 1e-9;
+                            }
+                        }
+                        if x <= min_xray || max_xray < x {
+                            y_edge = 1e9;
+                        }
+                        tile_seg.y_edge = y_edge;
+                        tile_seg.next = old;
+                        segments[seg_ix] = tile_seg;
+                    }
+                    xc += b;
+                    xray = next_xray;
+                }
+                n_out += 1u;
+                val_target += v_step;
+                lp0 = lp1;
+            }
+            val_sum += params.val;
+            qp0 = qp2;
+        }
+    }
+}
diff --git a/piet-wgsl/shader/pathseg.wgsl b/piet-wgsl/shader/pathseg.wgsl
new file mode 100644
index 0000000..03c5aa6
--- /dev/null
+++ b/piet-wgsl/shader/pathseg.wgsl
@@ -0,0 +1,215 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Path segment decoding for the full case.
+
+// In the simple case, path segments are decoded as part of the coarse
+// path rendering stage. In the full case, they are separated, as the
+// decoding process also generates bounding boxes, and those in turn are
+// used for tile allocation and clipping; actual coarse path rasterization
+// can't proceed until those are complete.
+
+// There's some duplication of the decoding code but we won't worry about
+// that just now. Perhaps it could be factored more nicely later.
+
+#import config
+#import pathtag
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> scene: array<u32>;
+
+@group(0) @binding(2)
+var<storage> tag_monoids: array<TagMonoid>;
+
+struct AtomicPathBbox {
+    x0: atomic<i32>,
+    y0: atomic<i32>,
+    x1: atomic<i32>,
+    y1: atomic<i32>,
+    linewidth: f32,
+    trans_ix: u32,
+}
+
+@group(0) @binding(3)
+var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
+
+struct Cubic {
+    p0: vec2<f32>,
+    p1: vec2<f32>,
+    p2: vec2<f32>,
+    p3: vec2<f32>,
+}
+
+@group(0) @binding(4)
+var<storage, read_write> cubics: array<Cubic>;
+
+// Monoid is yagni, for future optimization
+
+// struct BboxMonoid {
+//     bbox: vec4<f32>,
+//     flags: u32,
+// }
+
+// let FLAG_RESET_BBOX = 1u;
+// let FLAG_SET_BBOX = 2u;
+
+// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
+//     var c: BboxMonoid;
+//     c.bbox = b.bbox;
+//     // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
+//     if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
+//         c.bbox = a.bbox;
+//     } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
+//         (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
+//     {
+//         c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
+//     }
+//     c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
+//     c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
+//     return c;
+// }
+
+// fn bbox_monoid_identity() -> BboxMonoid {
+//     return BboxMonoid();
+// }
+
+var<private> pathdata_base: u32;
+var<private> transform_base: u32;
+
+fn read_f32_point(ix: u32) -> vec2<f32> {
+    let x = bitcast<f32>(scene[pathdata_base + ix]);
+    let y = bitcast<f32>(scene[pathdata_base + ix + 1u]);
+    return vec2<f32>(x, y);
+}
+
+fn read_i16_point(ix: u32) -> vec2<f32> {
+    let raw = scene[pathdata_base + ix];
+    let x = f32(i32(raw << 16u) >> 16u);
+    let y = f32(i32(raw) >> 16u);
+    return vec2<f32>(x, y);
+}
+
+struct Transform {
+    matrx: vec4<f32>,
+    translate: vec2<f32>,
+}
+
+fn read_transform(ix: u32) -> Transform {
+    let base = transform_base + ix * 6u;
+    let c0 = bitcast<f32>(scene[base]);
+    let c1 = bitcast<f32>(scene[base] + 1u);
+    let c2 = bitcast<f32>(scene[base] + 2u);
+    let c3 = bitcast<f32>(scene[base] + 3u);
+    let c4 = bitcast<f32>(scene[base] + 4u);
+    let c5 = bitcast<f32>(scene[base] + 5u);
+    let matrx = vec4<f32>(c0, c1, c2, c3);
+    let translate = vec2<f32>(c4, c5);
+    return Transform(matrx, translate);
+}
+
+fn transform_apply(transform: Transform, p: vec2<f32>) -> vec2<f32> {
+    return transform.matrx.xy * p.x + transform.matrx.zw * p.y + transform.translate;
+}
+
+fn round_down(x: f32) -> i32 {
+    return i32(floor(x));
+}
+
+fn round_up(x: f32) -> i32 {
+    return i32(ceil(x));
+}
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    let ix = global_id.x;
+    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
+    // TODO: set transform_base
+    pathdata_base = config.pathdata_base;
+    let shift = (ix & 3u) * 8u;
+    var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
+    tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm);
+    var tag_byte = (tag_word >> shift) & 0xffu;
+
+    let out = &path_bboxes[tm.path_ix];
+    if (tag_byte & PATH_TAG_PATH) != 0u {
+        (*out).linewidth = -1.0; // TODO: plumb linewidth
+        (*out).trans_ix = tm.trans_ix;
+    }
+    // Decode path data
+    let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
+    if seg_type != 0u {
+        var p0: vec2<f32>;
+        var p1: vec2<f32>;
+        var p2: vec2<f32>;
+        var p3: vec2<f32>;
+        if (tag_byte & PATH_TAG_F32) != 0u {
+            p0 = read_f32_point(tm.pathseg_offset);
+            p1 = read_f32_point(tm.pathseg_offset + 2u);
+            if seg_type >= PATH_TAG_QUADTO {
+                p2 = read_f32_point(tm.pathseg_offset + 4u);
+                if seg_type == PATH_TAG_CUBICTO {
+                    p3 = read_f32_point(tm.pathseg_offset + 6u);
+                }
+            }
+        } else {
+            p0 = read_i16_point(tm.pathseg_offset);
+            p1 = read_i16_point(tm.pathseg_offset + 1u);
+            if seg_type >= PATH_TAG_QUADTO {
+                p2 = read_i16_point(tm.pathseg_offset + 2u);
+                if seg_type == PATH_TAG_CUBICTO {
+                    p3 = read_i16_point(tm.pathseg_offset + 3u);
+                }
+            }
+        }
+        let transform = read_transform(tm.trans_ix);
+        p0 = transform_apply(transform, p0);
+        p1 = transform_apply(transform, p1);
+        var bbox = vec4<f32>(min(p0, p1), max(p0, p1));
+        // Degree-raise
+        if seg_type == PATH_TAG_LINETO {
+            p3 = p1;
+            p2 = mix(p3, p0, 1.0 / 3.0);
+            p1 = mix(p0, p3, 1.0 / 3.0);
+        } else if seg_type >= PATH_TAG_QUADTO {
+            p2 = transform_apply(transform, p2);
+            bbox = vec4<f32>(min(bbox.xy, p2), max(bbox.zw, p2));
+            if seg_type == PATH_TAG_CUBICTO {
+                p3 = transform_apply(transform, p3);
+                bbox = vec4<f32>(min(bbox.xy, p3), max(bbox.zw, p3));
+            } else {
+                p3 = p2;
+                p2 = mix(p1, p2, 1.0 / 3.0);
+                p1 = mix(p1, p0, 1.0 / 3.0);
+            }
+        }
+        cubics[global_id.x] = Cubic(p0, p1, p2, p3);
+        // Update bounding box using atomics only. Computing a monoid is a
+        // potential future optimization.
+        if bbox.z > bbox.x && bbox.w > bbox.y {
+            atomicMin(&(*out).x0, round_down(bbox.x));
+            atomicMin(&(*out).y0, round_down(bbox.y));
+            atomicMax(&(*out).x1, round_up(bbox.z));
+            atomicMax(&(*out).y1, round_up(bbox.w));
+        }
+    }
+
+}
diff --git a/piet-wgsl/shader/shared/bbox.wgsl b/piet-wgsl/shader/shared/bbox.wgsl
index fb7728b..c260df9 100644
--- a/piet-wgsl/shader/shared/bbox.wgsl
+++ b/piet-wgsl/shader/shared/bbox.wgsl
@@ -16,11 +16,13 @@
 
 // The annotated bounding box for a path. It has been transformed,
 // but contains a link to the active transform, mostly for gradients.
+// Coordinates are integer pixels (for the convenience of atomic update)
+// but will probably become fixed-point fractions for rectangles.
 struct PathBbox {
-    x0: u32,
-    y0: u32,
-    x1: u32,
-    y1: u32,
+    x0: i32,
+    y0: i32,
+    x1: i32,
+    y1: i32,
     linewidth: f32,
     trans_ix: u32,
 }
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index 9058d5c..bf0839d 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -19,6 +19,7 @@ struct Config {
     height_in_tiles: u32,
 
     n_drawobj: u32,
+    n_path: u32,
 
     // offsets within scene buffer (in u32 units)
     // Note: this is a difference from piet-gpu, which is in bytes
diff --git a/piet-wgsl/shader/shared/pathtag.wgsl b/piet-wgsl/shader/shared/pathtag.wgsl
index 2424d2a..b248e18 100644
--- a/piet-wgsl/shader/shared/pathtag.wgsl
+++ b/piet-wgsl/shader/shared/pathtag.wgsl
@@ -18,7 +18,10 @@ struct TagMonoid {
     trans_ix: u32,
     pathseg_ix: u32,
     pathseg_offset: u32,
-    // Note: piet-gpu has linewidth and path, but not needed here
+#ifdef full
+    linewidth_ix: u32,
+    path_ix: u32,
+#endif
 }
 
 let PATH_TAG_SEG_TYPE = 3u;
@@ -26,8 +29,11 @@ let PATH_TAG_LINETO = 1u;
 let PATH_TAG_QUADTO = 2u;
 let PATH_TAG_CUBICTO = 3u;
 let PATH_TAG_F32 = 8u;
-let PATH_TAG_PATH = 0x10u;
 let PATH_TAG_TRANSFORM = 0x20u;
+#ifdef full
+let PATH_TAG_PATH = 0x10u;
+let PATH_TAG_LINEWIDTH = 0x40u;
+#endif
 
 fn tag_monoid_identity() -> TagMonoid {
     return TagMonoid();
@@ -38,6 +44,10 @@ fn combine_tag_monoid(a: TagMonoid, b: TagMonoid) -> TagMonoid {
     c.trans_ix = a.trans_ix + b.trans_ix;
     c.pathseg_ix = a.pathseg_ix + b.pathseg_ix;
     c.pathseg_offset = a.pathseg_offset + b.pathseg_offset;
+#ifdef full
+    c.linewidth_ix = a.linewidth_ix + b.linewidth_ix;
+    c.path_ix = a.path_ix + b.path_ix;
+#endif
     return c;
 }
 
@@ -51,5 +61,9 @@ fn reduce_tag(tag_word: u32) -> TagMonoid {
     a += a >> 8u;
     a += a >> 16u;
     c.pathseg_offset = a & 0xffu;
+#ifdef full
+    c.path_ix = countOneBits(tag_word & (PATH_TAG_PATH) * 0x1010101u);
+    c.linewidth_ix = countOneBits(tag_word & (PATH_TAG_LINEWIDTH * 0x1010101u));
+#endif
     return c;
 }
diff --git a/piet-wgsl/shader/shared/tile.wgsl b/piet-wgsl/shader/shared/tile.wgsl
new file mode 100644
index 0000000..f575e50
--- /dev/null
+++ b/piet-wgsl/shader/shared/tile.wgsl
@@ -0,0 +1,29 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+// Common datatypes for path and tile intermediate info.
+
+struct Path {
+    // bounding box in tiles
+    bbox: vec4<u32>,
+    // offset (in u32's) to tile rectangle
+    tiles: u32,
+}
+
+struct Tile {
+    backdrop: i32,
+    segments: u32,
+}
\ No newline at end of file
diff --git a/piet-wgsl/shader/tile_alloc.wgsl b/piet-wgsl/shader/tile_alloc.wgsl
index 867db7e..0bd550e 100644
--- a/piet-wgsl/shader/tile_alloc.wgsl
+++ b/piet-wgsl/shader/tile_alloc.wgsl
@@ -19,6 +19,7 @@
 #import config
 #import bump
 #import drawtag
+#import tile
 
 @group(0) @binding(0)
 var<storage> config: Config;
@@ -32,19 +33,6 @@ var<storage> draw_bboxes: array<vec4<f32>>;
 @group(0) @binding(3)
 var<storage, read_write> bump: BumpAllocators;
 
-// TODO: put this in the right place, dedup
-struct Path {
-    // bounding box in pixels
-    bbox: vec4<u32>,
-    // offset (in u32's) to tile rectangle
-    tiles: u32,
-}
-
-struct Tile {
-    backdrop: i32,
-    segments: u32,
-}
-
 @group(0) @binding(4)
 var<storage, read_write> paths: array<Path>;
 
diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs
index 5cf960d..ba0d034 100644
--- a/piet-wgsl/src/main.rs
+++ b/piet-wgsl/src/main.rs
@@ -22,7 +22,7 @@ use engine::Engine;
 
 use render::render;
 use test_scene::dump_scene_info;
-use wgpu::{Device, Queue};
+use wgpu::{Device, Queue, Limits};
 
 mod engine;
 mod render;
@@ -33,12 +33,14 @@ async fn run() -> Result<(), Box<dyn std::error::Error>> {
     let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY);
     let adapter = instance.request_adapter(&Default::default()).await.unwrap();
     let features = adapter.features();
+    let mut limits = Limits::default();
+    limits.max_storage_buffers_per_shader_stage = 16;
     let (device, queue) = adapter
         .request_device(
             &wgpu::DeviceDescriptor {
                 label: None,
                 features: features & wgpu::Features::TIMESTAMP_QUERY,
-                limits: Default::default(),
+                limits,
             },
             None,
         )
@@ -55,6 +57,7 @@ async fn do_render(
     engine: &mut Engine,
 ) -> Result<(), Box<dyn std::error::Error>> {
     let shaders = shaders::init_shaders(device, engine)?;
+    let full_shaders = shaders::full_shaders(device, engine)?;
     let scene = test_scene::gen_test_scene();
     dump_scene_info(&scene);
     let (recording, buf) = render(&scene, &shaders);
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index c3c30f1..3f9d184 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -16,6 +16,7 @@ struct Config {
     width_in_tiles: u32,
     height_in_tiles: u32,
     n_drawobj: u32,
+    n_path: u32,
     pathtag_base: u32,
     pathdata_base: u32,
     drawtag_base: u32,
diff --git a/piet-wgsl/src/shaders.rs b/piet-wgsl/src/shaders.rs
index d1e8847..6a7e9fb 100644
--- a/piet-wgsl/src/shaders.rs
+++ b/piet-wgsl/src/shaders.rs
@@ -35,6 +35,22 @@ pub struct Shaders {
     pub fine: ShaderId,
 }
 
+// Shaders for the full pipeline
+pub struct FullShaders {
+    pub pathtag_reduce: ShaderId,
+    pub pathtag_scan: ShaderId,
+    pub bbox_clear: ShaderId,
+    pub pathseg: ShaderId,
+    pub draw_reduce: ShaderId,
+    pub draw_leaf: ShaderId,
+    pub tile_alloc: ShaderId,
+    pub binning: ShaderId,
+    pub path_coarse: ShaderId,
+    pub backdrop: ShaderId,
+    pub coarse: ShaderId,
+    pub fine: ShaderId,
+}
+
 pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Error> {
     let shader_dir = Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/shader"));
     let imports = preprocess::get_imports(shader_dir);
@@ -93,3 +109,153 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
         fine,
     })
 }
+
+pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders, Error> {
+    let shader_dir = Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/shader"));
+    let imports = preprocess::get_imports(shader_dir);
+    let read_shader =
+        |path: &str| fs::read_to_string(shader_dir.join(path.to_string() + ".wgsl")).unwrap();
+    let empty = HashSet::new();
+    let mut full_config = HashSet::new();
+    full_config.insert("full".into());
+    let pathtag_reduce = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("pathtag_reduce"), &full_config, &imports).into(),
+        &[BindType::BufReadOnly, BindType::BufReadOnly, BindType::Buffer],
+    )?;
+    let pathtag_scan = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("pathtag_scan"), &full_config, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+        ],
+    )?;
+    let bbox_clear = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("bbox_clear"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::Buffer,
+        ],
+    )?;
+    let pathseg = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("pathseg"), &full_config, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let draw_reduce = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("draw_reduce"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+        ],
+    )?;
+    let draw_leaf = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("draw_leaf"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let tile_alloc = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("tile_alloc"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let binning = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("binning"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+
+    let path_coarse = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("path_coarse_full"), &full_config, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let backdrop = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("backdrop"), &empty, &imports).into(),
+        &[BindType::BufReadOnly, BindType::Buffer],
+    )?;
+    let coarse = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("coarse"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let fine = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("fine"), &full_config, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::BufReadOnly,
+        ],
+    )?;
+    Ok(FullShaders {
+        pathtag_reduce,
+        pathtag_scan,
+        bbox_clear,
+        pathseg,
+        draw_reduce,
+        draw_leaf,
+        tile_alloc,
+        binning,
+        path_coarse,
+        backdrop,
+        coarse,
+        fine,
+    })
+}

From 06fa3cb9ab4f8b10b05419e4d58e40401e0c0203 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Thu, 3 Nov 2022 19:33:11 -0700
Subject: [PATCH 07/17] Checkpoint

Felt like checkpointing what I have before trying to run the pipeline. Theoretically everything should work.
---
 .vscode/settings.json                  |   1 +
 piet-wgsl/shader/draw_leaf.wgsl        |  34 ++++-
 piet-wgsl/shader/fine.wgsl             |   8 +-
 piet-wgsl/shader/path_coarse_full.wgsl |  43 +++---
 piet-wgsl/shader/pathseg.wgsl          |  18 +--
 piet-wgsl/shader/shared/bump.wgsl      |   1 +
 piet-wgsl/shader/shared/config.wgsl    |   2 +
 piet-wgsl/src/main.rs                  |   2 +-
 piet-wgsl/src/render.rs                | 173 ++++++++++++++++++++++++-
 piet-wgsl/src/shaders.rs               |  48 ++++---
 10 files changed, 265 insertions(+), 65 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index bdb0028..813fac6 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,6 +3,7 @@
     "bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
     "bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl",
     "config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
+    "cubic": "${workspaceFolder}/piet-wgsl/shader/shared/cubic.wgsl",
     "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
     "segment": "${workspaceFolder}/piet-wgsl/shader/shared/segment.wgsl",
     "pathtag": "${workspaceFolder}/piet-wgsl/shader/shared/pathtag.wgsl",
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index 6782542..bce04f8 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -40,6 +40,25 @@ var<storage, read_write> info: array<u32>;
 
 let WG_SIZE = 256u;
 
+// Possibly dedup?
+struct Transform {
+    matrx: vec4<f32>,
+    translate: vec2<f32>,
+}
+
+fn read_transform(transform_base: u32, ix: u32) -> Transform {
+    let base = transform_base + ix * 6u;
+    let c0 = bitcast<f32>(scene[base]);
+    let c1 = bitcast<f32>(scene[base] + 1u);
+    let c2 = bitcast<f32>(scene[base] + 2u);
+    let c3 = bitcast<f32>(scene[base] + 3u);
+    let c4 = bitcast<f32>(scene[base] + 4u);
+    let c5 = bitcast<f32>(scene[base] + 5u);
+    let matrx = vec4<f32>(c0, c1, c2, c3);
+    let translate = vec2<f32>(c4, c5);
+    return Transform(matrx, translate);
+}
+
 var<workgroup> sh_scratch: array<DrawMonoid, WG_SIZE>;
 
 @compute @workgroup_size(256)
@@ -80,17 +99,20 @@ fn main(
         tag_word == DRAWTAG_BEGIN_CLIP
     {
         let bbox = path_bbox[m.path_ix];
-        let x0 = f32(bbox.x0) - 32768.0;
-        let y0 = f32(bbox.y0) - 32768.0;
-        let x1 = f32(bbox.x1) - 32768.0;
-        let y1 = f32(bbox.y1) - 32768.0;
-        let bbox_f = vec4(x0, y0, x1, y1);
+        // TODO: bbox is mostly yagni here, sort that out. Maybe clips?
+        // let x0 = f32(bbox.x0);
+        // let y0 = f32(bbox.y0);
+        // let x1 = f32(bbox.x1);
+        // let y1 = f32(bbox.y1);
+        // let bbox_f = vec4(x0, y0, x1, y1);
         let fill_mode = u32(bbox.linewidth >= 0.0);
         var matrx: vec4<f32>;
         var translate: vec2<f32>;
         var linewidth = bbox.linewidth;
         if linewidth >= 0.0 || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT {
-            // TODO: retrieve transform from scene. Packed?
+            let transform = read_transform(config.transform_base, bbox.trans_ix);
+            matrx = transform.matrx;
+            translate = transform.translate;
         }
         if linewidth >= 0.0 {
             // Note: doesn't deal with anisotropic case
diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index 7b66d04..751e408 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -157,12 +157,16 @@ fn main(
             default: {}
         }
     }
-
+    let out_ix = global_id.y * (config.width_in_tiles * TILE_WIDTH) + global_id.x * PIXELS_PER_THREAD;
+    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+        let bytes = pack4x8unorm(rgba[i]);
+        output[out_ix + i] = bytes;
+    }
 #else
     let area = fill_path(tile, xy);
-#endif
 
     let bytes = pack4x8unorm(vec4<f32>(area[0], area[1], area[2], area[3]));
     let out_ix = global_id.y * (config.width_in_tiles * 4u) + global_id.x;
     output[out_ix] = bytes;
+#endif
 }
diff --git a/piet-wgsl/shader/path_coarse_full.wgsl b/piet-wgsl/shader/path_coarse_full.wgsl
index 5641642..6b6e9b8 100644
--- a/piet-wgsl/shader/path_coarse_full.wgsl
+++ b/piet-wgsl/shader/path_coarse_full.wgsl
@@ -20,6 +20,8 @@
 #import pathtag
 #import tile
 #import segment
+#import cubic
+#import bump
 
 @group(0) @binding(0)
 var<storage> config: Config;
@@ -27,14 +29,6 @@ var<storage> config: Config;
 @group(0) @binding(1)
 var<storage> scene: array<u32>;
 
-// Maybe dedup?
-struct Cubic {
-    p0: vec2<f32>,
-    p1: vec2<f32>,
-    p2: vec2<f32>,
-    p3: vec2<f32>,
-}
-
 @group(0) @binding(2)
 var<storage> tag_monoids: array<TagMonoid>;
 
@@ -51,9 +45,12 @@ struct AtomicTile {
 }
 
 @group(0) @binding(5)
-var<storage, read_write> tiles: array<AtomicTile>;
+var<storage, read_write> bump: BumpAllocators;
 
 @group(0) @binding(6)
+var<storage, read_write> tiles: array<AtomicTile>;
+
+@group(0) @binding(7)
 var<storage, read_write> segments: array<Segment>;
 
 struct SubdivResult {
@@ -110,9 +107,7 @@ fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32
 }
 
 fn alloc_segment() -> u32 {
-    // Use 0-index segment (address is sentinel) as counter
-    // TODO: separate small buffer binding for this?
-    return atomicAdd(&tiles[4096].segments, 1u) + 1u;
+    return atomicAdd(&bump.segments, 1u) + 1u;
 }
 
 let MAX_QUADS = 16u;
@@ -126,12 +121,13 @@ fn main(
     let shift = (ix & 3u) * 8u;
     var tag_byte = (tag_word >> shift) & 0xffu;
 
-    // Reconstruct path_ix from monoid or store in cubic?
     if (tag_byte & PATH_TAG_SEG_TYPE) != 0u {
-        let path_ix = 42u; // BIG GIANT TODO
-        let path = paths[path_ix];
-        let bbox = vec4<i32>(path.bbox);
+        // Discussion question: it might actually be cheaper to do the path segment
+        // decoding & transform again rather than store the result in a buffer;
+        // classic memory vs ALU tradeoff.
         let cubic = cubics[global_id.x];
+        let path = paths[cubic.path_ix];
+        let bbox = vec4<i32>(path.bbox);
         let p0 = cubic.p0;
         let p1 = cubic.p1;
         let p2 = cubic.p2;
@@ -201,11 +197,13 @@ fn main(
                 var x1 = i32(floor(xymax.x * SX) + 1.0);
                 var y0 = i32(floor(xymin.y * SY));
                 var y1 = i32(floor(xymax.y * SY) + 1.0);
-                x0 = clamp(x0, 0, i32(config.width_in_tiles));
-                x1 = clamp(x1, 0, i32(config.width_in_tiles));
-                y0 = clamp(y0, 0, i32(config.height_in_tiles));
-                y1 = clamp(y1, 0, i32(config.height_in_tiles));
+                x0 = clamp(x0, bbox.x, bbox.z);
+                x1 = clamp(x1, bbox.x, bbox.z);
+                y0 = clamp(y0, bbox.y, bbox.w);
+                y1 = clamp(y1, bbox.y, bbox.w);
                 var xc = a + b * f32(y0);
+                let stride = bbox.z - bbox.x;
+                var base = i32(path.tiles) + (y0 - bbox.y) * stride - bbox.x;
                 var xray = i32(floor(lp0.x * SX));
                 var last_xray = i32(floor(lp1.x * SX));
                 if dp.y < 0.0 {
@@ -218,7 +216,7 @@ fn main(
                     let xbackdrop = max(xray + 1, 0);
                     if xymin.y < tile_y0 && xbackdrop < i32(config.width_in_tiles) {
                         let backdrop = select(-1, 1, dp.y < 0.0);
-                        let tile_ix = y * i32(config.width_in_tiles) + xbackdrop;
+                        let tile_ix = base + xbackdrop;
                         atomicAdd(&tiles[tile_ix].backdrop, backdrop);
                     }
                     var next_xray = last_xray;
@@ -236,7 +234,7 @@ fn main(
                     var tile_seg: Segment;
                     for (var x = xx0; x < xx1; x += 1) {
                         let tile_x0 = f32(x) * f32(TILE_WIDTH);
-                        let tile_ix = y * i32(config.width_in_tiles) + x;
+                        let tile_ix = base + x;
                         // allocate segment, insert linked list
                         let seg_ix = alloc_segment();
                         let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
@@ -263,6 +261,7 @@ fn main(
                         segments[seg_ix] = tile_seg;
                     }
                     xc += b;
+                    base += stride;
                     xray = next_xray;
                 }
                 n_out += 1u;
diff --git a/piet-wgsl/shader/pathseg.wgsl b/piet-wgsl/shader/pathseg.wgsl
index 03c5aa6..1326fce 100644
--- a/piet-wgsl/shader/pathseg.wgsl
+++ b/piet-wgsl/shader/pathseg.wgsl
@@ -27,6 +27,7 @@
 
 #import config
 #import pathtag
+#import cubic
 
 @group(0) @binding(0)
 var<storage> config: Config;
@@ -49,12 +50,6 @@ struct AtomicPathBbox {
 @group(0) @binding(3)
 var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
 
-struct Cubic {
-    p0: vec2<f32>,
-    p1: vec2<f32>,
-    p2: vec2<f32>,
-    p3: vec2<f32>,
-}
 
 @group(0) @binding(4)
 var<storage, read_write> cubics: array<Cubic>;
@@ -90,7 +85,6 @@ var<storage, read_write> cubics: array<Cubic>;
 // }
 
 var<private> pathdata_base: u32;
-var<private> transform_base: u32;
 
 fn read_f32_point(ix: u32) -> vec2<f32> {
     let x = bitcast<f32>(scene[pathdata_base + ix]);
@@ -110,7 +104,7 @@ struct Transform {
     translate: vec2<f32>,
 }
 
-fn read_transform(ix: u32) -> Transform {
+fn read_transform(transform_base: u32, ix: u32) -> Transform {
     let base = transform_base + ix * 6u;
     let c0 = bitcast<f32>(scene[base]);
     let c1 = bitcast<f32>(scene[base] + 1u);
@@ -142,7 +136,6 @@ fn main(
 ) {
     let ix = global_id.x;
     let tag_word = scene[config.pathtag_base + (ix >> 2u)];
-    // TODO: set transform_base
     pathdata_base = config.pathdata_base;
     let shift = (ix & 3u) * 8u;
     var tm = reduce_tag(tag_word & ((1u << shift) - 1u));
@@ -180,7 +173,7 @@ fn main(
                 }
             }
         }
-        let transform = read_transform(tm.trans_ix);
+        let transform = read_transform(config.transform_base, tm.trans_ix);
         p0 = transform_apply(transform, p0);
         p1 = transform_apply(transform, p1);
         var bbox = vec4<f32>(min(p0, p1), max(p0, p1));
@@ -201,15 +194,14 @@ fn main(
                 p1 = mix(p1, p0, 1.0 / 3.0);
             }
         }
-        cubics[global_id.x] = Cubic(p0, p1, p2, p3);
+        cubics[global_id.x] = Cubic(p0, p1, p2, p3, tm.path_ix, 0u);
         // Update bounding box using atomics only. Computing a monoid is a
         // potential future optimization.
-        if bbox.z > bbox.x && bbox.w > bbox.y {
+        if bbox.z > bbox.x || bbox.w > bbox.y {
             atomicMin(&(*out).x0, round_down(bbox.x));
             atomicMin(&(*out).y0, round_down(bbox.y));
             atomicMax(&(*out).x1, round_up(bbox.z));
             atomicMax(&(*out).y1, round_up(bbox.w));
         }
     }
-
 }
diff --git a/piet-wgsl/shader/shared/bump.wgsl b/piet-wgsl/shader/shared/bump.wgsl
index 7f33ba2..b33ffaa 100644
--- a/piet-wgsl/shader/shared/bump.wgsl
+++ b/piet-wgsl/shader/shared/bump.wgsl
@@ -19,4 +19,5 @@ struct BumpAllocators {
     binning: atomic<u32>,
     ptcl: atomic<u32>,
     tile: atomic<u32>,
+    segments: atomic<u32>,
 }
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index bf0839d..b43f35b 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -28,6 +28,8 @@ struct Config {
 
     drawtag_base: u32,
     drawdata_base: u32,
+
+    transform_base: u32,
 }
 
 // Geometry of tiles and bins
diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs
index ba0d034..355ea33 100644
--- a/piet-wgsl/src/main.rs
+++ b/piet-wgsl/src/main.rs
@@ -22,7 +22,7 @@ use engine::Engine;
 
 use render::render;
 use test_scene::dump_scene_info;
-use wgpu::{Device, Queue, Limits};
+use wgpu::{Device, Limits, Queue};
 
 mod engine;
 mod render;
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 3f9d184..d019886 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -5,10 +5,18 @@ use piet_scene::Scene;
 
 use crate::{
     engine::{BufProxy, Recording},
-    shaders::{self, Shaders},
+    shaders::{self, FullShaders, Shaders},
 };
 
 const TAG_MONOID_SIZE: u64 = 12;
+const TAG_MONOID_FULL_SIZE: u64 = 20;
+const PATH_BBOX_SIZE: u64 = 24;
+const CUBIC_SIZE: u64 = 40;
+const DRAWMONOID_SIZE: u64 = 16;
+const MAX_DRAWINFO_SIZE: u64 = 44;
+const PATH_SIZE: u64 = 8;
+const DRAW_BBOX_SIZE: u64 = 16;
+const BUMP_SIZE: u64 = 16;
 
 #[repr(C)]
 #[derive(Clone, Copy, Default, Zeroable, Pod)]
@@ -21,6 +29,7 @@ struct Config {
     pathdata_base: u32,
     drawtag_base: u32,
     drawdata_base: u32,
+    transform_base: u32,
 }
 
 #[repr(C)]
@@ -109,6 +118,168 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
     (recording, out_buf)
 }
 
+pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy) {
+    let mut recording = Recording::default();
+    let data = scene.data();
+    let n_pathtag = data.tag_stream.len();
+    let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
+    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
+    let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
+    let pathtag_base = size_to_words(scene.len());
+    scene.extend(&data.tag_stream);
+    scene.resize(pathtag_padded, 0);
+    let pathdata_base = size_to_words(scene.len());
+    scene.extend(&data.pathseg_stream);
+    let drawtag_base = size_to_words(scene.len());
+    scene.extend(bytemuck::cast_slice(&data.drawtag_stream));
+    let drawdata_base = size_to_words(scene.len());
+    scene.extend(&data.drawdata_stream);
+    let transform_base = size_to_words(scene.len());
+    scene.extend(bytemuck::cast_slice(&data.transform_stream));
+
+    let n_path = data.n_path;
+    // TODO: calculate for real when we do rectangles
+    let n_drawobj = n_path;
+    let config = Config {
+        width_in_tiles: 64,
+        height_in_tiles: 64,
+        n_drawobj,
+        n_path,
+        pathtag_base,
+        pathdata_base,
+        drawtag_base,
+        drawdata_base,
+        transform_base,
+    };
+    let scene_buf = recording.upload(scene);
+    let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
+
+    let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
+    // TODO: really only need pathtag_wgs - 1
+    recording.dispatch(
+        shaders.pathtag_reduce,
+        (pathtag_wgs as u32, 1, 1),
+        [config_buf, scene_buf, reduced_buf],
+    );
+
+    let tagmonoid_buf =
+        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE);
+    recording.dispatch(
+        shaders.pathtag_scan,
+        (pathtag_wgs as u32, 1, 1),
+        [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
+    );
+    let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
+    let path_bbox_buf = BufProxy::new(n_path as u64 * PATH_BBOX_SIZE);
+    recording.dispatch(
+        shaders.bbox_clear,
+        (drawobj_wgs, 1, 1),
+        [config_buf, path_bbox_buf],
+    );
+    let cubic_buf = BufProxy::new(n_path as u64 * CUBIC_SIZE);
+    let path_coarse_wgs = (data.n_pathseg + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
+    recording.dispatch(
+        shaders.pathseg,
+        (path_coarse_wgs, 1, 1),
+        [
+            config_buf,
+            scene_buf,
+            tagmonoid_buf,
+            path_bbox_buf,
+            cubic_buf,
+        ],
+    );
+    let draw_reduced_buf = BufProxy::new(drawobj_wgs as u64 * DRAWMONOID_SIZE);
+    recording.dispatch(
+        shaders.draw_reduce,
+        (drawobj_wgs, 1, 1),
+        [config_buf, scene_buf, draw_reduced_buf],
+    );
+    let draw_monoid_buf = BufProxy::new(n_drawobj as u64 * DRAWMONOID_SIZE);
+    let info_buf = BufProxy::new(n_drawobj as u64 * MAX_DRAWINFO_SIZE);
+    recording.dispatch(
+        shaders.draw_leaf,
+        (drawobj_wgs, 1, 1),
+        [
+            config_buf,
+            scene_buf,
+            draw_reduced_buf,
+            path_bbox_buf,
+            draw_monoid_buf,
+            info_buf,
+        ],
+    );
+    let draw_bbox_buf = BufProxy::new(n_path as u64 * DRAW_BBOX_SIZE);
+    let bump_buf = BufProxy::new(BUMP_SIZE);
+    // Not actually used yet.
+    let clip_bbox_buf = BufProxy::new(1024);
+    let bin_data_buf = BufProxy::new(1 << 16);
+    recording.clear_all(bump_buf);
+    recording.dispatch(
+        shaders.binning,
+        (drawobj_wgs, 1, 1),
+        [
+            config_buf,
+            draw_monoid_buf,
+            path_bbox_buf,
+            clip_bbox_buf,
+            draw_bbox_buf,
+            bump_buf,
+            bin_data_buf,
+        ],
+    );
+    let path_buf = BufProxy::new(n_path as u64 * PATH_SIZE);
+    let tile_buf = BufProxy::new(1 << 20);
+    let path_wgs = (n_path + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
+    recording.dispatch(
+        shaders.tile_alloc,
+        (path_wgs, 1, 1),
+        [
+            config_buf,
+            scene_buf,
+            draw_bbox_buf,
+            bump_buf,
+            path_buf,
+            tile_buf,
+        ],
+    );
+
+    //let cubics_buf = BufProxy::new(data.n_pathseg as u64 * 32);
+    // TODO: more principled size calc
+    let tiles_buf = BufProxy::new(4097 * 8);
+    let segments_buf = BufProxy::new(256 * 24);
+    recording.clear_all(tiles_buf);
+    recording.dispatch(
+        shaders.path_coarse,
+        (path_coarse_wgs, 1, 1),
+        [
+            config_buf,
+            scene_buf,
+            tagmonoid_buf,
+            cubic_buf,
+            path_buf,
+            bump_buf,
+            tiles_buf,
+            segments_buf,
+        ],
+    );
+    recording.dispatch(
+        shaders.backdrop,
+        (path_wgs, 1, 1),
+        [config_buf, path_buf, tiles_buf],
+    );
+    let out_buf_size = config.width_in_tiles * config.height_in_tiles * 256;
+    let out_buf = BufProxy::new(out_buf_size as u64);
+    recording.dispatch(
+        shaders.fine,
+        (config.width_in_tiles, config.height_in_tiles, 1),
+        [config_buf, tiles_buf, segments_buf, out_buf],
+    );
+
+    recording.download(out_buf);
+    (recording, out_buf)
+}
+
 pub fn align_up(len: usize, alignment: u32) -> usize {
     len + (len.wrapping_neg() & alignment as usize - 1)
 }
diff --git a/piet-wgsl/src/shaders.rs b/piet-wgsl/src/shaders.rs
index 6a7e9fb..939c58e 100644
--- a/piet-wgsl/src/shaders.rs
+++ b/piet-wgsl/src/shaders.rs
@@ -25,7 +25,9 @@ use wgpu::Device;
 use crate::engine::{BindType, Engine, Error, ShaderId};
 
 pub const PATHTAG_REDUCE_WG: u32 = 256;
+pub const PATH_BBOX_WG: u32 = 256;
 pub const PATH_COARSE_WG: u32 = 256;
+pub const PATH_DRAWOBJ_WG: u32 = 256;
 
 pub struct Shaders {
     pub pathtag_reduce: ShaderId,
@@ -43,8 +45,8 @@ pub struct FullShaders {
     pub pathseg: ShaderId,
     pub draw_reduce: ShaderId,
     pub draw_leaf: ShaderId,
-    pub tile_alloc: ShaderId,
     pub binning: ShaderId,
+    pub tile_alloc: ShaderId,
     pub path_coarse: ShaderId,
     pub backdrop: ShaderId,
     pub coarse: ShaderId,
@@ -60,7 +62,11 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
     let pathtag_reduce = engine.add_shader(
         device,
         preprocess::preprocess(&read_shader("pathtag_reduce"), &empty, &imports).into(),
-        &[BindType::BufReadOnly, BindType::BufReadOnly, BindType::Buffer],
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+        ],
     )?;
     let pathtag_scan = engine.add_shader(
         device,
@@ -121,7 +127,11 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
     let pathtag_reduce = engine.add_shader(
         device,
         preprocess::preprocess(&read_shader("pathtag_reduce"), &full_config, &imports).into(),
-        &[BindType::BufReadOnly, BindType::BufReadOnly, BindType::Buffer],
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+        ],
     )?;
     let pathtag_scan = engine.add_shader(
         device,
@@ -136,10 +146,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
     let bbox_clear = engine.add_shader(
         device,
         preprocess::preprocess(&read_shader("bbox_clear"), &empty, &imports).into(),
-        &[
-            BindType::BufReadOnly,
-            BindType::Buffer,
-        ],
+        &[BindType::BufReadOnly, BindType::Buffer],
     )?;
     let pathseg = engine.add_shader(
         device,
@@ -173,18 +180,6 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
             BindType::Buffer,
         ],
     )?;
-    let tile_alloc = engine.add_shader(
-        device,
-        preprocess::preprocess(&read_shader("tile_alloc"), &empty, &imports).into(),
-        &[
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::BufReadOnly,
-            BindType::Buffer,
-            BindType::Buffer,
-            BindType::Buffer,
-        ],
-    )?;
     let binning = engine.add_shader(
         device,
         preprocess::preprocess(&read_shader("binning"), &empty, &imports).into(),
@@ -199,6 +194,18 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
             BindType::Buffer,
         ],
     )?;
+    let tile_alloc = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("tile_alloc"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
 
     let path_coarse = engine.add_shader(
         device,
@@ -211,6 +218,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
             BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
+            BindType::Buffer,
         ],
     )?;
     let backdrop = engine.add_shader(
@@ -251,8 +259,8 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
         pathseg,
         draw_reduce,
         draw_leaf,
-        tile_alloc,
         binning,
+        tile_alloc,
         path_coarse,
         backdrop,
         coarse,

From c3d81e0985a9593951e7ae16e4dcaa9c8b1d6a3f Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Thu, 3 Nov 2022 22:00:52 -0700
Subject: [PATCH 08/17] Mostly working path rendering

It draws multiple paths and applies affine transformations.

One problem: RGBA writing is byte-reversed and premultiplied.
---
 piet-wgsl/shader/backdrop_dyn.wgsl     | 21 +++-------
 piet-wgsl/shader/binning.wgsl          |  2 -
 piet-wgsl/shader/coarse.wgsl           | 13 +++---
 piet-wgsl/shader/draw_leaf.wgsl        | 14 +++----
 piet-wgsl/shader/fine.wgsl             |  4 +-
 piet-wgsl/shader/path_coarse_full.wgsl |  2 +-
 piet-wgsl/shader/pathseg.wgsl          | 11 +++---
 piet-wgsl/shader/shared/cubic.wgsl     | 25 ++++++++++++
 piet-wgsl/shader/shared/pathtag.wgsl   |  1 +
 piet-wgsl/shader/tile_alloc.wgsl       | 10 +++--
 piet-wgsl/src/engine.rs                |  1 +
 piet-wgsl/src/main.rs                  | 33 ++++++++++++----
 piet-wgsl/src/render.rs                | 55 ++++++++++++++++++--------
 piet-wgsl/src/shaders.rs               |  8 +++-
 piet-wgsl/src/test_scene.rs            | 12 ++++++
 15 files changed, 143 insertions(+), 69 deletions(-)
 create mode 100644 piet-wgsl/shader/shared/cubic.wgsl

diff --git a/piet-wgsl/shader/backdrop_dyn.wgsl b/piet-wgsl/shader/backdrop_dyn.wgsl
index c6c7c8d..ab094e6 100644
--- a/piet-wgsl/shader/backdrop_dyn.wgsl
+++ b/piet-wgsl/shader/backdrop_dyn.wgsl
@@ -17,20 +17,7 @@
 // Prefix sum for dynamically allocated backdrops
 
 #import config
-
-// TODO: dedup & put this in the right place
-struct Path {
-    // bounding box in pixels
-    bbox: vec4<u32>,
-    // offset (in u32's) to tile rectangle
-    tiles: u32,
-}
-
-// TODO: -> shared
-struct Tile {
-    backdrop: i32,
-    segments: u32,
-}
+#import tile
 
 @group(0) @binding(0)
 var<storage> config: Config;
@@ -45,6 +32,7 @@ let WG_SIZE = 256u;
 
 var<workgroup> sh_row_width: array<u32, WG_SIZE>;
 var<workgroup> sh_row_count: array<u32, WG_SIZE>;
+var<workgroup> sh_offset: array<u32, WG_SIZE>;
 
 @compute @workgroup_size(256)
 fn main(
@@ -58,8 +46,9 @@ fn main(
         let path = paths[drawobj_ix];
         sh_row_width[local_id.x] = path.bbox.z - path.bbox.x;
         row_count = path.bbox.w - path.bbox.y;
-        sh_row_count[local_id.x] = row_count;
+        sh_offset[local_id.x] = path.tiles;
     }
+    sh_row_count[local_id.x] = row_count;
 
     // Prefix sum of row counts
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
@@ -83,7 +72,7 @@ fn main(
         let width = sh_row_width[el_ix];
         if width > 0u {
             var seq_ix = row - select(0u, sh_row_count[el_ix - 1u], el_ix > 0u);
-            var tile_ix = seq_ix * width;
+            var tile_ix = sh_offset[el_ix] + seq_ix * width;
             var sum = tiles[tile_ix].backdrop;
             for (var x = 1u; x < width; x += 1u) {
                 tile_ix += 1u;
diff --git a/piet-wgsl/shader/binning.wgsl b/piet-wgsl/shader/binning.wgsl
index 3d5a053..6fd6284 100644
--- a/piet-wgsl/shader/binning.wgsl
+++ b/piet-wgsl/shader/binning.wgsl
@@ -36,8 +36,6 @@ var<storage> clip_bbox_buf: array<vec4<f32>>;
 @group(0) @binding(4)
 var<storage, read_write> intersected_bbox: array<vec4<f32>>;
 
-// TODO: put into shared include
-
 @group(0) @binding(5)
 var<storage, read_write> bump: BumpAllocators;
 
diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index 8c59ba6..7a2ecf9 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -41,13 +41,13 @@ struct BinHeader {
 var<storage> bin_headers: array<BinHeader>;
 
 @group(0) @binding(4)
-var<storage> paths: array<Path>;
+var<storage> bin_data: array<u32>;
 
 @group(0) @binding(5)
-var<storage> tiles: array<Tile>;
+var<storage> paths: array<Path>;
 
 @group(0) @binding(6)
-var<storage> bin_data: array<u32>;
+var<storage> tiles: array<Tile>;
 
 @group(0) @binding(7)
 var<storage, read_write> bump: BumpAllocators;
@@ -109,7 +109,7 @@ fn write_path(tile: Tile, linewidth: f32) {
 
 fn write_color(color: CmdColor) {
     alloc_cmd(2u);
-    ptcl[cmd_offset] = CMD_FILL;
+    ptcl[cmd_offset] = CMD_COLOR;
     ptcl[cmd_offset + 1u] = color.rgba_color;
     cmd_offset += 2u;
 
@@ -117,7 +117,6 @@ fn write_color(color: CmdColor) {
 
 @compute @workgroup_size(256)
 fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
     @builtin(local_invocation_id) local_id: vec3<u32>,
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
@@ -130,7 +129,7 @@ fn main(
     let bin_tile_y = N_TILE_Y * wg_id.y;
 
     let tile_x = local_id.x % N_TILE_X;
-    let tile_y = local_id.y % N_TILE_Y;
+    let tile_y = local_id.x / N_TILE_X;
     let this_tile_ix = (bin_tile_y + tile_y) * config.width_in_tiles + bin_tile_x + tile_x;
     cmd_offset = this_tile_ix * PTCL_INITIAL_ALLOC;
     cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM);
@@ -313,7 +312,7 @@ fn main(
         workgroupBarrier();
     }
     if bin_tile_x < config.width_in_tiles && bin_tile_y < config.height_in_tiles {
-        ptcl[cmd_offset] = CMD_END;
+        //ptcl[cmd_offset] = CMD_END;
         // TODO: blend stack allocation
     }
 }
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index bce04f8..14a1163 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -49,11 +49,11 @@ struct Transform {
 fn read_transform(transform_base: u32, ix: u32) -> Transform {
     let base = transform_base + ix * 6u;
     let c0 = bitcast<f32>(scene[base]);
-    let c1 = bitcast<f32>(scene[base] + 1u);
-    let c2 = bitcast<f32>(scene[base] + 2u);
-    let c3 = bitcast<f32>(scene[base] + 3u);
-    let c4 = bitcast<f32>(scene[base] + 4u);
-    let c5 = bitcast<f32>(scene[base] + 5u);
+    let c1 = bitcast<f32>(scene[base + 1u]);
+    let c2 = bitcast<f32>(scene[base + 2u]);
+    let c3 = bitcast<f32>(scene[base + 3u]);
+    let c4 = bitcast<f32>(scene[base + 4u]);
+    let c5 = bitcast<f32>(scene[base + 5u]);
     let matrx = vec4<f32>(c0, c1, c2, c3);
     let translate = vec2<f32>(c4, c5);
     return Transform(matrx, translate);
@@ -73,8 +73,8 @@ fn main(
     sh_scratch[local_id.x] = agg;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
         workgroupBarrier();
-        if local_id.x + (1u << i) < WG_SIZE {
-            let other = sh_scratch[local_id.x + (1u << i)];
+        if local_id.x >= 1u << i {
+            let other = sh_scratch[local_id.x - (1u << i)];
             agg = combine_draw_monoid(agg, other);
         }
         workgroupBarrier();
diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index 751e408..4431848 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -113,7 +113,6 @@ fn main(
 ) {
     let tile_ix = wg_id.y * config.width_in_tiles + wg_id.x;
     let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
-    let tile = tiles[tile_ix];
 #ifdef full
     var rgba: array<vec4<f32>, PIXELS_PER_THREAD>;
     var area: array<f32, PIXELS_PER_THREAD>;
@@ -148,7 +147,7 @@ fn main(
                     let fg_i = fg * area[i];
                     rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
                 }
-                cmd_ix += 1u;
+                cmd_ix += 2u;
             }
             // CMD_JUMP
             case 11u: {
@@ -163,6 +162,7 @@ fn main(
         output[out_ix + i] = bytes;
     }
 #else
+    let tile = tiles[tile_ix];
     let area = fill_path(tile, xy);
 
     let bytes = pack4x8unorm(vec4<f32>(area[0], area[1], area[2], area[3]));
diff --git a/piet-wgsl/shader/path_coarse_full.wgsl b/piet-wgsl/shader/path_coarse_full.wgsl
index 6b6e9b8..fa3609e 100644
--- a/piet-wgsl/shader/path_coarse_full.wgsl
+++ b/piet-wgsl/shader/path_coarse_full.wgsl
@@ -214,7 +214,7 @@ fn main(
                 for (var y = y0; y < y1; y += 1) {
                     let tile_y0 = f32(y) * f32(TILE_HEIGHT);
                     let xbackdrop = max(xray + 1, 0);
-                    if xymin.y < tile_y0 && xbackdrop < i32(config.width_in_tiles) {
+                    if xymin.y < tile_y0 && xbackdrop < bbox.z {
                         let backdrop = select(-1, 1, dp.y < 0.0);
                         let tile_ix = base + xbackdrop;
                         atomicAdd(&tiles[tile_ix].backdrop, backdrop);
diff --git a/piet-wgsl/shader/pathseg.wgsl b/piet-wgsl/shader/pathseg.wgsl
index 1326fce..d2c6a6a 100644
--- a/piet-wgsl/shader/pathseg.wgsl
+++ b/piet-wgsl/shader/pathseg.wgsl
@@ -107,11 +107,11 @@ struct Transform {
 fn read_transform(transform_base: u32, ix: u32) -> Transform {
     let base = transform_base + ix * 6u;
     let c0 = bitcast<f32>(scene[base]);
-    let c1 = bitcast<f32>(scene[base] + 1u);
-    let c2 = bitcast<f32>(scene[base] + 2u);
-    let c3 = bitcast<f32>(scene[base] + 3u);
-    let c4 = bitcast<f32>(scene[base] + 4u);
-    let c5 = bitcast<f32>(scene[base] + 5u);
+    let c1 = bitcast<f32>(scene[base + 1u]);
+    let c2 = bitcast<f32>(scene[base + 2u]);
+    let c3 = bitcast<f32>(scene[base + 3u]);
+    let c4 = bitcast<f32>(scene[base + 4u]);
+    let c5 = bitcast<f32>(scene[base + 5u]);
     let matrx = vec4<f32>(c0, c1, c2, c3);
     let translate = vec2<f32>(c4, c5);
     return Transform(matrx, translate);
@@ -174,6 +174,7 @@ fn main(
             }
         }
         let transform = read_transform(config.transform_base, tm.trans_ix);
+        //let transform = Transform(vec4<f32>(1.0, 0.0, 0.0, 1.0), vec2<f32>());
         p0 = transform_apply(transform, p0);
         p1 = transform_apply(transform, p1);
         var bbox = vec4<f32>(min(p0, p1), max(p0, p1));
diff --git a/piet-wgsl/shader/shared/cubic.wgsl b/piet-wgsl/shader/shared/cubic.wgsl
new file mode 100644
index 0000000..ffb85b4
--- /dev/null
+++ b/piet-wgsl/shader/shared/cubic.wgsl
@@ -0,0 +1,25 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+struct Cubic {
+    p0: vec2<f32>,
+    p1: vec2<f32>,
+    p2: vec2<f32>,
+    p3: vec2<f32>,
+    path_ix: u32,
+    // Needed?
+    padding: u32,
+}
diff --git a/piet-wgsl/shader/shared/pathtag.wgsl b/piet-wgsl/shader/shared/pathtag.wgsl
index b248e18..fed16fd 100644
--- a/piet-wgsl/shader/shared/pathtag.wgsl
+++ b/piet-wgsl/shader/shared/pathtag.wgsl
@@ -16,6 +16,7 @@
 
 struct TagMonoid {
     trans_ix: u32,
+    // TODO: I don't think pathseg_ix is used.
     pathseg_ix: u32,
     pathseg_offset: u32,
 #ifdef full
diff --git a/piet-wgsl/shader/tile_alloc.wgsl b/piet-wgsl/shader/tile_alloc.wgsl
index 0bd550e..1c27c83 100644
--- a/piet-wgsl/shader/tile_alloc.wgsl
+++ b/piet-wgsl/shader/tile_alloc.wgsl
@@ -79,14 +79,17 @@ fn main(
     sh_tile_count[local_id.x] = tile_count;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
         workgroupBarrier();
-        if local_id.x < (1u << i) {
+        if local_id.x >= (1u << i) {
             total_tile_count += sh_tile_count[local_id.x - (1u << i)];
         }
         workgroupBarrier();
         sh_tile_count[local_id.x] = total_tile_count;
     }
-    if local_id.x == WG_SIZE - 1u {
-        sh_tile_offset = atomicAdd(&bump.tile, total_tile_count);
+    workgroupBarrier();
+    // should be able to avoid a barrier by adding total_tile count from
+    // thread WG_SIZE - 1, but it doesn't work
+    if local_id.x == 0u {
+        sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
     }
     workgroupBarrier();
     let tile_offset = sh_tile_offset;
@@ -94,6 +97,7 @@ fn main(
         let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
         let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
         let path = Path(bbox, tile_offset + tile_subix);
+        paths[drawobj_ix] = path;
     }
 
     // zero allocated memory
diff --git a/piet-wgsl/src/engine.rs b/piet-wgsl/src/engine.rs
index 2be08db..7d1c854 100644
--- a/piet-wgsl/src/engine.rs
+++ b/piet-wgsl/src/engine.rs
@@ -183,6 +183,7 @@ impl Engine {
                     bind_map.insert_buf(buf_proxy.id, buf);
                 }
                 Command::Dispatch(shader_id, wg_size, bindings) => {
+                    println!("dispatching {:?} with {} bindings", wg_size, bindings.len());
                     let shader = &self.shaders[shader_id.0];
                     let bind_group =
                         bind_map.create_bind_group(device, &shader.bind_group_layout, bindings)?;
diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs
index 355ea33..d8d53ad 100644
--- a/piet-wgsl/src/main.rs
+++ b/piet-wgsl/src/main.rs
@@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter};
 
 use engine::Engine;
 
-use render::render;
 use test_scene::dump_scene_info;
 use wgpu::{Device, Limits, Queue};
 
@@ -51,6 +50,20 @@ async fn run() -> Result<(), Box<dyn std::error::Error>> {
     Ok(())
 }
 
+fn dump_buf(buf: &[u32]) {
+    for (i, val) in buf.iter().enumerate() {
+        if *val != 0 {
+            let lo = val & 0x7fff_ffff;
+            if lo >= 0x3000_0000 && lo < 0x5000_0000 {
+                println!("{}: {:x} {}", i, val, f32::from_bits(*val));
+            } else {
+                println!("{}: {:x}", i, val);
+
+            }
+        }
+    }
+}
+
 async fn do_render(
     device: &Device,
     queue: &Queue,
@@ -60,17 +73,23 @@ async fn do_render(
     let full_shaders = shaders::full_shaders(device, engine)?;
     let scene = test_scene::gen_test_scene();
     dump_scene_info(&scene);
-    let (recording, buf) = render(&scene, &shaders);
+    //let (recording, buf) = render::render(&scene, &shaders);
+    let (recording, buf) = render::render_full(&scene, &full_shaders);
     let downloads = engine.run_recording(&device, &queue, &recording)?;
     let mapped = downloads.map();
     device.poll(wgpu::Maintain::Wait);
     let buf = mapped.get_mapped(buf).await?;
 
-    let file = File::create("image.png")?;
-    let w = BufWriter::new(file);
-    let encoder = png::Encoder::new(w, 1024, 1024);
-    let mut writer = encoder.write_header()?;
-    writer.write_image_data(&buf)?;
+    if false {
+        dump_buf(bytemuck::cast_slice(&buf));
+    } else {
+        let file = File::create("image.png")?;
+        let w = BufWriter::new(file);
+        let mut encoder = png::Encoder::new(w, 1024, 1024);
+        encoder.set_color(png::ColorType::Rgba);
+        let mut writer = encoder.write_header()?;
+        writer.write_image_data(&buf)?;
+    }
     Ok(())
 }
 
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index d019886..9202ef0 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -14,12 +14,13 @@ const PATH_BBOX_SIZE: u64 = 24;
 const CUBIC_SIZE: u64 = 40;
 const DRAWMONOID_SIZE: u64 = 16;
 const MAX_DRAWINFO_SIZE: u64 = 44;
-const PATH_SIZE: u64 = 8;
+const PATH_SIZE: u64 = 32;
 const DRAW_BBOX_SIZE: u64 = 16;
 const BUMP_SIZE: u64 = 16;
+const BIN_HEADER_SIZE: u64 = 8;
 
 #[repr(C)]
-#[derive(Clone, Copy, Default, Zeroable, Pod)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 struct Config {
     width_in_tiles: u32,
     height_in_tiles: u32,
@@ -84,8 +85,8 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
         [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
     );
 
-    let path_coarse_wgs = (data.n_pathseg + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
-    //let cubics_buf = BufProxy::new(data.n_pathseg as u64 * 32);
+    let n_pathtag = data.pathseg_stream.len();
+    let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     // TODO: more principled size calc
     let tiles_buf = BufProxy::new(4097 * 8);
     let segments_buf = BufProxy::new(256 * 24);
@@ -151,6 +152,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         drawdata_base,
         transform_base,
     };
+    println!("{:?}", config);
     let scene_buf = recording.upload(scene);
     let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
 
@@ -176,8 +178,9 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         (drawobj_wgs, 1, 1),
         [config_buf, path_bbox_buf],
     );
-    let cubic_buf = BufProxy::new(n_path as u64 * CUBIC_SIZE);
-    let path_coarse_wgs = (data.n_pathseg + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
+    let n_pathtag = data.pathseg_stream.len();
+    let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
+    let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     recording.dispatch(
         shaders.pathseg,
         (path_coarse_wgs, 1, 1),
@@ -214,6 +217,10 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     // Not actually used yet.
     let clip_bbox_buf = BufProxy::new(1024);
     let bin_data_buf = BufProxy::new(1 << 16);
+    let width_in_bins = (config.width_in_tiles + 15) / 16;
+    let height_in_bins = (config.height_in_tiles + 15) / 16;
+    let n_bins = width_in_bins * height_in_bins;
+    let bin_header_buf = BufProxy::new((n_bins * drawobj_wgs) as u64 * BIN_HEADER_SIZE);
     recording.clear_all(bump_buf);
     recording.dispatch(
         shaders.binning,
@@ -226,6 +233,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
             draw_bbox_buf,
             bump_buf,
             bin_data_buf,
+            bin_header_buf,
         ],
     );
     let path_buf = BufProxy::new(n_path as u64 * PATH_SIZE);
@@ -244,11 +252,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         ],
     );
 
-    //let cubics_buf = BufProxy::new(data.n_pathseg as u64 * 32);
-    // TODO: more principled size calc
-    let tiles_buf = BufProxy::new(4097 * 8);
-    let segments_buf = BufProxy::new(256 * 24);
-    recording.clear_all(tiles_buf);
+    let segments_buf = BufProxy::new(1 << 20);
     recording.dispatch(
         shaders.path_coarse,
         (path_coarse_wgs, 1, 1),
@@ -259,25 +263,42 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
             cubic_buf,
             path_buf,
             bump_buf,
-            tiles_buf,
+            tile_buf,
             segments_buf,
         ],
     );
     recording.dispatch(
         shaders.backdrop,
         (path_wgs, 1, 1),
-        [config_buf, path_buf, tiles_buf],
+        [config_buf, path_buf, tile_buf],
     );
-    let out_buf_size = config.width_in_tiles * config.height_in_tiles * 256;
+    let ptcl_buf = BufProxy::new(1 << 20);
+    recording.dispatch(
+        shaders.coarse,
+        (width_in_bins, height_in_bins, 1),
+        [
+            config_buf,
+            scene_buf,
+            draw_monoid_buf,
+            bin_header_buf,
+            bin_data_buf,
+            path_buf,
+            tile_buf,
+            bump_buf,
+            ptcl_buf,
+        ],
+    );
+    let out_buf_size = config.width_in_tiles * config.height_in_tiles * 1024;
     let out_buf = BufProxy::new(out_buf_size as u64);
     recording.dispatch(
         shaders.fine,
         (config.width_in_tiles, config.height_in_tiles, 1),
-        [config_buf, tiles_buf, segments_buf, out_buf],
+        [config_buf, tile_buf, segments_buf, out_buf, ptcl_buf],
     );
 
-    recording.download(out_buf);
-    (recording, out_buf)
+    let download_buf = out_buf;
+    recording.download(download_buf);
+    (recording, download_buf)
 }
 
 pub fn align_up(len: usize, alignment: u32) -> usize {
diff --git a/piet-wgsl/src/shaders.rs b/piet-wgsl/src/shaders.rs
index 939c58e..f918d96 100644
--- a/piet-wgsl/src/shaders.rs
+++ b/piet-wgsl/src/shaders.rs
@@ -223,8 +223,12 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
     )?;
     let backdrop = engine.add_shader(
         device,
-        preprocess::preprocess(&read_shader("backdrop"), &empty, &imports).into(),
-        &[BindType::BufReadOnly, BindType::Buffer],
+        preprocess::preprocess(&read_shader("backdrop_dyn"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+        ],
     )?;
     let coarse = engine.add_shader(
         device,
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index c844a26..1029c59 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -29,6 +29,18 @@ pub fn gen_test_scene() -> Scene {
     ];
     let brush = Brush::Solid(Color::rgb8(0x80, 0x80, 0x80));
     builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
+    let transform = Affine::translate(10.0, 200.0);
+    /*
+    let path = [
+        PathElement::MoveTo(Point::new(100.0, 300.0)),
+        PathElement::LineTo(Point::new(500.0, 320.0)),
+        PathElement::LineTo(Point::new(300.0, 350.0)),
+        PathElement::LineTo(Point::new(200.0, 460.0)),
+        PathElement::LineTo(Point::new(150.0, 410.0)),
+        PathElement::Close,
+    ];
+    */
+    builder.fill(Fill::NonZero, transform, &brush, None, &path);
     scene
 }
 

From 92d6b1188f4c7d7b688322e1e04ba0e082c78b15 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Fri, 4 Nov 2022 09:25:06 -0700
Subject: [PATCH 09/17] Fix color

Get rgba order right in rendering, plus generate separated alpha for png. The latter is just for debugging, we won't generally use separated alpha.
---
 piet-wgsl/shader/fine.wgsl  |  7 +++++--
 piet-wgsl/src/test_scene.rs | 15 +++------------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index 4431848..ad3772c 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -142,7 +142,7 @@ fn main(
             // CMD_COLOR
             case 5u: {
                 let color = read_color(cmd_ix);
-                let fg = unpack4x8unorm(color.rgba_color);
+                let fg = unpack4x8unorm(color.rgba_color).wzyx;
                 for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
                     let fg_i = fg * area[i];
                     rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
@@ -158,7 +158,10 @@ fn main(
     }
     let out_ix = global_id.y * (config.width_in_tiles * TILE_WIDTH) + global_id.x * PIXELS_PER_THREAD;
     for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
-        let bytes = pack4x8unorm(rgba[i]);
+        let fg = rgba[i];
+        let a_inv = 1.0 / (fg.a + 1e-6);
+        let rgba_sep = vec4<f32>(fg.r * a_inv, fg.g * a_inv, fg.b * a_inv, fg.a);
+        let bytes = pack4x8unorm(rgba_sep);
         output[out_ix + i] = bytes;
     }
 #else
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index 1029c59..c0c9390 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -27,19 +27,10 @@ pub fn gen_test_scene() -> Scene {
         PathElement::LineTo(Point::new(150.0, 210.0)),
         PathElement::Close,
     ];
-    let brush = Brush::Solid(Color::rgb8(0x80, 0x80, 0x80));
+    let brush = Brush::Solid(Color::rgb8(0x40, 0x40, 0xff));
     builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
-    let transform = Affine::translate(10.0, 200.0);
-    /*
-    let path = [
-        PathElement::MoveTo(Point::new(100.0, 300.0)),
-        PathElement::LineTo(Point::new(500.0, 320.0)),
-        PathElement::LineTo(Point::new(300.0, 350.0)),
-        PathElement::LineTo(Point::new(200.0, 460.0)),
-        PathElement::LineTo(Point::new(150.0, 410.0)),
-        PathElement::Close,
-    ];
-    */
+    let transform = Affine::translate(50.0, 50.0);
+    let brush = Brush::Solid(Color::rgba8(0xff, 0xff, 0x00, 0x80));
     builder.fill(Fill::NonZero, transform, &brush, None, &path);
     scene
 }

From 7ae5aa7491c480ddd31a9e909c908a67f992e9f3 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Fri, 4 Nov 2022 12:40:54 -0700
Subject: [PATCH 10/17] Mostly working strokes

The fat line in coarse path rendering is not done, but when lines are thin that mostly looks ok. Onward to tiger!
---
 piet-wgsl/shader/coarse.wgsl        | 34 +++++++++++++++++---------
 piet-wgsl/shader/fine.wgsl          | 38 +++++++++++++++++++++++++++++
 piet-wgsl/shader/pathseg.wgsl       | 11 ++++++++-
 piet-wgsl/shader/shared/config.wgsl |  1 +
 piet-wgsl/shader/shared/ptcl.wgsl   |  6 +++++
 piet-wgsl/src/render.rs             |  5 ++++
 piet-wgsl/src/shaders.rs            |  1 +
 piet-wgsl/src/test_scene.rs         | 15 +++++++++++-
 8 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index 7a2ecf9..642ce73 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -50,9 +50,12 @@ var<storage> paths: array<Path>;
 var<storage> tiles: array<Tile>;
 
 @group(0) @binding(7)
-var<storage, read_write> bump: BumpAllocators;
+var<storage> info: array<u32>;
 
 @group(0) @binding(8)
+var<storage, read_write> bump: BumpAllocators;
+
+@group(0) @binding(9)
 var<storage, read_write> ptcl: array<u32>;
 
 
@@ -95,15 +98,23 @@ fn write_path(tile: Tile, linewidth: f32) {
     // TODO: take flags
     // TODO: handle stroke
     alloc_cmd(3u);
-    if tile.segments != 0u {
-        let fill = CmdFill(tile.segments, tile.backdrop);
-        ptcl[cmd_offset] = CMD_FILL;
-        ptcl[cmd_offset + 1u] = fill.tile;
-        ptcl[cmd_offset + 2u] = u32(fill.backdrop);
-        cmd_offset += 3u;
+    if linewidth < 0.0 {
+        if tile.segments != 0u {
+            let fill = CmdFill(tile.segments, tile.backdrop);
+            ptcl[cmd_offset] = CMD_FILL;
+            ptcl[cmd_offset + 1u] = fill.tile;
+            ptcl[cmd_offset + 2u] = u32(fill.backdrop);
+            cmd_offset += 3u;
+        } else {
+            ptcl[cmd_offset] = CMD_SOLID;
+            cmd_offset += 1u;
+        }
     } else {
-        ptcl[cmd_offset] = CMD_SOLID;
-        cmd_offset += 1u;
+        let stroke = CmdStroke(tile.segments, 0.5 * linewidth);
+        ptcl[cmd_offset] = CMD_STROKE;
+        ptcl[cmd_offset + 1u] = stroke.tile;
+        ptcl[cmd_offset + 2u] = bitcast<u32>(stroke.half_width);
+        cmd_offset += 3u;
     }
 }
 
@@ -287,15 +298,14 @@ fn main(
             let drawtag = scene[config.drawtag_base + drawobj_ix];
             let dm = draw_monoids[drawobj_ix];
             let dd = config.drawdata_base + dm.scene_offset;
-            // TODO: set up draw info from monoid
+            let di = dm.info_offset;
             if clip_zero_depth == 0u {
                 let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
                 let tile = tiles[tile_ix];
                 switch drawtag {
                     // DRAWTAG_FILL_COLOR
                     case 0x44u: {
-                        // TODO: get linewidth from draw object
-                        let linewidth = -1.0;
+                        let linewidth = bitcast<f32>(info[di]);
                         let rgba_color = scene[dd];
                         write_path(tile, linewidth);
                         write_color(CmdColor(rgba_color));
diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index ad3772c..c347630 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -51,6 +51,12 @@ fn read_fill(cmd_ix: u32) -> CmdFill {
     return CmdFill(tile, backdrop);
 }
 
+fn read_stroke(cmd_ix: u32) -> CmdStroke {
+    let tile = ptcl[cmd_ix + 1u];
+    let half_width = bitcast<f32>(ptcl[cmd_ix + 2u]);
+    return CmdStroke(tile, half_width);
+}
+
 fn read_color(cmd_ix: u32) -> CmdColor {
     let rgba_color = ptcl[cmd_ix + 1u];
     return CmdColor(rgba_color);
@@ -105,6 +111,32 @@ fn fill_path(tile: Tile, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
     return area;
 }
 
+fn stroke_path(seg: u32, half_width: f32, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
+    var df: array<f32, PIXELS_PER_THREAD>;
+    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+        df[i] = 1e9;
+    }
+    var segment_ix = seg;
+    while segment_ix != 0u {
+        let segment = segments[segment_ix];
+        let delta = segment.delta;
+        let dpos0 = xy + vec2<f32>(0.5, 0.5) - segment.origin;
+        let scale = 1.0 / dot(delta, delta);
+        for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+            let dpos = vec2<f32>(dpos0.x + f32(i), dpos0.y);
+            let t = clamp(dot(dpos, delta) * scale, 0.0, 1.0);
+            // performance idea: hoist sqrt out of loop
+            df[i] = min(df[i], length(delta * t - dpos));
+        }
+        segment_ix = segment.next;
+    }
+    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+        // reuse array; return alpha rather than distance
+        df[i] = clamp(half_width + 0.5 - df[i], 0.0, 1.0);
+    }
+    return df;
+}
+
 @compute @workgroup_size(4, 16)
 fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
@@ -132,6 +164,12 @@ fn main(
                 area = fill_path(tile, xy);
                 cmd_ix += 3u;
             }
+            // CMD_STROKE
+            case 2u: {
+                let stroke = read_stroke(cmd_ix);
+                area = stroke_path(stroke.tile, stroke.half_width, xy);
+                cmd_ix += 3u;
+            }
             // CMD_SOLID
             case 3u: {
                 for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
diff --git a/piet-wgsl/shader/pathseg.wgsl b/piet-wgsl/shader/pathseg.wgsl
index d2c6a6a..b4f83e1 100644
--- a/piet-wgsl/shader/pathseg.wgsl
+++ b/piet-wgsl/shader/pathseg.wgsl
@@ -143,8 +143,10 @@ fn main(
     var tag_byte = (tag_word >> shift) & 0xffu;
 
     let out = &path_bboxes[tm.path_ix];
+    var linewidth: f32;
     if (tag_byte & PATH_TAG_PATH) != 0u {
-        (*out).linewidth = -1.0; // TODO: plumb linewidth
+        linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
+        (*out).linewidth = linewidth;
         (*out).trans_ix = tm.trans_ix;
     }
     // Decode path data
@@ -195,6 +197,13 @@ fn main(
                 p1 = mix(p1, p0, 1.0 / 3.0);
             }
         }
+        if linewidth >= 0.0 {
+            // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
+            // This is the correct bounding box, but we're not handling rendering
+            // in the isotropic case, so it may mismatch.
+            let stroke = 0.5 * linewidth * vec2<f32>(length(transform.matrx.xz), length(transform.matrx.yw));
+            bbox += vec4<f32>(-stroke, stroke);
+        }
         cubics[global_id.x] = Cubic(p0, p1, p2, p3, tm.path_ix, 0u);
         // Update bounding box using atomics only. Computing a monoid is a
         // potential future optimization.
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index b43f35b..73f4054 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -30,6 +30,7 @@ struct Config {
     drawdata_base: u32,
 
     transform_base: u32,
+    linewidth_base: u32,
 }
 
 // Geometry of tiles and bins
diff --git a/piet-wgsl/shader/shared/ptcl.wgsl b/piet-wgsl/shader/shared/ptcl.wgsl
index 7121f84..8d6e869 100644
--- a/piet-wgsl/shader/shared/ptcl.wgsl
+++ b/piet-wgsl/shader/shared/ptcl.wgsl
@@ -25,6 +25,7 @@ let PTCL_HEADROOM = 2u;
 // Tags for PTCL commands
 let CMD_END = 0u;
 let CMD_FILL = 1u;
+let CMD_STROKE = 2u;
 let CMD_SOLID = 3u;
 let CMD_COLOR = 5u;
 let CMD_JUMP = 11u;
@@ -37,6 +38,11 @@ struct CmdFill {
     backdrop: i32,
 }
 
+struct CmdStroke {
+    tile: u32,
+    half_width: f32,
+}
+
 struct CmdJump {
     new_ix: u32,
 }
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 9202ef0..04f4232 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -31,6 +31,7 @@ struct Config {
     drawtag_base: u32,
     drawdata_base: u32,
     transform_base: u32,
+    linewidth_base: u32,
 }
 
 #[repr(C)]
@@ -137,6 +138,8 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     scene.extend(&data.drawdata_stream);
     let transform_base = size_to_words(scene.len());
     scene.extend(bytemuck::cast_slice(&data.transform_stream));
+    let linewidth_base = size_to_words(scene.len());
+    scene.extend(bytemuck::cast_slice(&data.linewidth_stream));
 
     let n_path = data.n_path;
     // TODO: calculate for real when we do rectangles
@@ -151,6 +154,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         drawtag_base,
         drawdata_base,
         transform_base,
+        linewidth_base,
     };
     println!("{:?}", config);
     let scene_buf = recording.upload(scene);
@@ -284,6 +288,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
             bin_data_buf,
             path_buf,
             tile_buf,
+            info_buf,
             bump_buf,
             ptcl_buf,
         ],
diff --git a/piet-wgsl/src/shaders.rs b/piet-wgsl/src/shaders.rs
index f918d96..b659170 100644
--- a/piet-wgsl/src/shaders.rs
+++ b/piet-wgsl/src/shaders.rs
@@ -241,6 +241,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
             BindType::BufReadOnly,
+            BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
         ],
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index c0c9390..2d28659 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -14,7 +14,7 @@
 //
 // Also licensed under MIT license, at your choice.
 
-use piet_scene::{Affine, Brush, Color, Fill, PathElement, Point, Scene, SceneBuilder};
+use piet_scene::{Affine, Brush, Color, Fill, PathElement, Point, Scene, SceneBuilder, Stroke};
 
 pub fn gen_test_scene() -> Scene {
     let mut scene = Scene::default();
@@ -32,6 +32,19 @@ pub fn gen_test_scene() -> Scene {
     let transform = Affine::translate(50.0, 50.0);
     let brush = Brush::Solid(Color::rgba8(0xff, 0xff, 0x00, 0x80));
     builder.fill(Fill::NonZero, transform, &brush, None, &path);
+    let transform = Affine::translate(100.0, 100.0);
+    let style = Stroke {
+        width: 1.0,
+        join: piet_scene::Join::Round,
+        miter_limit: 1.4,
+        start_cap: piet_scene::Cap::Round,
+        end_cap: piet_scene::Cap::Round,
+        dash_pattern: [],
+        dash_offset: 0.0,
+        scale: true,
+    };
+    let brush = Brush::Solid(Color::rgb8(0xa0, 0x00, 0x00));
+    builder.stroke(&style, transform, &brush, None, &path);
     scene
 }
 

From 17a74fb370f782a89386e758727e50e2d1a0a176 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Fri, 4 Nov 2022 13:15:05 -0700
Subject: [PATCH 11/17] Almost rendering tiger

We cut'n'pasted the picosvg stuff, kinda ugly.

It renders a number of paths of the tiger. I think the gap might be in prefix sums.
---
 Cargo.lock                   |   2 +
 piet-wgsl/Cargo.toml         |   6 +-
 piet-wgsl/shader/coarse.wgsl |   5 +-
 piet-wgsl/src/main.rs        |   1 +
 piet-wgsl/src/pico_svg.rs    | 140 +++++++++++++++++++++++++++++++++++
 piet-wgsl/src/render.rs      |   6 +-
 piet-wgsl/src/test_scene.rs  | 102 ++++++++++++++++++-------
 7 files changed, 231 insertions(+), 31 deletions(-)
 create mode 100644 piet-wgsl/src/pico_svg.rs

diff --git a/Cargo.lock b/Cargo.lock
index f2669e2..d82ba57 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1174,10 +1174,12 @@ dependencies = [
  "bytemuck",
  "env_logger",
  "futures-intrusive",
+ "kurbo 0.8.3",
  "parking_lot",
  "piet-scene",
  "png",
  "pollster",
+ "roxmltree",
  "wgpu",
 ]
 
diff --git a/piet-wgsl/Cargo.toml b/piet-wgsl/Cargo.toml
index e7fa42d..b785eac 100644
--- a/piet-wgsl/Cargo.toml
+++ b/piet-wgsl/Cargo.toml
@@ -14,4 +14,8 @@ parking_lot = "0.12"
 bytemuck = { version = "1.12.1", features = ["derive"] }
 png = "0.17.6"
 
-piet-scene = { path = "../piet-scene" }
+piet-scene = { path = "../piet-scene", features = ["kurbo"] }
+
+# for picosvg, should be split out
+roxmltree = "0.13"
+kurbo = "0.8.3"
diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index 642ce73..5943e51 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -85,7 +85,10 @@ var<private> cmd_limit: u32;
 // Make sure there is space for a command of given size, plus a jump if needed
 fn alloc_cmd(size: u32) {
     if cmd_offset + size >= cmd_limit {
-        let new_cmd = atomicAdd(&bump.ptcl, PTCL_INCREMENT);
+        // We might be able to save a little bit of computation here
+        // by setting the initial value of the bump allocator.
+        let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC;
+        let new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
         // TODO: robust memory
         ptcl[cmd_offset] = CMD_JUMP;
         ptcl[cmd_offset + 1u] = new_cmd;
diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs
index d8d53ad..8b00a26 100644
--- a/piet-wgsl/src/main.rs
+++ b/piet-wgsl/src/main.rs
@@ -24,6 +24,7 @@ use test_scene::dump_scene_info;
 use wgpu::{Device, Limits, Queue};
 
 mod engine;
+mod pico_svg;
 mod render;
 mod shaders;
 mod test_scene;
diff --git a/piet-wgsl/src/pico_svg.rs b/piet-wgsl/src/pico_svg.rs
new file mode 100644
index 0000000..eebe3ec
--- /dev/null
+++ b/piet-wgsl/src/pico_svg.rs
@@ -0,0 +1,140 @@
+//! A loader for a tiny fragment of SVG
+
+use std::str::FromStr;
+
+use roxmltree::{Document, Node};
+
+use kurbo::{Affine, BezPath};
+
+use piet_scene::Color;
+
+pub struct PicoSvg {
+    pub items: Vec<Item>,
+}
+
+pub enum Item {
+    Fill(FillItem),
+    Stroke(StrokeItem),
+}
+
+pub struct StrokeItem {
+    pub width: f64,
+    pub color: Color,
+    pub path: BezPath,
+}
+
+pub struct FillItem {
+    pub color: Color,
+    pub path: BezPath,
+}
+
+struct Parser<'a> {
+    scale: f64,
+    items: &'a mut Vec<Item>,
+}
+
+impl PicoSvg {
+    pub fn load(xml_string: &str, scale: f64) -> Result<PicoSvg, Box<dyn std::error::Error>> {
+        let doc = Document::parse(xml_string)?;
+        let root = doc.root_element();
+        let mut items = Vec::new();
+        let mut parser = Parser::new(&mut items, scale);
+        for node in root.children() {
+            parser.rec_parse(node)?;
+        }
+        Ok(PicoSvg { items })
+    }
+}
+
+impl<'a> Parser<'a> {
+    fn new(items: &'a mut Vec<Item>, scale: f64) -> Parser<'a> {
+        Parser { scale, items }
+    }
+
+    fn rec_parse(&mut self, node: Node) -> Result<(), Box<dyn std::error::Error>> {
+        let transform = if self.scale >= 0.0 {
+            Affine::scale(self.scale)
+        } else {
+            Affine::new([-self.scale, 0.0, 0.0, self.scale, 0.0, 1536.0])
+        };
+        if node.is_element() {
+            match node.tag_name().name() {
+                "g" => {
+                    for child in node.children() {
+                        self.rec_parse(child)?;
+                    }
+                }
+                "path" => {
+                    let d = node.attribute("d").ok_or("missing 'd' attribute")?;
+                    let bp = BezPath::from_svg(d)?;
+                    let path = transform * bp;
+                    // TODO: default fill color is black, but this is overridden in tiger to this logic.
+                    if let Some(fill_color) = node.attribute("fill") {
+                        if fill_color != "none" {
+                            let color = parse_color(fill_color);
+                            let color = modify_opacity(color, "fill-opacity", node);
+                            self.items.push(Item::Fill(FillItem {
+                                color,
+                                path: path.clone(),
+                            }));
+                        }
+                    }
+                    if let Some(stroke_color) = node.attribute("stroke") {
+                        if stroke_color != "none" {
+                            let width = self.scale.abs()
+                                * f64::from_str(
+                                    node.attribute("stroke-width").ok_or("missing width")?,
+                                )?;
+                            let color = parse_color(stroke_color);
+                            let color = modify_opacity(color, "stroke-opacity", node);
+                            self.items
+                                .push(Item::Stroke(StrokeItem { width, color, path }));
+                        }
+                    }
+                }
+                _ => (),
+            }
+        }
+        Ok(())
+    }
+}
+
+fn parse_color(color: &str) -> Color {
+    if color.as_bytes()[0] == b'#' {
+        let mut hex = u32::from_str_radix(&color[1..], 16).unwrap();
+        if color.len() == 4 {
+            hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
+        }
+        let rgba = (hex << 8) + 0xff;
+        let (r, g, b, a) = (
+            (rgba >> 24 & 255) as u8,
+            ((rgba >> 16) & 255) as u8,
+            ((rgba >> 8) & 255) as u8,
+            (rgba & 255) as u8,
+        );
+        Color::rgba8(r, g, b, a)
+    } else if color.starts_with("rgb(") {
+        let mut iter = color[4..color.len() - 1].split(',');
+        let r = u8::from_str(iter.next().unwrap()).unwrap();
+        let g = u8::from_str(iter.next().unwrap()).unwrap();
+        let b = u8::from_str(iter.next().unwrap()).unwrap();
+        Color::rgb8(r, g, b)
+    } else {
+        Color::rgba8(255, 0, 255, 0x80)
+    }
+}
+
+fn modify_opacity(mut color: Color, attr_name: &str, node: Node) -> Color {
+    if let Some(opacity) = node.attribute(attr_name) {
+        let alpha = if opacity.ends_with("%") {
+            let pctg = opacity[..opacity.len() - 1].parse().unwrap_or(100.0);
+            pctg * 0.01
+        } else {
+            opacity.parse().unwrap_or(1.0)
+        } as f64;
+        color.a = (alpha.min(1.0).max(0.0) * 255.0).round() as u8;
+        color
+    } else {
+        color
+    }
+}
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 04f4232..99c068e 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -220,7 +220,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let bump_buf = BufProxy::new(BUMP_SIZE);
     // Not actually used yet.
     let clip_bbox_buf = BufProxy::new(1024);
-    let bin_data_buf = BufProxy::new(1 << 16);
+    let bin_data_buf = BufProxy::new(1 << 20);
     let width_in_bins = (config.width_in_tiles + 15) / 16;
     let height_in_bins = (config.height_in_tiles + 15) / 16;
     let n_bins = width_in_bins * height_in_bins;
@@ -256,7 +256,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         ],
     );
 
-    let segments_buf = BufProxy::new(1 << 20);
+    let segments_buf = BufProxy::new(1 << 24);
     recording.dispatch(
         shaders.path_coarse,
         (path_coarse_wgs, 1, 1),
@@ -276,7 +276,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         (path_wgs, 1, 1),
         [config_buf, path_buf, tile_buf],
     );
-    let ptcl_buf = BufProxy::new(1 << 20);
+    let ptcl_buf = BufProxy::new(1 << 24);
     recording.dispatch(
         shaders.coarse,
         (width_in_bins, height_in_bins, 1),
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index 2d28659..ab7d3e3 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -14,37 +14,37 @@
 //
 // Also licensed under MIT license, at your choice.
 
+use kurbo::BezPath;
 use piet_scene::{Affine, Brush, Color, Fill, PathElement, Point, Scene, SceneBuilder, Stroke};
 
+use crate::pico_svg::PicoSvg;
+
 pub fn gen_test_scene() -> Scene {
     let mut scene = Scene::default();
     let mut builder = SceneBuilder::for_scene(&mut scene);
-    let path = [
-        PathElement::MoveTo(Point::new(100.0, 100.0)),
-        PathElement::LineTo(Point::new(500.0, 120.0)),
-        PathElement::LineTo(Point::new(300.0, 150.0)),
-        PathElement::LineTo(Point::new(200.0, 260.0)),
-        PathElement::LineTo(Point::new(150.0, 210.0)),
-        PathElement::Close,
-    ];
-    let brush = Brush::Solid(Color::rgb8(0x40, 0x40, 0xff));
-    builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
-    let transform = Affine::translate(50.0, 50.0);
-    let brush = Brush::Solid(Color::rgba8(0xff, 0xff, 0x00, 0x80));
-    builder.fill(Fill::NonZero, transform, &brush, None, &path);
-    let transform = Affine::translate(100.0, 100.0);
-    let style = Stroke {
-        width: 1.0,
-        join: piet_scene::Join::Round,
-        miter_limit: 1.4,
-        start_cap: piet_scene::Cap::Round,
-        end_cap: piet_scene::Cap::Round,
-        dash_pattern: [],
-        dash_offset: 0.0,
-        scale: true,
-    };
-    let brush = Brush::Solid(Color::rgb8(0xa0, 0x00, 0x00));
-    builder.stroke(&style, transform, &brush, None, &path);
+    if false {
+        let path = [
+            PathElement::MoveTo(Point::new(100.0, 100.0)),
+            PathElement::LineTo(Point::new(500.0, 120.0)),
+            PathElement::LineTo(Point::new(300.0, 150.0)),
+            PathElement::LineTo(Point::new(200.0, 260.0)),
+            PathElement::LineTo(Point::new(150.0, 210.0)),
+            PathElement::Close,
+        ];
+        let brush = Brush::Solid(Color::rgb8(0x40, 0x40, 0xff));
+        builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
+        let transform = Affine::translate(50.0, 50.0);
+        let brush = Brush::Solid(Color::rgba8(0xff, 0xff, 0x00, 0x80));
+        builder.fill(Fill::NonZero, transform, &brush, None, &path);
+        let transform = Affine::translate(100.0, 100.0);
+        let style = simple_stroke(1.0);
+        let brush = Brush::Solid(Color::rgb8(0xa0, 0x00, 0x00));
+        builder.stroke(&style, transform, &brush, None, &path);
+    } else {
+        let xml_str = std::str::from_utf8(include_bytes!("../../piet-gpu/Ghostscript_Tiger.svg")).unwrap();
+        let svg = PicoSvg::load(xml_str, 6.0).unwrap();
+        render_svg(&mut builder, &svg, false);
+    }
     scene
 }
 
@@ -56,3 +56,53 @@ pub fn dump_scene_info(scene: &Scene) {
         bytemuck::cast_slice::<u8, f32>(&data.pathseg_stream)
     );
 }
+
+pub fn render_svg(sb: &mut SceneBuilder, svg: &PicoSvg, print_stats: bool) {
+    use crate::pico_svg::*;
+    let start = std::time::Instant::now();
+    for item in svg.items.iter() {
+        match item {
+            Item::Fill(fill) => {
+                sb.fill(
+                    Fill::NonZero,
+                    Affine::IDENTITY,
+                    &fill.color.into(),
+                    None,
+                    convert_bez_path(&fill.path),
+                );
+            }
+            Item::Stroke(stroke) => {
+                sb.stroke(
+                    &simple_stroke(stroke.width as f32),
+                    Affine::IDENTITY,
+                    &stroke.color.into(),
+                    None,
+                    convert_bez_path(&stroke.path),
+                );
+            }
+        }
+    }
+    if print_stats {
+        println!("flattening and encoding time: {:?}", start.elapsed());
+    }
+}
+
+fn convert_bez_path<'a>(path: &'a BezPath) -> impl Iterator<Item = PathElement> + 'a + Clone {
+    path.elements()
+        .iter()
+        .map(|el| PathElement::from_kurbo(*el))
+}
+
+
+fn simple_stroke(width: f32) -> Stroke<[f32; 0]> {
+    Stroke {
+        width,
+        join: piet_scene::Join::Round,
+        miter_limit: 1.4,
+        start_cap: piet_scene::Cap::Round,
+        end_cap: piet_scene::Cap::Round,
+        dash_pattern: [],
+        dash_offset: 0.0,
+        scale: true,
+    }
+}

From 494f523c4176ea288ae14afdacd00ce40e50ab68 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Fri, 4 Nov 2022 21:41:37 -0700
Subject: [PATCH 12/17] Tiger!

Still one flaw, fat lines aren't expanded with strokes in path coarse rasterization. But that's a small visual ding, and can be fixed

That said, there is some really strange stuff going on in tile_alloc. It's using storage to do a uniform broadcast (the result of bump allocation for the workgroup), which is not great at all. It should be using workgroup storage, but on my mac it behaves as if the workgroup barrier is not in place. Investigating.
---
 piet-wgsl/shader/coarse.wgsl       |  1 -
 piet-wgsl/shader/draw_leaf.wgsl    | 28 +++++++++++++++++++++-------
 piet-wgsl/shader/pathtag_scan.wgsl |  8 ++++----
 piet-wgsl/shader/tile_alloc.wgsl   | 14 +++++++-------
 piet-wgsl/src/main.rs              |  4 ++--
 piet-wgsl/src/render.rs            |  7 +++----
 piet-wgsl/src/test_scene.rs        |  1 +
 7 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index 5943e51..e1e88d7 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -99,7 +99,6 @@ fn alloc_cmd(size: u32) {
 
 fn write_path(tile: Tile, linewidth: f32) {
     // TODO: take flags
-    // TODO: handle stroke
     alloc_cmd(3u);
     if linewidth < 0.0 {
         if tile.segments != 0u {
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index 14a1163..5909fdd 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -68,8 +68,28 @@ fn main(
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     let ix = global_id.x;
+    // Reduce prefix of workgroups up to this one
+    var agg = draw_monoid_identity();
+    if local_id.x < wg_id.x {
+        agg = reduced[local_id.x];
+    }
+    sh_scratch[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_scratch[local_id.x + (1u << i)];
+            agg = combine_draw_monoid(agg, other);
+        }
+        workgroupBarrier();
+        sh_scratch[local_id.x] = agg;
+    }
+    // Two barriers can be eliminated if we use separate shared arrays
+    // for prefix and intra-workgroup prefix sum.
+    workgroupBarrier();
+    var m = sh_scratch[0];
+    workgroupBarrier();
     let tag_word = scene[config.drawtag_base + ix];
-    var agg = map_draw_tag(tag_word);
+    agg = map_draw_tag(tag_word);
     sh_scratch[local_id.x] = agg;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
         workgroupBarrier();
@@ -81,12 +101,6 @@ fn main(
         sh_scratch[local_id.x] = agg;
     }
     workgroupBarrier();
-    var m = draw_monoid_identity();
-    if wg_id.x > 0u {
-        // TODO: separate dispatch to scan these, or integrate into this one?
-        // In the meantime, will be limited to 2 * WG draw objs.
-        m = reduced[wg_id.x - 1u];
-    }
     if local_id.x > 0u {
         m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
     }
diff --git a/piet-wgsl/shader/pathtag_scan.wgsl b/piet-wgsl/shader/pathtag_scan.wgsl
index d18d872..fe87750 100644
--- a/piet-wgsl/shader/pathtag_scan.wgsl
+++ b/piet-wgsl/shader/pathtag_scan.wgsl
@@ -43,13 +43,13 @@ fn main(
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     var agg = tag_monoid_identity();
-    if (local_id.x < wg_id.x) {
+    if local_id.x < wg_id.x {
         agg = reduced[local_id.x];
     }
     sh_parent[local_id.x] = agg;
     for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
         workgroupBarrier();
-        if (local_id.x + (1u << i) < WG_SIZE) {
+        if local_id.x + (1u << i) < WG_SIZE {
             let other = sh_parent[local_id.x + (1u << i)];
             agg = combine_tag_monoid(agg, other);
         }
@@ -63,7 +63,7 @@ fn main(
     sh_monoid[local_id.x] = agg;
     for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
         workgroupBarrier();
-        if (local_id.x >= 1u << i) {
+        if local_id.x >= 1u << i {
             let other = sh_monoid[local_id.x - (1u << i)];
             agg = combine_tag_monoid(other, agg);
         }
@@ -72,7 +72,7 @@ fn main(
     }
     // prefix up to this workgroup
     var tm = sh_parent[0];
-    if (local_id.x > 0u) {
+    if local_id.x > 0u {
         tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
     }
     // exclusive prefix sum, granularity of 4 tag bytes
diff --git a/piet-wgsl/shader/tile_alloc.wgsl b/piet-wgsl/shader/tile_alloc.wgsl
index 1c27c83..56771d6 100644
--- a/piet-wgsl/shader/tile_alloc.wgsl
+++ b/piet-wgsl/shader/tile_alloc.wgsl
@@ -85,14 +85,14 @@ fn main(
         workgroupBarrier();
         sh_tile_count[local_id.x] = total_tile_count;
     }
-    workgroupBarrier();
-    // should be able to avoid a barrier by adding total_tile count from
-    // thread WG_SIZE - 1, but it doesn't work
-    if local_id.x == 0u {
-        sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
+    if local_id.x == WG_SIZE - 1u {
+        paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
     }
-    workgroupBarrier();
-    let tile_offset = sh_tile_offset;
+    // Using storage barriers is a workaround for what appears to be a miscompilation
+    // when a normal workgroup-shared variable is used to broadcast the value.
+    storageBarrier();
+    let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
+    storageBarrier();
     if drawobj_ix < config.n_drawobj {
         let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
         let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs
index 8b00a26..38645db 100644
--- a/piet-wgsl/src/main.rs
+++ b/piet-wgsl/src/main.rs
@@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter};
 
 use engine::Engine;
 
-use test_scene::dump_scene_info;
 use wgpu::{Device, Limits, Queue};
 
 mod engine;
@@ -70,10 +69,11 @@ async fn do_render(
     queue: &Queue,
     engine: &mut Engine,
 ) -> Result<(), Box<dyn std::error::Error>> {
+    #[allow(unused)]
     let shaders = shaders::init_shaders(device, engine)?;
     let full_shaders = shaders::full_shaders(device, engine)?;
     let scene = test_scene::gen_test_scene();
-    dump_scene_info(&scene);
+    //test_scene::dump_scene_info(&scene);
     //let (recording, buf) = render::render(&scene, &shaders);
     let (recording, buf) = render::render_full(&scene, &full_shaders);
     let downloads = engine.run_recording(&device, &queue, &recording)?;
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 99c068e..690e681 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -86,7 +86,6 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
         [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
     );
 
-    let n_pathtag = data.pathseg_stream.len();
     let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     // TODO: more principled size calc
     let tiles_buf = BufProxy::new(4097 * 8);
@@ -125,7 +124,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let data = scene.data();
     let n_pathtag = data.tag_stream.len();
     let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
-    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
+    // TODO: can compute size accurately, avoid reallocation
     let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
     let pathtag_base = size_to_words(scene.len());
     scene.extend(&data.tag_stream);
@@ -160,6 +159,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let scene_buf = recording.upload(scene);
     let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
 
+    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
     let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
     // TODO: really only need pathtag_wgs - 1
     recording.dispatch(
@@ -169,7 +169,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     );
 
     let tagmonoid_buf =
-        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE);
+        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE);
     recording.dispatch(
         shaders.pathtag_scan,
         (pathtag_wgs as u32, 1, 1),
@@ -182,7 +182,6 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         (drawobj_wgs, 1, 1),
         [config_buf, path_bbox_buf],
     );
-    let n_pathtag = data.pathseg_stream.len();
     let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
     let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     recording.dispatch(
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index ab7d3e3..4cfe1d4 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -48,6 +48,7 @@ pub fn gen_test_scene() -> Scene {
     scene
 }
 
+#[allow(unused)]
 pub fn dump_scene_info(scene: &Scene) {
     let data = scene.data();
     println!("tags {:?}", data.tag_stream);

From 5bd3a3639f783e3fb677ec3cace14766aaa78006 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Tue, 8 Nov 2022 19:30:47 -0800
Subject: [PATCH 13/17] Fix precedence issue

Note that this is evidence in favor of https://github.com/gfx-rs/naga/issues/2098 - my code is actually wrong, and it was caught by trying to port it to run in Chrome Canary.
---
 piet-wgsl/shader/shared/pathtag.wgsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/piet-wgsl/shader/shared/pathtag.wgsl b/piet-wgsl/shader/shared/pathtag.wgsl
index fed16fd..8e46979 100644
--- a/piet-wgsl/shader/shared/pathtag.wgsl
+++ b/piet-wgsl/shader/shared/pathtag.wgsl
@@ -63,7 +63,7 @@ fn reduce_tag(tag_word: u32) -> TagMonoid {
     a += a >> 16u;
     c.pathseg_offset = a & 0xffu;
 #ifdef full
-    c.path_ix = countOneBits(tag_word & (PATH_TAG_PATH) * 0x1010101u);
+    c.path_ix = countOneBits(tag_word & (PATH_TAG_PATH * 0x1010101u));
     c.linewidth_ix = countOneBits(tag_word & (PATH_TAG_LINEWIDTH * 0x1010101u));
 #endif
     return c;

From ef3ed3c9d73e6e220c27c8c59674e545230de68b Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Thu, 10 Nov 2022 19:48:36 -0800
Subject: [PATCH 14/17] Checkpoint of clip and gradient work

This is a checkpoint of partly completed work. Much of the GPU side is done, very little of the CPU side.

For clips, the clip_els bindings (binding 6 of draw_leaf) are not added. Clip logic is missing from coarse. The overflow buffer is missing from fine, as is its size calculation in coarse (but it should work as long as the max depth fits within BLEND_STACK_SPLIT).

For gradients, the texture binding is missing (binding 6) is missing from fine, as is the infrastructure in engine to deal with texture resources, and of course porting over the logic to fill it.

The code is not tested, bugs may lurk.
---
 .vscode/settings.json               |   1 +
 piet-wgsl/shader/clip_leaf.wgsl     | 195 ++++++++++++++++++++++++++++
 piet-wgsl/shader/clip_reduce.wgsl   |  66 ++++++++++
 piet-wgsl/shader/coarse.wgsl        |  48 ++++++-
 piet-wgsl/shader/fine.wgsl          |  95 ++++++++++++++
 piet-wgsl/shader/shared/clip.wgsl   |  14 ++
 piet-wgsl/shader/shared/config.wgsl |   1 +
 piet-wgsl/shader/shared/ptcl.wgsl   |  20 +++
 piet-wgsl/src/render.rs             |   3 +
 9 files changed, 441 insertions(+), 2 deletions(-)
 create mode 100644 piet-wgsl/shader/clip_leaf.wgsl
 create mode 100644 piet-wgsl/shader/clip_reduce.wgsl
 create mode 100644 piet-wgsl/shader/shared/clip.wgsl

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 813fac6..b8fa532 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -2,6 +2,7 @@
   "wgsl-analyzer.customImports": {
     "bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
     "bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl",
+    "clip": "${workspaceFolder}/piet-wgsl/shader/shared/clip.wgsl",
     "config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
     "cubic": "${workspaceFolder}/piet-wgsl/shader/shared/cubic.wgsl",
     "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
diff --git a/piet-wgsl/shader/clip_leaf.wgsl b/piet-wgsl/shader/clip_leaf.wgsl
new file mode 100644
index 0000000..f294317
--- /dev/null
+++ b/piet-wgsl/shader/clip_leaf.wgsl
@@ -0,0 +1,195 @@
+
+#import config
+#import bbox
+#import clip
+#import drawtag
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> clip_inp: array<i32>;
+
+@group(0) @binding(2)
+var<storage> path_bboxes: array<PathBbox>;
+
+@group(0) @binding(3)
+var<storage> reduced: array<Bic>;
+
+@group(0) @binding(4)
+var<storage> clip_els: array<ClipEl>;
+
+@group(0) @binding(5)
+var<storage, read_write> draw_monoids: array<DrawMonoid>;
+
+@group(0) @binding(6)
+var<storage, read_write> clip_bboxes: array<vec4<f32>>;
+
+let WG_SIZE = 256u;
+var<workgroup> sh_bic: array<Bic, 510 >;
+var<workgroup> sh_stack: array<u32, WG_SIZE>;
+var<workgroup> sh_stack_bbox: array<vec4<f32>, WG_SIZE>;
+var<workgroup> sh_bbox: array<vec4<f32>, WG_SIZE>;
+var<workgroup> sh_link: array<i32, WG_SIZE>;
+
+fn search_link(bic: ptr<function, Bic>, ix: u32) -> i32 {
+    var j = 0u;
+    while j < firstTrailingBit(WG_SIZE) {
+        let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
+        if ((ix >> j) & 1u) != 0u {
+            let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
+            if test.b >= 0u {
+                *bic = test;
+                ix -= 1u << j;
+            }
+        }
+        j += 1u;
+    }
+    if ix > 0u {
+        while j > 0u {
+            j -= 1u;
+            let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
+            let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
+            if test.b == 0u {
+                *bic = test;
+                ix -= 1u << j;
+            }
+        }
+    }
+    if ix > 0u {
+        return i32(ix) - 1;
+    } else {
+        return i32(~(*bic).a);
+    }
+}
+
+fn load_clip_inp(ix: u32) -> i32 {
+    if ix < config.n_clip {
+        return clip_inp[ix];
+    } else {
+        return i32(0x80000000);
+    }
+}
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    var bic: Bic;
+    if local_id.x < wg_id.x {
+        bic = reduced[local_id.x];
+    }
+    sh_bic[local_id.x] = bic;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_bic[local_id.x + (1u << i)];
+            bic = bic_combine(bic, other);
+        }
+        workgroupBarrier();
+        sh_bic[local_id.x] = bic;
+    }
+    workgroupBarrier();
+    let stack_size = sh_bic[0].b;
+    // TODO: if stack depth > WG_SIZE desired, scan here
+
+    // binary search in stack
+    let sp = WG_SIZE - 1u - local_id.x;
+    var ix = 0u;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        let probe = ix + ((WG_SIZE / 2u) >> i);
+        if sp < sh_bic[probe].b {
+            ix = probe;
+        }
+    }
+    let b = sh_bic[ix].b;
+    var bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
+    if sp < b {
+        let el = clip_els[ix * WG_SIZE + b - sp - 1u];
+        sh_stack[local_id.x] = el.parent_ix;
+        bbox = el.bbox;
+    }
+    // forward scan of bbox values of prefix stack
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        sh_stack_bbox[local_id.x] = bbox;
+        workgroupBarrier();
+        if local_id.x >= (1u << i) {
+            bbox = bbox_intersect(sh_stack_bbox[local_id.x - (1u << i)], bbox);
+        }
+        workgroupBarrier();
+    }
+    sh_stack_bbox[local_id.x] = bbox;
+
+    // Read input and compute Bic binary tree
+    let inp = load_clip_inp(global_id.x);
+    let is_push = inp >= 0;
+    var bic = Bic(1u - u32(is_push), u32(is_push));
+    sh_bic[local_id.x] = bic;
+    if is_push {
+        let path_bbox = path_bboxes[inp];
+        bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
+    } else {
+        bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
+    }
+    var inbase = 0u;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i));
+        workgroupBarrier();
+        if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) {
+            let in_off = inbase + local_id.x * 2u;
+            sh_bic[outbase + local_id.x] = bic_combine(sh_bic[in_off], sh_bic[in_off + 1u]);
+        }
+        inbase = outbase;
+    }
+    workgroupBarrier();
+    // search for predecessor node
+    bic = Bic();
+    var link = search_link(&bic, local_id.x);
+    sh_link[local_id.x] = link;
+    workgroupBarrier();
+    let grandparent = select(link - 1, sh_link[link], link >= 0);
+    var parent: i32;
+    if link >= 0 {
+        parent = i32(wg_id.x * WG_SIZE) + link;
+    } else if link + i32(stack_size) >= 0 {
+        parent = i32(sh_stack[i32(WG_SIZE) + link]);
+    } else {
+        parent = -1;
+    }
+    // bbox scan (intersect) across parent links
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        if i != 0u {
+            sh_link[local_id.x] = link;
+        }
+        sh_bbox[local_id.x] = bbox;
+        workgroupBarrier();
+        if link >= 0 {
+            bbox = bbox_intersect(sh_bbox[link], bbox);
+            link = sh_link[link];
+        }
+        workgroupBarrier();
+    }
+    if link + i32(stack_size) >= 0 {
+        bbox = bbox_intersect(sh_stack_bbox[i32(WG_SIZE) + link], bbox);
+    }
+    // At this point, bbox is the intersection of bboxes on the path to the root
+    sh_bbox[local_id.x] = bbox;
+    workgroupBarrier();
+
+    if !is_push && global_id.x < config.n_clip {
+        // Fix up drawmonoid so path_ix of EndClip matches BeginClip
+        let path_ix = clip_inp[parent];
+        draw_monoids[~inp].path_ix = u32(path_ix);
+
+        if grandparent >= 0 {
+            bbox = sh_bbox[grandparent];
+        } else if grandparent + i32(stack_size) >= 0 {
+            bbox = sh_stack_bbox[i32(WG_SIZE) + grandparent];
+        } else {
+            bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
+        }
+    }
+    clip_bboxes[global_id.x] = bbox
+}
diff --git a/piet-wgsl/shader/clip_reduce.wgsl b/piet-wgsl/shader/clip_reduce.wgsl
new file mode 100644
index 0000000..3288b07
--- /dev/null
+++ b/piet-wgsl/shader/clip_reduce.wgsl
@@ -0,0 +1,66 @@
+
+#import config
+#import bbox
+#import clip
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> clip_inp: array<i32>;
+
+@group(0) @binding(2)
+var<storage> path_bboxes: array<PathBbox>;
+
+@group(0) @binding(3)
+var<storage, read_write> reduced: array<Bic>;
+
+@group(0) @binding(4)
+var<storage, read_write> clip_out: array<ClipEl>;
+
+let WG_SIZE = 256u;
+var<workgroup> sh_bic: array<Bic, WG_SIZE>;
+var<workgroup> sh_parent: array<u32, WG_SIZE>;
+var<workgroup> sh_path_ix: array<u32, WG_SIZE>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>, 
+) {
+    let inp = clip_inp[global_id.x];
+    let is_push = inp >= 0;
+    var bic = Bic(1u - u32(is_push), u32(is_push));
+    // reverse scan of bicyclic semigroup
+    sh_bic[local_id.x] = bic;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_bic[local_id.x + (1u << i)];
+            bic = bic_combine(bic, other);
+        }
+        workgroupBarrier();
+        sh_bic[local_id.x] = bic;
+    }
+    if local_id.x == 0u {
+        reduced[wg_id.x] = bic;
+    }
+    workgroupBarrier();
+    let size = sh_bic[0].b;
+    bic = Bic();
+    if is_push && bic.a == 0u {
+        let local_ix = size - bic.b - 1u;
+        sh_parent[local_ix] = local_id.x;
+        sh_path_ix[local_ix] = u32(inp);
+    }
+    workgroupBarrier();
+    // TODO: possibly do forward scan here if depth can exceed wg size
+    if local_id.x < size {
+        let path_ix = sh_path_ix[local_id.x];
+        let path_bbox = path_bboxes[path_ix];
+        let parent_ix = sh_parent[local_id.x] + wg_id.x * WG_SIZE;
+        let bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
+        clip_out[global_id.x] = ClipEl(parent_ix, bbox);
+    }
+}
diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index e1e88d7..5741ec3 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -125,7 +125,27 @@ fn write_color(color: CmdColor) {
     ptcl[cmd_offset] = CMD_COLOR;
     ptcl[cmd_offset + 1u] = color.rgba_color;
     cmd_offset += 2u;
+}
 
+// Discussion point: these are basically copying from info to ptcl. We
+// could just write an info offset and have fine bind that buffer and read
+// from it.
+
+fn write_lin_grad(lin: CmdLinGrad) {
+    alloc_cmd(5u);
+    ptcl[cmd_offset] = CMD_LIN_GRAD;
+    ptcl[cmd_offset + 1u] = lin.index;
+    ptcl[cmd_offset + 2u] = bitcast<u32>(lin.line_x);
+    ptcl[cmd_offset + 3u] = bitcast<u32>(lin.line_y);
+    ptcl[cmd_offset + 4u] = bitcast<u32>(lin.line_c);
+    cmd_offset += 5u;
+}
+
+fn write_rad_grad(rad: CmdRadGrad) {
+    alloc_cmd(12u);
+    ptcl[cmd_offset] = CMD_RAD_GRAD;
+    ptcl[cmd_offset + 1u] = rad.index;
+    cmd_offset += 12u;
 }
 
 @compute @workgroup_size(256)
@@ -304,14 +324,38 @@ fn main(
             if clip_zero_depth == 0u {
                 let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
                 let tile = tiles[tile_ix];
+                let linewidth = bitcast<f32>(info[di]);
+                write_path(tile, linewidth);
                 switch drawtag {
                     // DRAWTAG_FILL_COLOR
                     case 0x44u: {
-                        let linewidth = bitcast<f32>(info[di]);
                         let rgba_color = scene[dd];
-                        write_path(tile, linewidth);
                         write_color(CmdColor(rgba_color));
                     }
+                    // DRAWTAG_FILL_LIN_GRADIENT
+                    case 0x114u: {
+                        var lin: CmdLinGrad;
+                        lin.index = scene[dd];
+                        lin.line_x = bitcast<f32>(info[di + 1u]);
+                        lin.line_y = bitcast<f32>(info[di + 2u]);
+                        lin.line_c = bitcast<f32>(info[di + 3u]);
+                        write_lin_grad(lin);
+                    }
+                    // DRAWTAG_FILL_RAD_GRADIENT
+                    case 0x2dcu: {
+                        var rad: CmdRadGrad;
+                        rad.index = scene[dd];
+                        let m0 = bitcast<f32>(info[di + 1u]);
+                        let m1 = bitcast<f32>(info[di + 2u]);
+                        let m2 = bitcast<f32>(info[di + 3u]);
+                        let m3 = bitcast<f32>(info[di + 4u]);
+                        rad.matrx = vec4<f32>(m0, m1, m2, m3);
+                        rad.xlat = vec2<f32>(bitcast<f32>(info[di + 5u]), bitcast<f32>(info[di + 6u]));
+                        rad.c1 = vec2<f32>(bitcast<f32>(info[di + 7u]), bitcast<f32>(info[di + 8u]));
+                        rad.ra = bitcast<f32>(info[di + 9u]);
+                        rad.roff = bitcast<f32>(info[di + 10u]);
+                        write_rad_grad(rad);
+                    }
                     default: {}
                 }
             }
diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index c347630..d76edef 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -42,9 +42,15 @@ var<storage, read_write> output: array<u32>;
 #ifdef full
 #import ptcl
 
+let GRADIENT_WIDTH = 512;
+let BLEND_STACK_SPLIT = 4u;
+
 @group(0) @binding(4)
 var<storage> ptcl: array<u32>;
 
+@group(0) @binding(5)
+var gradients: texture_2d<f32>;
+
 fn read_fill(cmd_ix: u32) -> CmdFill {
     let tile = ptcl[cmd_ix + 1u];
     let backdrop = i32(ptcl[cmd_ix + 2u]);
@@ -61,6 +67,33 @@ fn read_color(cmd_ix: u32) -> CmdColor {
     let rgba_color = ptcl[cmd_ix + 1u];
     return CmdColor(rgba_color);
 }
+
+fn read_lin_grad(cmd_ix: u32) -> CmdLinGrad {
+    let index = ptcl[cmd_ix + 1u];
+    let line_x = bitcast<f32>(ptcl[cmd_ix + 2u]);
+    let line_y = bitcast<f32>(ptcl[cmd_ix + 3u]);
+    let line_c = bitcast<f32>(ptcl[cmd_ix + 4u]);
+    return CmdLinGrad(index, line_x, line_y, line_c);
+}
+
+fn read_rad_grad(cmd_ix: u32) -> CmdRadGrad {
+    let index = ptcl[cmd_ix + 1u];
+    let m0 = bitcast<f32>(ptcl[cmd_ix + 2u]);
+    let m1 = bitcast<f32>(ptcl[cmd_ix + 3u]);
+    let m2 = bitcast<f32>(ptcl[cmd_ix + 4u]);
+    let m3 = bitcast<f32>(ptcl[cmd_ix + 5u]);
+    let matrx = vec4<f32>(m0, m1, m2, m3);
+    let xlat = vec2<f32>(bitcast<f32>(ptcl[cmd_ix + 6u]), bitcast<f32>(ptcl[cmd_ix + 7u]));
+    let c1 = vec2<f32>(bitcast<f32>(ptcl[cmd_ix + 8u]), bitcast<f32>(ptcl[cmd_ix + 9u]));
+    let ra = bitcast<f32>(ptcl[cmd_ix + 10u]);
+    let roff = bitcast<f32>(ptcl[cmd_ix + 11u]);
+    return CmdRadGrad(index, matrx, xlat, c1, ra, roff);
+}
+
+fn mix_blend_compose(backdrop: vec4<f32>, src: vec4<f32>, mode: u32) -> vec4<f32> {
+    // TODO: ALL the blend modes. This is just vanilla src-over.
+    return backdrop * (1.0 - src.a) + src;
+}
 #endif
 
 let PIXELS_PER_THREAD = 4u;
@@ -147,6 +180,8 @@ fn main(
     let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
 #ifdef full
     var rgba: array<vec4<f32>, PIXELS_PER_THREAD>;
+    var blend_stack: array<array<u32, BLEND_STACK_SPLIT>, PIXELS_PER_THREAD>;
+    var clip_depth = 0u;
     var area: array<f32, PIXELS_PER_THREAD>;
     var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
 
@@ -187,6 +222,66 @@ fn main(
                 }
                 cmd_ix += 2u;
             }
+            // CMD_LIN_GRAD
+            case 6u: {
+                let lin = read_lin_grad(cmd_ix);
+                let d = lin.line_x * xy.x + lin.line_y * xy.y + lin.line_c;
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    let my_d = d + lin.line_x * f32(i);
+                    let x = i32(round(clamp(my_d, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
+                    let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(lin.index)), 0);
+                    let fg_i = fg_rgba * area[i];
+                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
+                }
+                cmd_ix += 12u;
+            }
+            // CMD_RAD_GRAD
+            case 7u: {
+                let rad = read_rad_grad(cmd_ix);
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    let my_xy = vec2<f32>(xy.x + f32(i), xy.y);
+                    // TODO: can hoist y, but for now stick to piet-gpu
+                    let xy_xformed = rad.matrx.xz * my_xy.x + rad.matrx.yw * my_xy.y - rad.xlat;
+                    let ba = dot(xy_xformed, rad.c1);
+                    let ca = rad.ra * dot(xy_xformed, xy_xformed);
+                    let t = sqrt(ba * ba + ca) - ba - rad.roff;
+                    let x = i32(round(clamp(t, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
+                    let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(rad.index)), 0);
+                    let fg_i = fg_rgba * area[i];
+                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
+                }
+                cmd_ix += 12u;
+            }
+            // CMD_BEGIN_CLIP
+            case 9u: {
+                if clip_depth < BLEND_STACK_SPLIT {
+                    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                        blend_stack[clip_depth][i] = pack4x8unorm(rgba[i]);
+                        rgba[i] = vec4<f32>(0.0);
+                    }
+                } else {
+                    // TODO: spill to memory
+                }
+                clip_depth += 1u;
+                cmd_ix += 1u;
+            }
+            // CMD_END_CLIP
+            case 10u: {
+                let blend = ptcl[cmd_ix + 1u];
+                clip_depth -= 1u;
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    var bg_rgba: u32;
+                    if clip_depth < BLEND_STACK_SPLIT {
+                        bg_rgba = blend_stack[clip_depth][i];
+                    } else {
+                        // load from memory
+                    }
+                    let bg = unpack4x8unorm(bg_rgba);
+                    let fg = rgba[i] * area[i];
+                    rgba[i] = mix_blend_compose(bg, fg, blend);
+                }
+                cmd_ix += 2u;
+            }
             // CMD_JUMP
             case 11u: {
                 cmd_ix = ptcl[cmd_ix + 1u];
diff --git a/piet-wgsl/shader/shared/clip.wgsl b/piet-wgsl/shader/shared/clip.wgsl
new file mode 100644
index 0000000..608e9bf
--- /dev/null
+++ b/piet-wgsl/shader/shared/clip.wgsl
@@ -0,0 +1,14 @@
+struct Bic {
+    a: u32,
+    b: u32,
+}
+
+fn bic_combine(x: Bic, y: Bic) -> Bic {
+    let m = min(x.b, y.a);
+    return Bic(x.a + y.a - m, x.b + y.b - m);
+}
+
+struct ClipEl {
+    parent_ix: u32,
+    bbox: vec4<f32>,
+}
diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl
index 73f4054..5db894b 100644
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@@ -20,6 +20,7 @@ struct Config {
 
     n_drawobj: u32,
     n_path: u32,
+    n_clip: u32,
 
     // offsets within scene buffer (in u32 units)
     // Note: this is a difference from piet-gpu, which is in bytes
diff --git a/piet-wgsl/shader/shared/ptcl.wgsl b/piet-wgsl/shader/shared/ptcl.wgsl
index 8d6e869..92316cc 100644
--- a/piet-wgsl/shader/shared/ptcl.wgsl
+++ b/piet-wgsl/shader/shared/ptcl.wgsl
@@ -28,6 +28,10 @@ let CMD_FILL = 1u;
 let CMD_STROKE = 2u;
 let CMD_SOLID = 3u;
 let CMD_COLOR = 5u;
+let CMD_LIN_GRAD = 6u;
+let CMD_RAD_GRAD = 7u;
+let CMD_BEGIN_CLIP = 9u;
+let CMD_END_CLIP = 10u;
 let CMD_JUMP = 11u;
 
 // The individual PTCL structs are written here, but read/write is by
@@ -50,3 +54,19 @@ struct CmdJump {
 struct CmdColor {
     rgba_color: u32,
 }
+
+struct CmdLinGrad {
+    index: u32,
+    line_x: f32,
+    line_y: f32,
+    line_c: f32,
+}
+
+struct CmdRadGrad {
+    index: u32,
+    matrx: vec4<f32>,
+    xlat: vec2<f32>,
+    c1: vec2<f32>,
+    ra: f32,
+    roff: f32,
+}
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 690e681..7d052cc 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -26,6 +26,7 @@ struct Config {
     height_in_tiles: u32,
     n_drawobj: u32,
     n_path: u32,
+    n_clip: u32,
     pathtag_base: u32,
     pathdata_base: u32,
     drawtag_base: u32,
@@ -143,11 +144,13 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let n_path = data.n_path;
     // TODO: calculate for real when we do rectangles
     let n_drawobj = n_path;
+    let n_clip = 0; // TODO: wire up correctly
     let config = Config {
         width_in_tiles: 64,
         height_in_tiles: 64,
         n_drawobj,
         n_path,
+        n_clip,
         pathtag_base,
         pathdata_base,
         drawtag_base,

From 20f7b68514758dc1743597c1600f9b99770a415c Mon Sep 17 00:00:00 2001
From: Chad Brokaw <cbrokaw@gmail.com>
Date: Wed, 16 Nov 2022 10:49:38 -0500
Subject: [PATCH 15/17] finish gradient support

---
 piet-wgsl/src/engine.rs     | 217 +++++++++++++++++++++++++++++++-----
 piet-wgsl/src/main.rs       |   2 +-
 piet-wgsl/src/ramp.rs       | 137 +++++++++++++++++++++++
 piet-wgsl/src/render.rs     |  96 ++++++++++++----
 piet-wgsl/src/shaders.rs    |   1 +
 piet-wgsl/src/test_scene.rs |  82 ++++++++++----
 6 files changed, 459 insertions(+), 76 deletions(-)
 create mode 100644 piet-wgsl/src/ramp.rs

diff --git a/piet-wgsl/src/engine.rs b/piet-wgsl/src/engine.rs
index 7d1c854..5e89905 100644
--- a/piet-wgsl/src/engine.rs
+++ b/piet-wgsl/src/engine.rs
@@ -17,7 +17,7 @@
 use std::{
     borrow::Cow,
     collections::{hash_map::Entry, HashMap},
-    num::NonZeroU64,
+    num::{NonZeroU32, NonZeroU64},
     sync::atomic::{AtomicU64, Ordering},
 };
 
@@ -25,7 +25,8 @@ use futures_intrusive::channel::shared::GenericOneshotReceiver;
 use parking_lot::RawMutex;
 use wgpu::{
     util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferAsyncError, BufferSlice, BufferView,
-    ComputePipeline, Device, Queue,
+    ComputePipeline, Device, Queue, Texture, TextureAspect, TextureFormat, TextureUsages,
+    TextureView, TextureViewDimension,
 };
 
 pub type Error = Box<dyn std::error::Error>;
@@ -58,12 +59,27 @@ pub struct BufProxy {
     id: Id,
 }
 
+#[derive(Clone, Copy)]
+pub struct ImageProxy {
+    width: u32,
+    height: u32,
+    // TODO: format
+    id: Id,
+}
+
+#[derive(Clone, Copy)]
+pub enum ResourceProxy {
+    Buf(BufProxy),
+    Image(ImageProxy),
+}
+
 pub enum Command {
     Upload(BufProxy, Vec<u8>),
+    UploadImage(ImageProxy, Vec<u8>),
     // Discussion question: third argument is vec of resources?
     // Maybe use tricks to make more ergonomic?
     // Alternative: provide bufs & images as separate sequences, like piet-gpu.
-    Dispatch(ShaderId, (u32, u32, u32), Vec<BufProxy>),
+    Dispatch(ShaderId, (u32, u32, u32), Vec<ResourceProxy>),
     Download(BufProxy),
     Clear(BufProxy, u64, Option<NonZeroU64>),
 }
@@ -92,6 +108,7 @@ pub enum BindType {
 #[derive(Default)]
 struct BindMap {
     buf_map: HashMap<Id, Buffer>,
+    image_map: HashMap<Id, (Texture, TextureView)>,
 }
 
 impl Engine {
@@ -132,6 +149,16 @@ impl Engine {
                     },
                     count: None,
                 },
+                BindType::ImageRead => wgpu::BindGroupLayoutEntry {
+                    binding: i as u32,
+                    visibility: wgpu::ShaderStages::COMPUTE,
+                    ty: wgpu::BindingType::Texture {
+                        sample_type: wgpu::TextureSampleType::Float { filterable: true },
+                        view_dimension: wgpu::TextureViewDimension::D2,
+                        multisampled: false,
+                    },
+                    count: None,
+                },
                 _ => todo!(),
             })
             .collect::<Vec<_>>();
@@ -182,6 +209,58 @@ impl Engine {
                     });
                     bind_map.insert_buf(buf_proxy.id, buf);
                 }
+                Command::UploadImage(image_proxy, bytes) => {
+                    let buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+                        label: None,
+                        contents: &bytes,
+                        usage: wgpu::BufferUsages::COPY_SRC,
+                    });
+                    let texture = device.create_texture(&wgpu::TextureDescriptor {
+                        label: None,
+                        size: wgpu::Extent3d {
+                            width: image_proxy.width,
+                            height: image_proxy.height,
+                            depth_or_array_layers: 1,
+                        },
+                        mip_level_count: 1,
+                        sample_count: 1,
+                        dimension: wgpu::TextureDimension::D2,
+                        usage: TextureUsages::TEXTURE_BINDING | TextureUsages::COPY_DST,
+                        format: TextureFormat::Rgba8Unorm,
+                    });
+                    let texture_view = texture.create_view(&wgpu::TextureViewDescriptor {
+                        label: None,
+                        dimension: Some(TextureViewDimension::D2),
+                        aspect: TextureAspect::All,
+                        mip_level_count: None,
+                        base_mip_level: 0,
+                        base_array_layer: 0,
+                        array_layer_count: None,
+                        format: Some(TextureFormat::Rgba8Unorm),
+                    });
+                    encoder.copy_buffer_to_texture(
+                        wgpu::ImageCopyBuffer {
+                            buffer: &buf,
+                            layout: wgpu::ImageDataLayout {
+                                offset: 0,
+                                bytes_per_row: NonZeroU32::new(image_proxy.width * 4),
+                                rows_per_image: None,
+                            },
+                        },
+                        wgpu::ImageCopyTexture {
+                            texture: &texture,
+                            mip_level: 0,
+                            origin: wgpu::Origin3d { x: 0, y: 0, z: 0 },
+                            aspect: TextureAspect::All,
+                        },
+                        wgpu::Extent3d {
+                            width: image_proxy.width,
+                            height: image_proxy.height,
+                            depth_or_array_layers: 1,
+                        },
+                    );
+                    bind_map.insert_image(image_proxy.id, texture, texture_view)
+                }
                 Command::Dispatch(shader_id, wg_size, bindings) => {
                     println!("dispatching {:?} with {} bindings", wg_size, bindings.len());
                     let shader = &self.shaders[shader_id.0];
@@ -226,13 +305,23 @@ impl Recording {
         buf_proxy
     }
 
-    pub fn dispatch(
-        &mut self,
-        shader: ShaderId,
-        wg_size: (u32, u32, u32),
-        resources: impl Into<Vec<BufProxy>>,
-    ) {
-        self.push(Command::Dispatch(shader, wg_size, resources.into()));
+    pub fn upload_image(&mut self, width: u32, height: u32, data: impl Into<Vec<u8>>) -> ImageProxy {
+        let data = data.into();
+        let image_proxy = ImageProxy::new(width, height);
+        self.push(Command::UploadImage(image_proxy, data));
+        image_proxy
+    }
+
+    pub fn dispatch<R>(&mut self, shader: ShaderId, wg_size: (u32, u32, u32), resources: R)
+    where
+        R: IntoIterator,
+        R::Item: Into<ResourceProxy>,
+    {
+        self.push(Command::Dispatch(
+            shader,
+            wg_size,
+            resources.into_iter().map(|r| r.into()).collect(),
+        ));
     }
 
     pub fn download(&mut self, buf: BufProxy) {
@@ -251,6 +340,35 @@ impl BufProxy {
     }
 }
 
+impl ImageProxy {
+    pub fn new(width: u32, height: u32) -> Self {
+        let id = Id::next();
+        ImageProxy { width, height, id }
+    }
+}
+
+impl ResourceProxy {
+    pub fn new_buf(size: u64) -> Self {
+        Self::Buf(BufProxy::new(size))
+    }
+
+    pub fn new_image(width: u32, height: u32) -> Self {
+        Self::Image(ImageProxy::new(width, height))
+    }
+}
+
+impl From<BufProxy> for ResourceProxy {
+    fn from(value: BufProxy) -> Self {
+        Self::Buf(value)
+    }
+}
+
+impl From<ImageProxy> for ResourceProxy {
+    fn from(value: ImageProxy) -> Self {
+        Self::Image(value)
+    }
+}
+
 impl Id {
     pub fn next() -> Id {
         let val = ID_COUNTER.fetch_add(1, Ordering::Relaxed);
@@ -264,34 +382,79 @@ impl BindMap {
         self.buf_map.insert(id, buf);
     }
 
+    fn insert_image(&mut self, id: Id, image: Texture, image_view: TextureView) {
+        self.image_map.insert(id, (image, image_view));
+    }
+
     fn create_bind_group(
         &mut self,
         device: &Device,
         layout: &BindGroupLayout,
-        bindings: &[BufProxy],
+        bindings: &[ResourceProxy],
     ) -> Result<BindGroup, Error> {
         for proxy in bindings {
-            if let Entry::Vacant(v) = self.buf_map.entry(proxy.id) {
-                let buf = device.create_buffer(&wgpu::BufferDescriptor {
-                    label: None,
-                    size: proxy.size,
-                    usage: wgpu::BufferUsages::STORAGE
-                        | wgpu::BufferUsages::COPY_DST
-                        | wgpu::BufferUsages::COPY_SRC,
-                    mapped_at_creation: false,
-                });
-                v.insert(buf);
+            match proxy {
+                ResourceProxy::Buf(proxy) => {
+                    if let Entry::Vacant(v) = self.buf_map.entry(proxy.id) {
+                        let buf = device.create_buffer(&wgpu::BufferDescriptor {
+                            label: None,
+                            size: proxy.size,
+                            usage: wgpu::BufferUsages::STORAGE
+                                | wgpu::BufferUsages::COPY_DST
+                                | wgpu::BufferUsages::COPY_SRC,
+                            mapped_at_creation: false,
+                        });
+                        v.insert(buf);
+                    }
+                }
+                ResourceProxy::Image(proxy) => {
+                    if let Entry::Vacant(v) = self.image_map.entry(proxy.id) {
+                        let texture = device.create_texture(&wgpu::TextureDescriptor {
+                            label: None,
+                            size: wgpu::Extent3d {
+                                width: proxy.width,
+                                height: proxy.height,
+                                depth_or_array_layers: 1,
+                            },
+                            mip_level_count: 1,
+                            sample_count: 1,
+                            dimension: wgpu::TextureDimension::D2,
+                            usage: TextureUsages::TEXTURE_BINDING | TextureUsages::COPY_DST,
+                            format: TextureFormat::Rgba8Unorm,
+                        });
+                        let texture_view = texture.create_view(&wgpu::TextureViewDescriptor {
+                            label: None,
+                            dimension: Some(TextureViewDimension::D2),
+                            aspect: TextureAspect::All,
+                            mip_level_count: None,
+                            base_mip_level: 0,
+                            base_array_layer: 0,
+                            array_layer_count: None,
+                            format: Some(TextureFormat::Rgba8Unorm),
+                        });
+                        v.insert((texture, texture_view));
+                    }
+                }
             }
         }
         let entries = bindings
             .iter()
             .enumerate()
-            .map(|(i, proxy)| {
-                let buf = self.buf_map.get(&proxy.id).unwrap();
-                Ok(wgpu::BindGroupEntry {
-                    binding: i as u32,
-                    resource: buf.as_entire_binding(),
-                })
+            .map(|(i, proxy)| match proxy {
+                ResourceProxy::Buf(proxy) => {
+                    let buf = self.buf_map.get(&proxy.id).unwrap();
+                    Ok(wgpu::BindGroupEntry {
+                        binding: i as u32,
+                        resource: buf.as_entire_binding(),
+                    })
+                }
+                ResourceProxy::Image(proxy) => {
+                    let texture = self.image_map.get(&proxy.id).unwrap();
+                    Ok(wgpu::BindGroupEntry {
+                        binding: i as u32,
+                        resource: wgpu::BindingResource::TextureView(&texture.1),
+                    })
+                }
             })
             .collect::<Result<Vec<_>, Error>>()?;
         let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs
index 38645db..2f1d885 100644
--- a/piet-wgsl/src/main.rs
+++ b/piet-wgsl/src/main.rs
@@ -24,6 +24,7 @@ use wgpu::{Device, Limits, Queue};
 
 mod engine;
 mod pico_svg;
+mod ramp;
 mod render;
 mod shaders;
 mod test_scene;
@@ -58,7 +59,6 @@ fn dump_buf(buf: &[u32]) {
                 println!("{}: {:x} {}", i, val, f32::from_bits(*val));
             } else {
                 println!("{}: {:x}", i, val);
-
             }
         }
     }
diff --git a/piet-wgsl/src/ramp.rs b/piet-wgsl/src/ramp.rs
new file mode 100644
index 0000000..a26c3d9
--- /dev/null
+++ b/piet-wgsl/src/ramp.rs
@@ -0,0 +1,137 @@
+use piet_scene::{Color, GradientStop, GradientStops};
+
+use std::collections::HashMap;
+
+const N_SAMPLES: usize = 512;
+const RETAINED_COUNT: usize = 64;
+
+#[derive(Default)]
+pub struct RampCache {
+    epoch: u64,
+    map: HashMap<GradientStops, (u32, u64)>,
+    data: Vec<u32>,
+}
+
+impl RampCache {
+    pub fn advance(&mut self) {
+        self.epoch += 1;
+        if self.map.len() > RETAINED_COUNT {
+            self.map
+                .retain(|_key, value| value.0 < RETAINED_COUNT as u32);
+            self.data.truncate(RETAINED_COUNT * N_SAMPLES);
+        }
+    }
+
+    pub fn add(&mut self, stops: &[GradientStop]) -> u32 {
+        if let Some(entry) = self.map.get_mut(stops) {
+            entry.1 = self.epoch;
+            entry.0
+        } else if self.map.len() < RETAINED_COUNT {
+            let id = (self.data.len() / N_SAMPLES) as u32;
+            self.data.extend(make_ramp(stops));
+            self.map.insert(stops.into(), (id, self.epoch));
+            id
+        } else {
+            let mut reuse = None;
+            for (stops, (id, epoch)) in &self.map {
+                if *epoch + 2 < self.epoch {
+                    reuse = Some((stops.to_owned(), *id));
+                    break;
+                }
+            }
+            if let Some((old_stops, id)) = reuse {
+                self.map.remove(&old_stops);
+                let start = id as usize * N_SAMPLES;
+                for (dst, src) in self.data[start..start + N_SAMPLES]
+                    .iter_mut()
+                    .zip(make_ramp(stops))
+                {
+                    *dst = src;
+                }
+                self.map.insert(stops.into(), (id, self.epoch));
+                id
+            } else {
+                let id = (self.data.len() / N_SAMPLES) as u32;
+                self.data.extend(make_ramp(stops));
+                self.map.insert(stops.into(), (id, self.epoch));
+                id
+            }
+        }
+    }
+
+    pub fn data(&self) -> &[u32] {
+        &self.data
+    }
+
+    pub fn width(&self) -> u32 {
+        N_SAMPLES as u32
+    }
+
+    pub fn height(&self) -> u32 {
+        (self.data.len() / N_SAMPLES) as u32
+    }
+}
+
+fn make_ramp<'a>(stops: &'a [GradientStop]) -> impl Iterator<Item = u32> + 'a {
+    let mut last_u = 0.0;
+    let mut last_c = ColorF64::from_color(stops[0].color);
+    let mut this_u = last_u;
+    let mut this_c = last_c;
+    let mut j = 0;
+    (0..N_SAMPLES).map(move |i| {
+        let u = (i as f64) / (N_SAMPLES - 1) as f64;
+        while u > this_u {
+            last_u = this_u;
+            last_c = this_c;
+            if let Some(s) = stops.get(j + 1) {
+                this_u = s.offset as f64;
+                this_c = ColorF64::from_color(s.color);
+                j += 1;
+            } else {
+                break;
+            }
+        }
+        let du = this_u - last_u;
+        let c = if du < 1e-9 {
+            this_c
+        } else {
+            last_c.lerp(&this_c, (u - last_u) / du)
+        };
+        c.to_premul_u32()
+    })
+}
+
+#[derive(Copy, Clone, Debug)]
+struct ColorF64([f64; 4]);
+
+impl ColorF64 {
+    fn from_color(color: Color) -> Self {
+        Self([
+            color.r as f64 / 255.0,
+            color.g as f64 / 255.0,
+            color.b as f64 / 255.0,
+            color.a as f64 / 255.0,
+        ])
+    }
+
+    fn lerp(&self, other: &Self, a: f64) -> Self {
+        fn l(x: f64, y: f64, a: f64) -> f64 {
+            x * (1.0 - a) + y * a
+        }
+        Self([
+            l(self.0[0], other.0[0], a),
+            l(self.0[1], other.0[1], a),
+            l(self.0[2], other.0[2], a),
+            l(self.0[3], other.0[3], a),
+        ])
+    }
+
+    fn to_premul_u32(&self) -> u32 {
+        let a = self.0[3].min(1.0).max(0.0);
+        let r = ((self.0[0] * a).min(1.0).max(0.0) * 255.0) as u32;
+        let g = ((self.0[1] * a).min(1.0).max(0.0) * 255.0) as u32;
+        let b = ((self.0[2] * a).min(1.0).max(0.0) * 255.0) as u32;
+        let a = (a * 255.0) as u32;
+        r | (g << 8) | (b << 16) | (a << 24)
+    }
+}
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 7d052cc..4cc63e3 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -4,7 +4,7 @@ use bytemuck::{Pod, Zeroable};
 use piet_scene::Scene;
 
 use crate::{
-    engine::{BufProxy, Recording},
+    engine::{BufProxy, Recording, ResourceProxy},
     shaders::{self, FullShaders, Shaders},
 };
 
@@ -87,7 +87,8 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
         [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
     );
 
-    let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
+    let path_coarse_wgs =
+        (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     // TODO: more principled size calc
     let tiles_buf = BufProxy::new(4097 * 8);
     let segments_buf = BufProxy::new(256 * 24);
@@ -122,7 +123,29 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
 
 pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy) {
     let mut recording = Recording::default();
+    let mut ramps = crate::ramp::RampCache::default();
+    let mut drawdata_patches: Vec<(usize, u32)> = vec![];
     let data = scene.data();
+    let stop_data = &data.resources.stops;
+    for patch in &data.resources.patches {
+        use piet_scene::ResourcePatch;
+        match patch {
+            ResourcePatch::Ramp { offset, stops } => {
+                let ramp_id = ramps.add(&stop_data[stops.clone()]);
+                drawdata_patches.push((*offset, ramp_id));
+            }
+        }
+    }
+    let gradient_image = if drawdata_patches.is_empty() {
+        ResourceProxy::new_image(1, 1)
+    } else {
+        let data = ramps.data();
+        let width = ramps.width();
+        let height = ramps.height();
+        let data: &[u8] = bytemuck::cast_slice(data);
+        println!("gradient image: {}x{} ({} bytes)", width, height, data.len());
+        ResourceProxy::Image(recording.upload_image(width, height, data))
+    };
     let n_pathtag = data.tag_stream.len();
     let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
     // TODO: can compute size accurately, avoid reallocation
@@ -135,12 +158,27 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let drawtag_base = size_to_words(scene.len());
     scene.extend(bytemuck::cast_slice(&data.drawtag_stream));
     let drawdata_base = size_to_words(scene.len());
-    scene.extend(&data.drawdata_stream);
+    if !drawdata_patches.is_empty() {
+        let mut pos = 0;
+        for patch in drawdata_patches {
+            let offset = patch.0;
+            let value = patch.1;
+            if pos < offset {
+                scene.extend_from_slice(&data.drawdata_stream[pos..offset]);
+            }
+            scene.extend_from_slice(bytemuck::bytes_of(&value));
+            pos = offset + 4;
+        }
+        if pos < data.drawdata_stream.len() {
+            scene.extend_from_slice(&data.drawdata_stream[pos..])
+        }
+    } else {
+        scene.extend(&data.drawdata_stream);
+    }
     let transform_base = size_to_words(scene.len());
     scene.extend(bytemuck::cast_slice(&data.transform_stream));
     let linewidth_base = size_to_words(scene.len());
     scene.extend(bytemuck::cast_slice(&data.linewidth_stream));
-
     let n_path = data.n_path;
     // TODO: calculate for real when we do rectangles
     let n_drawobj = n_path;
@@ -159,11 +197,11 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         linewidth_base,
     };
     println!("{:?}", config);
-    let scene_buf = recording.upload(scene);
-    let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
+    let scene_buf = ResourceProxy::Buf(recording.upload(scene));
+    let config_buf = ResourceProxy::Buf(recording.upload(bytemuck::bytes_of(&config).to_owned()));
 
     let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
-    let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
+    let reduced_buf = ResourceProxy::new_buf(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
     // TODO: really only need pathtag_wgs - 1
     recording.dispatch(
         shaders.pathtag_reduce,
@@ -171,22 +209,24 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         [config_buf, scene_buf, reduced_buf],
     );
 
-    let tagmonoid_buf =
-        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE);
+    let tagmonoid_buf = ResourceProxy::new_buf(
+        pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE,
+    );
     recording.dispatch(
         shaders.pathtag_scan,
         (pathtag_wgs as u32, 1, 1),
         [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
     );
     let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
-    let path_bbox_buf = BufProxy::new(n_path as u64 * PATH_BBOX_SIZE);
+    let path_bbox_buf = ResourceProxy::new_buf(n_path as u64 * PATH_BBOX_SIZE);
     recording.dispatch(
         shaders.bbox_clear,
         (drawobj_wgs, 1, 1),
         [config_buf, path_bbox_buf],
     );
-    let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
-    let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
+    let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE);
+    let path_coarse_wgs =
+        (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     recording.dispatch(
         shaders.pathseg,
         (path_coarse_wgs, 1, 1),
@@ -198,14 +238,14 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
             cubic_buf,
         ],
     );
-    let draw_reduced_buf = BufProxy::new(drawobj_wgs as u64 * DRAWMONOID_SIZE);
+    let draw_reduced_buf = ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE);
     recording.dispatch(
         shaders.draw_reduce,
         (drawobj_wgs, 1, 1),
         [config_buf, scene_buf, draw_reduced_buf],
     );
-    let draw_monoid_buf = BufProxy::new(n_drawobj as u64 * DRAWMONOID_SIZE);
-    let info_buf = BufProxy::new(n_drawobj as u64 * MAX_DRAWINFO_SIZE);
+    let draw_monoid_buf = ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE);
+    let info_buf = ResourceProxy::new_buf(n_drawobj as u64 * MAX_DRAWINFO_SIZE);
     recording.dispatch(
         shaders.draw_leaf,
         (drawobj_wgs, 1, 1),
@@ -218,16 +258,17 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
             info_buf,
         ],
     );
-    let draw_bbox_buf = BufProxy::new(n_path as u64 * DRAW_BBOX_SIZE);
+    let draw_bbox_buf = ResourceProxy::new_buf(n_path as u64 * DRAW_BBOX_SIZE);
     let bump_buf = BufProxy::new(BUMP_SIZE);
     // Not actually used yet.
-    let clip_bbox_buf = BufProxy::new(1024);
-    let bin_data_buf = BufProxy::new(1 << 20);
+    let clip_bbox_buf = ResourceProxy::new_buf(1024);
+    let bin_data_buf = ResourceProxy::new_buf(1 << 20);
     let width_in_bins = (config.width_in_tiles + 15) / 16;
     let height_in_bins = (config.height_in_tiles + 15) / 16;
     let n_bins = width_in_bins * height_in_bins;
-    let bin_header_buf = BufProxy::new((n_bins * drawobj_wgs) as u64 * BIN_HEADER_SIZE);
+    let bin_header_buf = ResourceProxy::new_buf((n_bins * drawobj_wgs) as u64 * BIN_HEADER_SIZE);
     recording.clear_all(bump_buf);
+    let bump_buf = ResourceProxy::Buf(bump_buf);
     recording.dispatch(
         shaders.binning,
         (drawobj_wgs, 1, 1),
@@ -242,8 +283,8 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
             bin_header_buf,
         ],
     );
-    let path_buf = BufProxy::new(n_path as u64 * PATH_SIZE);
-    let tile_buf = BufProxy::new(1 << 20);
+    let path_buf = ResourceProxy::new_buf(n_path as u64 * PATH_SIZE);
+    let tile_buf = ResourceProxy::new_buf(1 << 20);
     let path_wgs = (n_path + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
     recording.dispatch(
         shaders.tile_alloc,
@@ -258,7 +299,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         ],
     );
 
-    let segments_buf = BufProxy::new(1 << 24);
+    let segments_buf = ResourceProxy::new_buf(1 << 24);
     recording.dispatch(
         shaders.path_coarse,
         (path_coarse_wgs, 1, 1),
@@ -278,7 +319,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         (path_wgs, 1, 1),
         [config_buf, path_buf, tile_buf],
     );
-    let ptcl_buf = BufProxy::new(1 << 24);
+    let ptcl_buf = ResourceProxy::new_buf(1 << 24);
     recording.dispatch(
         shaders.coarse,
         (width_in_bins, height_in_bins, 1),
@@ -300,7 +341,14 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     recording.dispatch(
         shaders.fine,
         (config.width_in_tiles, config.height_in_tiles, 1),
-        [config_buf, tile_buf, segments_buf, out_buf, ptcl_buf],
+        [
+            config_buf,
+            tile_buf,
+            segments_buf,
+            ResourceProxy::Buf(out_buf),
+            ptcl_buf,
+            gradient_image,
+        ],
     );
 
     let download_buf = out_buf;
diff --git a/piet-wgsl/src/shaders.rs b/piet-wgsl/src/shaders.rs
index b659170..8e6e89f 100644
--- a/piet-wgsl/src/shaders.rs
+++ b/piet-wgsl/src/shaders.rs
@@ -255,6 +255,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
             BindType::BufReadOnly,
             BindType::Buffer,
             BindType::BufReadOnly,
+            BindType::ImageRead,
         ],
     )?;
     Ok(FullShaders {
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index 4cfe1d4..cdbd467 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -15,35 +15,70 @@
 // Also licensed under MIT license, at your choice.
 
 use kurbo::BezPath;
-use piet_scene::{Affine, Brush, Color, Fill, PathElement, Point, Scene, SceneBuilder, Stroke};
+use piet_scene::{Affine, Brush, Color, Fill, LinearGradient, PathElement, Point, Scene, SceneBuilder, Stroke, GradientStop};
 
 use crate::pico_svg::PicoSvg;
 
 pub fn gen_test_scene() -> Scene {
     let mut scene = Scene::default();
     let mut builder = SceneBuilder::for_scene(&mut scene);
-    if false {
-        let path = [
-            PathElement::MoveTo(Point::new(100.0, 100.0)),
-            PathElement::LineTo(Point::new(500.0, 120.0)),
-            PathElement::LineTo(Point::new(300.0, 150.0)),
-            PathElement::LineTo(Point::new(200.0, 260.0)),
-            PathElement::LineTo(Point::new(150.0, 210.0)),
-            PathElement::Close,
-        ];
-        let brush = Brush::Solid(Color::rgb8(0x40, 0x40, 0xff));
-        builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
-        let transform = Affine::translate(50.0, 50.0);
-        let brush = Brush::Solid(Color::rgba8(0xff, 0xff, 0x00, 0x80));
-        builder.fill(Fill::NonZero, transform, &brush, None, &path);
-        let transform = Affine::translate(100.0, 100.0);
-        let style = simple_stroke(1.0);
-        let brush = Brush::Solid(Color::rgb8(0xa0, 0x00, 0x00));
-        builder.stroke(&style, transform, &brush, None, &path);
-    } else {
-        let xml_str = std::str::from_utf8(include_bytes!("../../piet-gpu/Ghostscript_Tiger.svg")).unwrap();
-        let svg = PicoSvg::load(xml_str, 6.0).unwrap();
-        render_svg(&mut builder, &svg, false);
+    let scene_ix = 1;
+    match scene_ix {
+        0 => {
+            let path = [
+                PathElement::MoveTo(Point::new(100.0, 100.0)),
+                PathElement::LineTo(Point::new(500.0, 120.0)),
+                PathElement::LineTo(Point::new(300.0, 150.0)),
+                PathElement::LineTo(Point::new(200.0, 260.0)),
+                PathElement::LineTo(Point::new(150.0, 210.0)),
+                PathElement::Close,
+            ];
+            let brush = Brush::Solid(Color::rgb8(0x40, 0x40, 0xff));
+            builder.fill(Fill::NonZero, Affine::IDENTITY, &brush, None, &path);
+            let transform = Affine::translate(50.0, 50.0);
+            let brush = Brush::Solid(Color::rgba8(0xff, 0xff, 0x00, 0x80));
+            builder.fill(Fill::NonZero, transform, &brush, None, &path);
+            let transform = Affine::translate(100.0, 100.0);
+            let style = simple_stroke(1.0);
+            let brush = Brush::Solid(Color::rgb8(0xa0, 0x00, 0x00));
+            builder.stroke(&style, transform, &brush, None, &path);
+        }
+        1 => {
+            let path = [
+                PathElement::MoveTo(Point::new(100.0, 100.0)),
+                PathElement::LineTo(Point::new(300.0, 100.0)),
+                PathElement::LineTo(Point::new(300.0, 300.0)),
+                PathElement::LineTo(Point::new(100.0, 300.0)),
+                PathElement::Close,
+            ];
+            let gradient = Brush::LinearGradient(LinearGradient {
+                start: Point::new(100.0, 100.0),
+                end: Point::new(300.0, 300.0),
+                extend: piet_scene::ExtendMode::Pad,
+                stops: vec![
+                    GradientStop {
+                        offset: 0.0,
+                        color: Color::rgb8(255, 0, 0),
+                    },
+                    GradientStop {
+                        offset: 0.5,
+                        color: Color::rgb8(0, 255, 0),
+                    },
+                    GradientStop {
+                        offset: 1.0,
+                        color: Color::rgb8(0, 0, 255),
+                    },
+                ].into()
+            });
+            builder.fill(Fill::NonZero, Affine::scale(3.0, 3.0), &gradient, None, &path);
+        }
+        _ => {
+            let xml_str =
+                std::str::from_utf8(include_bytes!("../../piet-gpu/Ghostscript_Tiger.svg"))
+                    .unwrap();
+            let svg = PicoSvg::load(xml_str, 6.0).unwrap();
+            render_svg(&mut builder, &svg, false);
+        }
     }
     scene
 }
@@ -94,7 +129,6 @@ fn convert_bez_path<'a>(path: &'a BezPath) -> impl Iterator<Item = PathElement>
         .map(|el| PathElement::from_kurbo(*el))
 }
 
-
 fn simple_stroke(width: f32) -> Stroke<[f32; 0]> {
     Stroke {
         width,

From 9adeaf3e82dfb25bc8a39c29c4f28a1b464ef70e Mon Sep 17 00:00:00 2001
From: Chad Brokaw <cbrokaw@gmail.com>
Date: Wed, 16 Nov 2022 12:02:11 -0500
Subject: [PATCH 16/17] format

---
 piet-wgsl/src/engine.rs     |  7 ++++++-
 piet-wgsl/src/render.rs     |  7 ++++++-
 piet-wgsl/src/test_scene.rs | 16 +++++++++++++---
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/piet-wgsl/src/engine.rs b/piet-wgsl/src/engine.rs
index 5e89905..9a3556c 100644
--- a/piet-wgsl/src/engine.rs
+++ b/piet-wgsl/src/engine.rs
@@ -305,7 +305,12 @@ impl Recording {
         buf_proxy
     }
 
-    pub fn upload_image(&mut self, width: u32, height: u32, data: impl Into<Vec<u8>>) -> ImageProxy {
+    pub fn upload_image(
+        &mut self,
+        width: u32,
+        height: u32,
+        data: impl Into<Vec<u8>>,
+    ) -> ImageProxy {
         let data = data.into();
         let image_proxy = ImageProxy::new(width, height);
         self.push(Command::UploadImage(image_proxy, data));
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 4cc63e3..4550927 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -143,7 +143,12 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         let width = ramps.width();
         let height = ramps.height();
         let data: &[u8] = bytemuck::cast_slice(data);
-        println!("gradient image: {}x{} ({} bytes)", width, height, data.len());
+        println!(
+            "gradient image: {}x{} ({} bytes)",
+            width,
+            height,
+            data.len()
+        );
         ResourceProxy::Image(recording.upload_image(width, height, data))
     };
     let n_pathtag = data.tag_stream.len();
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index cdbd467..bb6f7ad 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -15,7 +15,10 @@
 // Also licensed under MIT license, at your choice.
 
 use kurbo::BezPath;
-use piet_scene::{Affine, Brush, Color, Fill, LinearGradient, PathElement, Point, Scene, SceneBuilder, Stroke, GradientStop};
+use piet_scene::{
+    Affine, Brush, Color, Fill, GradientStop, LinearGradient, PathElement, Point, Scene,
+    SceneBuilder, Stroke,
+};
 
 use crate::pico_svg::PicoSvg;
 
@@ -68,9 +71,16 @@ pub fn gen_test_scene() -> Scene {
                         offset: 1.0,
                         color: Color::rgb8(0, 0, 255),
                     },
-                ].into()
+                ]
+                .into(),
             });
-            builder.fill(Fill::NonZero, Affine::scale(3.0, 3.0), &gradient, None, &path);
+            builder.fill(
+                Fill::NonZero,
+                Affine::scale(3.0, 3.0),
+                &gradient,
+                None,
+                &path,
+            );
         }
         _ => {
             let xml_str =

From adc98117764a16f4cbe5bfac64c6a97f5149517b Mon Sep 17 00:00:00 2001
From: Chad Brokaw <cbrokaw@gmail.com>
Date: Fri, 18 Nov 2022 17:26:26 -0500
Subject: [PATCH 17/17] add clips and blends

---
 piet-wgsl/shader/clip_leaf.wgsl        |  20 +-
 piet-wgsl/shader/clip_reduce.wgsl      |   2 +-
 piet-wgsl/shader/coarse.wgsl           |  88 ++++++-
 piet-wgsl/shader/draw_leaf.wgsl        |  13 +-
 piet-wgsl/shader/fine.wgsl             |  10 +-
 piet-wgsl/shader/path_coarse_full.wgsl |   2 +-
 piet-wgsl/shader/shared/blend.wgsl     | 351 +++++++++++++++++++++++++
 piet-wgsl/src/debug.rs                 |   5 +
 piet-wgsl/src/debug/clip.rs            |  13 +
 piet-wgsl/src/debug/draw.rs            |  14 +
 piet-wgsl/src/debug/fine.rs            | 153 +++++++++++
 piet-wgsl/src/engine.rs                |  19 +-
 piet-wgsl/src/main.rs                  |   1 +
 piet-wgsl/src/render.rs                |  44 +++-
 piet-wgsl/src/shaders.rs               |  30 +++
 piet-wgsl/src/test_scene.rs            | 200 +++++++++++---
 16 files changed, 901 insertions(+), 64 deletions(-)
 create mode 100644 piet-wgsl/shader/shared/blend.wgsl
 create mode 100644 piet-wgsl/src/debug.rs
 create mode 100644 piet-wgsl/src/debug/clip.rs
 create mode 100644 piet-wgsl/src/debug/draw.rs
 create mode 100644 piet-wgsl/src/debug/fine.rs

diff --git a/piet-wgsl/shader/clip_leaf.wgsl b/piet-wgsl/shader/clip_leaf.wgsl
index f294317..d935550 100644
--- a/piet-wgsl/shader/clip_leaf.wgsl
+++ b/piet-wgsl/shader/clip_leaf.wgsl
@@ -33,15 +33,17 @@ var<workgroup> sh_bbox: array<vec4<f32>, WG_SIZE>;
 var<workgroup> sh_link: array<i32, WG_SIZE>;
 
 fn search_link(bic: ptr<function, Bic>, ix: u32) -> i32 {
+    var ix = ix;
     var j = 0u;
     while j < firstTrailingBit(WG_SIZE) {
         let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
         if ((ix >> j) & 1u) != 0u {
             let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
-            if test.b >= 0u {
-                *bic = test;
-                ix -= 1u << j;
+            if test.b > 0u {
+                break;
             }
+            *bic = test;
+            ix -= 1u << j;
         }
         j += 1u;
     }
@@ -59,7 +61,7 @@ fn search_link(bic: ptr<function, Bic>, ix: u32) -> i32 {
     if ix > 0u {
         return i32(ix) - 1;
     } else {
-        return i32(~(*bic).a);
+        return i32(~0u - (*bic).a);
     }
 }
 
@@ -67,7 +69,9 @@ fn load_clip_inp(ix: u32) -> i32 {
     if ix < config.n_clip {
         return clip_inp[ix];
     } else {
-        return i32(0x80000000);
+        return -2147483648;
+        // literal too large?
+        // return 0x80000000;
     }
 }
 
@@ -129,12 +133,12 @@ fn main(
     sh_bic[local_id.x] = bic;
     if is_push {
         let path_bbox = path_bboxes[inp];
-        bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
+        bbox = vec4<f32>(f32(path_bbox.x0), f32(path_bbox.y0), f32(path_bbox.x1), f32(path_bbox.y1));
     } else {
         bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
     }
     var inbase = 0u;
-    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE) - 1u; i += 1u) {
         let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i));
         workgroupBarrier();
         if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) {
@@ -191,5 +195,5 @@ fn main(
             bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
         }
     }
-    clip_bboxes[global_id.x] = bbox
+    clip_bboxes[global_id.x] = bbox;
 }
diff --git a/piet-wgsl/shader/clip_reduce.wgsl b/piet-wgsl/shader/clip_reduce.wgsl
index 3288b07..50c6402 100644
--- a/piet-wgsl/shader/clip_reduce.wgsl
+++ b/piet-wgsl/shader/clip_reduce.wgsl
@@ -60,7 +60,7 @@ fn main(
         let path_ix = sh_path_ix[local_id.x];
         let path_bbox = path_bboxes[path_ix];
         let parent_ix = sh_parent[local_id.x] + wg_id.x * WG_SIZE;
-        let bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
+        let bbox = vec4<f32>(f32(path_bbox.x0), f32(path_bbox.y0), f32(path_bbox.x1), f32(path_bbox.y1));
         clip_out[global_id.x] = ClipEl(parent_ix, bbox);
     }
 }
diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl
index 5741ec3..b27a215 100644
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@@ -145,9 +145,32 @@ fn write_rad_grad(rad: CmdRadGrad) {
     alloc_cmd(12u);
     ptcl[cmd_offset] = CMD_RAD_GRAD;
     ptcl[cmd_offset + 1u] = rad.index;
+    ptcl[cmd_offset + 2u] = bitcast<u32>(rad.matrx.x);
+    ptcl[cmd_offset + 3u] = bitcast<u32>(rad.matrx.y);
+    ptcl[cmd_offset + 4u] = bitcast<u32>(rad.matrx.z);
+    ptcl[cmd_offset + 5u] = bitcast<u32>(rad.matrx.w);
+    ptcl[cmd_offset + 6u] = bitcast<u32>(rad.xlat.x);
+    ptcl[cmd_offset + 7u] = bitcast<u32>(rad.xlat.y);
+    ptcl[cmd_offset + 8u] = bitcast<u32>(rad.c1.x);
+    ptcl[cmd_offset + 9u] = bitcast<u32>(rad.c1.y);
+    ptcl[cmd_offset + 10u] = bitcast<u32>(rad.ra);
+    ptcl[cmd_offset + 11u] = bitcast<u32>(rad.roff);
     cmd_offset += 12u;
 }
 
+fn write_begin_clip() {
+    alloc_cmd(1u);
+    ptcl[cmd_offset] = CMD_BEGIN_CLIP;
+    cmd_offset += 1u;
+}
+
+fn write_end_clip(blend: u32) {
+    alloc_cmd(2u);
+    ptcl[cmd_offset] = CMD_END_CLIP;
+    ptcl[cmd_offset + 1u] = blend;
+    cmd_offset += 2u;
+}
+
 @compute @workgroup_size(256)
 fn main(
     @builtin(local_invocation_id) local_id: vec3<u32>,
@@ -166,15 +189,20 @@ fn main(
     let this_tile_ix = (bin_tile_y + tile_y) * config.width_in_tiles + bin_tile_x + tile_x;
     cmd_offset = this_tile_ix * PTCL_INITIAL_ALLOC;
     cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM);
-    // TODO: clip state
-    let clip_zero_depth = 0u;
+
+    // clip state
+    var clip_zero_depth = 0u;
+    var clip_depth = 0u;
 
     var partition_ix = 0u;
     var rd_ix = 0u;
     var wr_ix = 0u;
     var part_start_ix = 0u;
     var ready_ix = 0u;
-    // TODO: blend state
+
+    // blend state
+    var render_blend_depth = 0u;
+    var max_blend_depth = 0u;
     
     while true {
         for (var i = 0u; i < N_SLICE; i += 1u) {
@@ -286,8 +314,16 @@ fn main(
             let y = sh_tile_y0[el_ix] + seq_ix / width;
             let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * y + x;
             let tile = tiles[tile_ix];
-            // TODO: this predicate becomes more interesting with clip
-            let include_tile = tile.segments != 0u || tile.backdrop != 0;
+            let is_clip = (tag & 1u) != 0u;
+            var is_blend = false;
+            if is_clip {
+                let BLEND_CLIP = (128u << 8u) | 3u;
+                let scene_offset = draw_monoids[drawobj_ix].scene_offset;
+                let dd = config.drawdata_base + scene_offset;
+                let blend = scene[dd];
+                is_blend = blend != BLEND_CLIP;
+            }
+            let include_tile = tile.segments != 0u || (tile.backdrop == 0) == is_clip || is_blend;
             if include_tile {
                 let el_slice = el_ix / 32u;
                 let el_mask = 1u << (el_ix & 31u);
@@ -324,16 +360,18 @@ fn main(
             if clip_zero_depth == 0u {
                 let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
                 let tile = tiles[tile_ix];
-                let linewidth = bitcast<f32>(info[di]);
-                write_path(tile, linewidth);
                 switch drawtag {
                     // DRAWTAG_FILL_COLOR
                     case 0x44u: {
+                        let linewidth = bitcast<f32>(info[di]);
+                        write_path(tile, linewidth);
                         let rgba_color = scene[dd];
                         write_color(CmdColor(rgba_color));
                     }
                     // DRAWTAG_FILL_LIN_GRADIENT
                     case 0x114u: {
+                        let linewidth = bitcast<f32>(info[di]);
+                        write_path(tile, linewidth);
                         var lin: CmdLinGrad;
                         lin.index = scene[dd];
                         lin.line_x = bitcast<f32>(info[di + 1u]);
@@ -343,6 +381,8 @@ fn main(
                     }
                     // DRAWTAG_FILL_RAD_GRADIENT
                     case 0x2dcu: {
+                        let linewidth = bitcast<f32>(info[di]);
+                        write_path(tile, linewidth);
                         var rad: CmdRadGrad;
                         rad.index = scene[dd];
                         let m0 = bitcast<f32>(info[di + 1u]);
@@ -356,6 +396,40 @@ fn main(
                         rad.roff = bitcast<f32>(info[di + 10u]);
                         write_rad_grad(rad);
                     }
+                    // DRAWTAG_BEGIN_CLIP
+                    case 0x05u: {
+                        if tile.segments == 0u && tile.backdrop == 0 {
+                            clip_zero_depth = clip_depth + 1u;
+                        } else {
+                            write_begin_clip();
+                            render_blend_depth += 1u;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
+                        }
+                        clip_depth += 1u;
+                    }
+                    // DRAWTAG_END_CLIP
+                    case 0x25u: {
+                        clip_depth -= 1u;
+                        write_path(tile, -1.0);
+                        write_end_clip(scene[dd]);
+                        render_blend_depth -= 1u;
+                    }
+                    default: {}
+                }
+            } else {
+                // In "clip zero" state, suppress all drawing
+                switch drawtag {
+                    // DRAWTAG_BEGIN_CLIP
+                    case 0x05u: {
+                        clip_depth += 1u;
+                    }
+                    // DRAWTAG_END_CLIP
+                    case 0x25u: {
+                        if clip_depth == clip_zero_depth {
+                            clip_zero_depth = 0u;
+                        }
+                        clip_depth -= 1u;
+                    }
                     default: {}
                 }
             }
diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl
index 5909fdd..f5140a4 100644
--- a/piet-wgsl/shader/draw_leaf.wgsl
+++ b/piet-wgsl/shader/draw_leaf.wgsl
@@ -17,6 +17,7 @@
 // Finish prefix sum of drawtags, decode draw objects.
 
 #import config
+#import clip
 #import drawtag
 #import bbox
 
@@ -38,6 +39,9 @@ var<storage, read_write> draw_monoid: array<DrawMonoid>;
 @group(0) @binding(5)
 var<storage, read_write> info: array<u32>;
 
+@group(0) @binding(6)
+var<storage, read_write> clip_inp: array<i32>;
+
 let WG_SIZE = 256u;
 
 // Possibly dedup?
@@ -183,4 +187,11 @@ fn main(
             default: {}
         }
     }
-}
\ No newline at end of file
+    if tag_word == DRAWTAG_BEGIN_CLIP || tag_word == DRAWTAG_END_CLIP {
+        var path_ix = ~ix;
+        if tag_word == DRAWTAG_BEGIN_CLIP {
+            path_ix = m.path_ix;
+        }
+        clip_inp[m.clip_ix] = i32(path_ix);
+    }
+}
diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl
index d76edef..5488e01 100644
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@@ -40,6 +40,8 @@ var<storage> segments: array<Segment>;
 var<storage, read_write> output: array<u32>;
 
 #ifdef full
+
+#import blend
 #import ptcl
 
 let GRADIENT_WIDTH = 512;
@@ -90,10 +92,6 @@ fn read_rad_grad(cmd_ix: u32) -> CmdRadGrad {
     return CmdRadGrad(index, matrx, xlat, c1, ra, roff);
 }
 
-fn mix_blend_compose(backdrop: vec4<f32>, src: vec4<f32>, mode: u32) -> vec4<f32> {
-    // TODO: ALL the blend modes. This is just vanilla src-over.
-    return backdrop * (1.0 - src.a) + src;
-}
 #endif
 
 let PIXELS_PER_THREAD = 4u;
@@ -233,7 +231,7 @@ fn main(
                     let fg_i = fg_rgba * area[i];
                     rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
                 }
-                cmd_ix += 12u;
+                cmd_ix += 5u;
             }
             // CMD_RAD_GRAD
             case 7u: {
@@ -278,7 +276,7 @@ fn main(
                     }
                     let bg = unpack4x8unorm(bg_rgba);
                     let fg = rgba[i] * area[i];
-                    rgba[i] = mix_blend_compose(bg, fg, blend);
+                    rgba[i] = blend_mix_compose(bg, fg, blend);
                 }
                 cmd_ix += 2u;
             }
diff --git a/piet-wgsl/shader/path_coarse_full.wgsl b/piet-wgsl/shader/path_coarse_full.wgsl
index fa3609e..d607bac 100644
--- a/piet-wgsl/shader/path_coarse_full.wgsl
+++ b/piet-wgsl/shader/path_coarse_full.wgsl
@@ -213,7 +213,7 @@ fn main(
                 }
                 for (var y = y0; y < y1; y += 1) {
                     let tile_y0 = f32(y) * f32(TILE_HEIGHT);
-                    let xbackdrop = max(xray + 1, 0);
+                    let xbackdrop = max(xray + 1, bbox.x);
                     if xymin.y < tile_y0 && xbackdrop < bbox.z {
                         let backdrop = select(-1, 1, dp.y < 0.0);
                         let tile_ix = base + xbackdrop;
diff --git a/piet-wgsl/shader/shared/blend.wgsl b/piet-wgsl/shader/shared/blend.wgsl
new file mode 100644
index 0000000..34cb55b
--- /dev/null
+++ b/piet-wgsl/shader/shared/blend.wgsl
@@ -0,0 +1,351 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+
+// Color mixing modes
+
+let MIX_NORMAL = 0u;
+let MIX_MULTIPLY = 1u;
+let MIX_SCREEN = 2u;
+let MIX_OVERLAY = 3u;
+let MIX_DARKEN = 4u;
+let MIX_LIGHTEN = 5u;
+let MIX_COLOR_DODGE = 6u;
+let MIX_COLOR_BURN = 7u;
+let MIX_HARD_LIGHT = 8u;
+let MIX_SOFT_LIGHT = 9u;
+let MIX_DIFFERENCE = 10u;
+let MIX_EXCLUSION = 11u;
+let MIX_HUE = 12u;
+let MIX_SATURATION = 13u;
+let MIX_COLOR = 14u;
+let MIX_LUMINOSITY = 15u;
+let MIX_CLIP = 128u;
+
+fn screen(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
+    return cb + cs - (cb * cs);
+}
+
+fn color_dodge(cb: f32, cs: f32) -> f32 {
+    if cb == 0.0 {
+        return 0.0;
+    } else if cs == 1.0 {
+        return 1.0;
+    } else {
+        return min(1.0, cb / (1.0 - cs));
+    }
+}
+
+fn color_burn(cb: f32, cs: f32) -> f32 {
+    if cb == 1.0 {
+        return 1.0;
+    } else if cs == 0.0 {
+        return 0.0;
+    } else {
+        return 1.0 - min(1.0, (1.0 - cb) / cs);
+    }
+}
+
+fn hard_light(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
+	return mix(
+		screen(cb, 2.0 * cs - 1.0),
+		cb * 2.0 * cs,
+        vec3<f32>(cs <= vec3<f32>(0.5))
+	);
+}
+
+fn soft_light(cb: vec3<f32>, cs: vec3<f32>) -> vec3<f32> {
+    let d = mix(
+        sqrt(cb),
+        ((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb,
+        vec3<f32>(cb <= vec3<f32>(0.25))
+    );
+    return mix(
+        cb + (2.0 * cs - vec3(1.0)) * (d - cb),
+        cb - (vec3(1.0) - 2.0 * cs) * cb * (vec3(1.0) - cb),
+        vec3<f32>(cs <= vec3<f32>(0.5))
+    );
+}
+
+fn sat(c: vec3<f32>) -> f32 {
+    return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z));
+}
+
+fn lum(c: vec3<f32>) -> f32 {
+    let f = vec3<f32>(0.3, 0.59, 0.11);
+    return dot(c, f);
+}
+
+fn clip_color(c: vec3<f32>) -> vec3<f32> {
+    var c = c;
+    let l = lum(c);
+    let n = min(c.x, min(c.y, c.z));
+    let x = max(c.x, max(c.y, c.z));
+    if n < 0.0 {
+        c = l + (((c - l) * l) / (l - n));
+    }
+    if x > 1.0 {
+        c = l + (((c - l) * (1.0 - l)) / (x - l));
+    }
+    return c;
+}
+
+fn set_lum(c: vec3<f32>, l: f32) -> vec3<f32> {
+    return clip_color(c + (l - lum(c)));
+}
+
+fn set_sat_inner(
+    cmin: ptr<function, f32>, 
+    cmid: ptr<function, f32>, 
+    cmax: ptr<function, f32>, 
+    s: f32
+) {
+    if *cmax > *cmin {
+        *cmid = ((*cmid - *cmin) * s) / (*cmax - *cmin);
+        *cmax = s;
+    } else {
+        *cmid = 0.0;
+        *cmax = 0.0;
+    }
+    *cmin = 0.0;
+}
+
+fn set_sat(c: vec3<f32>, s: f32) -> vec3<f32> {
+    var r = c.r;
+    var g = c.g;
+    var b = c.b;
+    if r <= g {
+        if g <= b {
+            set_sat_inner(&r, &g, &b, s);
+        } else {
+            if r <= b {
+                set_sat_inner(&r, &b, &g, s);
+            } else {
+                set_sat_inner(&b, &r, &g, s);
+            }
+        }
+    } else {
+        if r <= b {
+            set_sat_inner(&g, &r, &b, s);
+        } else {
+            if g <= b {
+                set_sat_inner(&g, &b, &r, s);
+            } else {
+                set_sat_inner(&b, &g, &r, s);
+            }
+        }
+    }
+    return vec3<f32>(r, g, b);
+}
+
+// Blends two RGB colors together. The colors are assumed to be in sRGB
+// color space, and this function does not take alpha into account.
+fn blend_mix(cb: vec3<f32>, cs: vec3<f32>, mode: u32) -> vec3<f32> {
+    var b = vec3<f32>(0.0);
+    switch mode {
+        // MIX_MULTIPLY
+        case 1u: {
+            b = cb * cs;
+        }
+        // MIX_SCREEN
+        case 2u: {
+            b = screen(cb, cs);
+        }
+        // MIX_OVERLAY
+        case 3u: {
+            b = hard_light(cs, cb);
+        }
+        // MIX_DARKEN
+        case 4u: {
+            b = min(cb, cs);
+        }
+        // MIX_LIGHTEN
+        case 5u: {
+            b = max(cb, cs);
+        }
+        // MIX_COLOR_DODGE
+        case 6u: {
+            b = vec3<f32>(color_dodge(cb.x, cs.x), color_dodge(cb.y, cs.y), color_dodge(cb.z, cs.z));
+        }
+        // MIX_COLOR_BURN
+        case 7u: {
+            b = vec3<f32>(color_burn(cb.x, cs.x), color_burn(cb.y, cs.y), color_burn(cb.z, cs.z));
+        }
+        // MIX_HARD_LIGHT
+        case 8u: {
+            b = hard_light(cb, cs);
+        }
+        // MIX_SOFT_LIGHT
+        case 9u: {
+            b = soft_light(cb, cs);
+        }
+        // MIX_DIFFERENCE
+        case 10u: {
+            b = abs(cb - cs);
+        }
+        // MIX_EXCLUSION
+        case 11u: {
+            b = cb + cs - 2.0 * cb * cs;
+        }
+        // MIX_HUE
+        case 12u: {
+            b = set_lum(set_sat(cs, sat(cb)), lum(cb));
+        }
+        // MIX_SATURATION
+        case 13u: {
+            b = set_lum(set_sat(cb, sat(cs)), lum(cb));
+        }
+        // MIX_COLOR
+        case 14u: {
+            b = set_lum(cs, lum(cb));
+        }
+        // MIX_LUMINOSITY
+        case 15u: {
+            b = set_lum(cb, lum(cs));
+        }
+        default: {
+            b = cs;
+        }
+    }
+    return b;
+}
+
+// Composition modes
+
+let COMPOSE_CLEAR = 0u;
+let COMPOSE_COPY = 1u;
+let COMPOSE_DEST = 2u;
+let COMPOSE_SRC_OVER = 3u;
+let COMPOSE_DEST_OVER = 4u;
+let COMPOSE_SRC_IN = 5u;
+let COMPOSE_DEST_IN = 6u;
+let COMPOSE_SRC_OUT = 7u;
+let COMPOSE_DEST_OUT = 8u;
+let COMPOSE_SRC_ATOP = 9u;
+let COMPOSE_DEST_ATOP = 10u;
+let COMPOSE_XOR = 11u;
+let COMPOSE_PLUS = 12u;
+let COMPOSE_PLUS_LIGHTER = 13u;
+
+// Apply general compositing operation.
+// Inputs are separated colors and alpha, output is premultiplied.
+fn blend_compose(
+    cb: vec3<f32>, 
+    cs: vec3<f32>, 
+    ab: f32, 
+    as_: f32, 
+    mode: u32
+) -> vec4<f32> {
+    var fa = 0.0;
+    var fb = 0.0;
+    switch mode {
+        // COMPOSE_COPY
+        case 1u: {
+            fa = 1.0;
+            fb = 0.0;
+        }
+        // COMPOSE_DEST
+        case 2u: {
+            fa = 0.0;
+            fb = 1.0;
+        }
+        // COMPOSE_SRC_OVER
+        case 3u: {
+            fa = 1.0;
+            fb = 1.0 - as_;
+        }
+        // COMPOSE_DEST_OVER
+        case 4u: {
+            fa = 1.0 - ab;
+            fb = 1.0;
+        }
+        // COMPOSE_SRC_IN
+        case 5u: {
+            fa = ab;
+            fb = 0.0;
+        }
+        // COMPOSE_DEST_IN
+        case 6u: {
+            fa = 0.0;
+            fb = as_;
+        }
+        // COMPOSE_SRC_OUT
+        case 7u: {
+            fa = 1.0 - ab;
+            fb = 0.0;
+        }
+        // COMPOSE_DEST_OUT
+        case 8u: {
+            fa = 0.0;
+            fb = 1.0 - as_;
+        }
+        // COMPOSE_SRC_ATOP
+        case 9u: {
+            fa = ab;
+            fb = 1.0 - as_;
+        }
+        // COMPOSE_DEST_ATOP
+        case 10u: {
+            fa = 1.0 - ab;
+            fb = as_;
+        }
+        // COMPOSE_XOR
+        case 11u: {
+            fa = 1.0 - ab;
+            fb = 1.0 - as_;
+        }
+        // COMPOSE_PLUS
+        case 12u: {
+            fa = 1.0;
+            fb = 1.0;
+        }
+        // COMPOSE_PLUS_LIGHTER
+        case 13u: {
+            return min(vec4<f32>(1.0), vec4<f32>(as_ * cs + ab * cb, as_ + ab));
+        }
+        default: {}
+    }
+    let as_fa = as_ * fa;
+    let ab_fb = ab * fb;
+    let co = as_fa * cs + ab_fb * cb;
+    return vec4<f32>(co, as_fa + ab_fb);
+}
+
+// Apply color mixing and composition. Both input and output colors are
+// premultiplied RGB.
+fn blend_mix_compose(backdrop: vec4<f32>, src: vec4<f32>, mode: u32) -> vec4<f32> {
+    let BLEND_DEFAULT = ((MIX_NORMAL << 8u) | COMPOSE_SRC_OVER);
+    let EPSILON = 1e-15;
+    if (mode & 0x7fffu) == BLEND_DEFAULT {
+        // Both normal+src_over blend and clip case
+        return backdrop * (1.0 - src.a) + src;
+    }
+    // Un-premultiply colors for blending
+    let inv_src_a = 1.0 / (src.a + EPSILON);
+    var cs = src.rgb * inv_src_a;
+    let inv_backdrop_a = 1.0 / (backdrop.a + EPSILON);
+    let cb = backdrop.rgb * inv_backdrop_a;
+    let mix_mode = mode >> 8u;
+    let mixed = blend_mix(cb, cs, mix_mode);
+    cs = mix(cs, mixed, backdrop.a);
+    let compose_mode = mode & 0xffu;
+    if compose_mode == COMPOSE_SRC_OVER {
+        let co = mix(backdrop.rgb, cs, src.a);
+        return vec4<f32>(co, src.a + backdrop.a * (1.0 - src.a));
+    } else {
+        return blend_compose(cb, cs, backdrop.a, src.a, compose_mode);
+    }
+}
diff --git a/piet-wgsl/src/debug.rs b/piet-wgsl/src/debug.rs
new file mode 100644
index 0000000..964c8ee
--- /dev/null
+++ b/piet-wgsl/src/debug.rs
@@ -0,0 +1,5 @@
+#![allow(dead_code)]
+
+pub mod clip;
+pub mod draw;
+pub mod fine;
diff --git a/piet-wgsl/src/debug/clip.rs b/piet-wgsl/src/debug/clip.rs
new file mode 100644
index 0000000..8b2a0e3
--- /dev/null
+++ b/piet-wgsl/src/debug/clip.rs
@@ -0,0 +1,13 @@
+use bytemuck::{Pod, Zeroable};
+
+#[derive(Copy, Clone, Debug, Zeroable, Pod)]
+#[repr(C)]
+pub struct ClipEl {
+    pub parent_ix: u32,
+    pub pad: [u32; 3],
+    pub bbox: [f32; 4],
+}
+
+pub fn parse_clip_els(data: &[u8]) -> Vec<ClipEl> {
+    Vec::from(bytemuck::cast_slice(data))
+}
diff --git a/piet-wgsl/src/debug/draw.rs b/piet-wgsl/src/debug/draw.rs
new file mode 100644
index 0000000..ab56ed5
--- /dev/null
+++ b/piet-wgsl/src/debug/draw.rs
@@ -0,0 +1,14 @@
+use bytemuck::{Pod, Zeroable};
+
+#[derive(Copy, Clone, Debug, Zeroable, Pod)]
+#[repr(C)]
+pub struct DrawMonoid {
+    pub path_ix: u32,
+    pub clip_ix: u32,
+    pub scene_offset: u32,
+    pub info_offset: u32,
+}
+
+pub fn parse_draw_monoids(data: &[u8]) -> Vec<DrawMonoid> {
+    Vec::from(bytemuck::cast_slice(data))
+}
diff --git a/piet-wgsl/src/debug/fine.rs b/piet-wgsl/src/debug/fine.rs
new file mode 100644
index 0000000..d9f05f0
--- /dev/null
+++ b/piet-wgsl/src/debug/fine.rs
@@ -0,0 +1,153 @@
+#[derive(Copy, Clone, Debug)]
+#[repr(C)]
+pub struct Fill {
+    pub tile: u32,
+    pub backdrop: i32,
+}
+
+#[derive(Copy, Clone, Debug)]
+#[repr(C)]
+pub struct Stroke {
+    pub tile: u32,
+    pub half_width: f32,
+}
+
+#[derive(Copy, Clone, Debug)]
+#[repr(C)]
+pub struct Color {
+    abgr: [u8; 4],
+}
+
+#[derive(Copy, Clone, Debug)]
+#[repr(C)]
+pub struct LinGrad {
+    pub index: u32,
+    pub line_x: f32,
+    pub line_y: f32,
+    pub line_c: f32,
+}
+
+#[derive(Copy, Clone, Debug)]
+#[repr(C)]
+pub struct RadGrad {
+    pub index: u32,
+    pub matrix: [f32; 4],
+    pub xlat: [f32; 2],
+    pub c1: [f32; 2],
+    pub ra: f32,
+    pub roff: f32,
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum Command {
+    Fill(Fill),
+    Stroke(Stroke),
+    Solid,
+    Color(Color),
+    LinGrad(LinGrad),
+    RadGrad(RadGrad),
+    BeginClip,
+    EndClip(u32),
+    End,
+}
+
+const PTCL_INITIAL_ALLOC: usize = 64;
+
+#[derive(Debug)]
+pub struct CommandList {
+    pub tiles: Vec<(u32, u32, Vec<Command>)>,
+}
+
+impl CommandList {
+    pub fn parse(width: usize, height: usize, ptcl: &[u8]) -> Self {
+        let mut tiles = vec![];
+        let width_tiles = width / 16;
+        let height_tiles = height / 16;
+        for y in 0..height_tiles {
+            for x in 0..width_tiles {
+                let tile_ix = y * width_tiles + x;
+                let ix = tile_ix * PTCL_INITIAL_ALLOC;
+                let commands = parse_commands(ptcl, ix);
+                if !commands.is_empty() {
+                    tiles.push((x as u32, y as u32, commands));
+                }
+            }
+        }
+        Self { tiles }
+    }
+}
+
+fn parse_commands(ptcl: &[u8], mut ix: usize) -> Vec<Command> {
+    let mut commands = vec![];
+    let words: &[u32] = bytemuck::cast_slice(ptcl);
+    while ix < words.len() {
+        let tag = words[ix];
+        ix += 1;
+        match tag {
+            0 => break,
+            1 => {
+                commands.push(Command::Fill(Fill {
+                    tile: words[ix],
+                    backdrop: words[ix + 1] as i32,
+                }));
+                ix += 2;
+            }
+            2 => {
+                commands.push(Command::Stroke(Stroke {
+                    tile: words[ix],
+                    half_width: bytemuck::cast(words[ix + 1]),
+                }));
+                ix += 2;
+            }
+            3 => {
+                commands.push(Command::Solid);
+            }
+            5 => {
+                commands.push(Command::Color(Color {
+                    abgr: bytemuck::cast(words[ix]),
+                }));
+                ix += 1;
+            }
+            6 => {
+                commands.push(Command::LinGrad(LinGrad {
+                    index: words[ix],
+                    line_x: bytemuck::cast(words[ix + 1]),
+                    line_y: bytemuck::cast(words[ix + 2]),
+                    line_c: bytemuck::cast(words[ix + 3]),
+                }));
+                ix += 4;
+            }
+            7 => {
+                let matrix = [
+                    bytemuck::cast(words[ix + 1]),
+                    bytemuck::cast(words[ix + 2]),
+                    bytemuck::cast(words[ix + 3]),
+                    bytemuck::cast(words[ix + 4]),
+                ];
+                let xlat = [bytemuck::cast(words[ix + 5]), bytemuck::cast(words[ix + 6])];
+                let c1 = [bytemuck::cast(words[ix + 7]), bytemuck::cast(words[ix + 8])];
+                commands.push(Command::RadGrad(RadGrad {
+                    index: words[ix],
+                    matrix,
+                    xlat,
+                    c1,
+                    ra: bytemuck::cast(words[ix + 9]),
+                    roff: bytemuck::cast(words[ix + 10]),
+                }));
+                ix += 11;
+            }
+            9 => {
+                commands.push(Command::BeginClip);
+            }
+            10 => {
+                commands.push(Command::EndClip(words[ix]));
+                ix += 1;
+            }
+            11 => {
+                ix = words[ix] as usize;
+            }
+            _ => {}
+        }
+    }
+    commands
+}
diff --git a/piet-wgsl/src/engine.rs b/piet-wgsl/src/engine.rs
index 9a3556c..049fc80 100644
--- a/piet-wgsl/src/engine.rs
+++ b/piet-wgsl/src/engine.rs
@@ -341,7 +341,10 @@ impl Recording {
 impl BufProxy {
     pub fn new(size: u64) -> Self {
         let id = Id::next();
-        BufProxy { id, size }
+        BufProxy {
+            id,
+            size: size.max(16),
+        }
     }
 }
 
@@ -360,6 +363,20 @@ impl ResourceProxy {
     pub fn new_image(width: u32, height: u32) -> Self {
         Self::Image(ImageProxy::new(width, height))
     }
+
+    pub fn as_buf(&self) -> Option<&BufProxy> {
+        match self {
+            Self::Buf(proxy) => Some(&proxy),
+            _ => None,
+        }
+    }
+
+    pub fn as_image(&self) -> Option<&ImageProxy> {
+        match self {
+            Self::Image(proxy) => Some(&proxy),
+            _ => None,
+        }
+    }
 }
 
 impl From<BufProxy> for ResourceProxy {
diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs
index 2f1d885..4e10486 100644
--- a/piet-wgsl/src/main.rs
+++ b/piet-wgsl/src/main.rs
@@ -22,6 +22,7 @@ use engine::Engine;
 
 use wgpu::{Device, Limits, Queue};
 
+mod debug;
 mod engine;
 mod pico_svg;
 mod ramp;
diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs
index 4550927..ee13694 100644
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@@ -14,6 +14,10 @@ const PATH_BBOX_SIZE: u64 = 24;
 const CUBIC_SIZE: u64 = 40;
 const DRAWMONOID_SIZE: u64 = 16;
 const MAX_DRAWINFO_SIZE: u64 = 44;
+const CLIP_BIC_SIZE: u64 = 8;
+const CLIP_EL_SIZE: u64 = 32;
+const CLIP_INP_SIZE: u64 = 4;
+const CLIP_BBOX_SIZE: u64 = 16;
 const PATH_SIZE: u64 = 32;
 const DRAW_BBOX_SIZE: u64 = 16;
 const BUMP_SIZE: u64 = 16;
@@ -187,7 +191,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let n_path = data.n_path;
     // TODO: calculate for real when we do rectangles
     let n_drawobj = n_path;
-    let n_clip = 0; // TODO: wire up correctly
+    let n_clip = data.n_clip;
     let config = Config {
         width_in_tiles: 64,
         height_in_tiles: 64,
@@ -251,6 +255,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     );
     let draw_monoid_buf = ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE);
     let info_buf = ResourceProxy::new_buf(n_drawobj as u64 * MAX_DRAWINFO_SIZE);
+    let clip_inp_buf = ResourceProxy::new_buf(data.n_clip as u64 * CLIP_INP_SIZE);
     recording.dispatch(
         shaders.draw_leaf,
         (drawobj_wgs, 1, 1),
@@ -261,12 +266,45 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
             path_bbox_buf,
             draw_monoid_buf,
             info_buf,
+            clip_inp_buf,
         ],
     );
+    let clip_el_buf = ResourceProxy::new_buf(data.n_clip as u64 * CLIP_EL_SIZE);
+    let clip_bic_buf =
+        ResourceProxy::new_buf((n_clip / shaders::CLIP_REDUCE_WG) as u64 * CLIP_BIC_SIZE);
+    let clip_wg_reduce = n_clip.saturating_sub(1) / shaders::CLIP_REDUCE_WG;
+    if clip_wg_reduce > 0 {
+        recording.dispatch(
+            shaders.clip_reduce,
+            (clip_wg_reduce, 1, 1),
+            [
+                config_buf,
+                clip_inp_buf,
+                path_bbox_buf,
+                clip_bic_buf,
+                clip_el_buf,
+            ],
+        );
+    }
+    let clip_wg = (n_clip + shaders::CLIP_REDUCE_WG - 1) / shaders::CLIP_REDUCE_WG;
+    let clip_bbox_buf = ResourceProxy::new_buf(n_clip as u64 * CLIP_BBOX_SIZE);
+    if clip_wg > 0 {
+        recording.dispatch(
+            shaders.clip_leaf,
+            (clip_wg, 1, 1),
+            [
+                config_buf,
+                clip_inp_buf,
+                path_bbox_buf,
+                clip_bic_buf,
+                clip_el_buf,
+                draw_monoid_buf,
+                clip_bbox_buf,
+            ],
+        );
+    }
     let draw_bbox_buf = ResourceProxy::new_buf(n_path as u64 * DRAW_BBOX_SIZE);
     let bump_buf = BufProxy::new(BUMP_SIZE);
-    // Not actually used yet.
-    let clip_bbox_buf = ResourceProxy::new_buf(1024);
     let bin_data_buf = ResourceProxy::new_buf(1 << 20);
     let width_in_bins = (config.width_in_tiles + 15) / 16;
     let height_in_bins = (config.height_in_tiles + 15) / 16;
diff --git a/piet-wgsl/src/shaders.rs b/piet-wgsl/src/shaders.rs
index 8e6e89f..0e61710 100644
--- a/piet-wgsl/src/shaders.rs
+++ b/piet-wgsl/src/shaders.rs
@@ -28,6 +28,7 @@ pub const PATHTAG_REDUCE_WG: u32 = 256;
 pub const PATH_BBOX_WG: u32 = 256;
 pub const PATH_COARSE_WG: u32 = 256;
 pub const PATH_DRAWOBJ_WG: u32 = 256;
+pub const CLIP_REDUCE_WG: u32 = 256;
 
 pub struct Shaders {
     pub pathtag_reduce: ShaderId,
@@ -45,6 +46,8 @@ pub struct FullShaders {
     pub pathseg: ShaderId,
     pub draw_reduce: ShaderId,
     pub draw_leaf: ShaderId,
+    pub clip_reduce: ShaderId,
+    pub clip_leaf: ShaderId,
     pub binning: ShaderId,
     pub tile_alloc: ShaderId,
     pub path_coarse: ShaderId,
@@ -178,6 +181,31 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
             BindType::BufReadOnly,
             BindType::Buffer,
             BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let clip_reduce = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("clip_reduce"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
+        ],
+    )?;
+    let clip_leaf = engine.add_shader(
+        device,
+        preprocess::preprocess(&read_shader("clip_leaf"), &empty, &imports).into(),
+        &[
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::BufReadOnly,
+            BindType::Buffer,
+            BindType::Buffer,
         ],
     )?;
     let binning = engine.add_shader(
@@ -265,6 +293,8 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
         pathseg,
         draw_reduce,
         draw_leaf,
+        clip_reduce,
+        clip_leaf,
         binning,
         tile_alloc,
         path_coarse,
diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs
index bb6f7ad..861ac54 100644
--- a/piet-wgsl/src/test_scene.rs
+++ b/piet-wgsl/src/test_scene.rs
@@ -16,8 +16,8 @@
 
 use kurbo::BezPath;
 use piet_scene::{
-    Affine, Brush, Color, Fill, GradientStop, LinearGradient, PathElement, Point, Scene,
-    SceneBuilder, Stroke,
+    Affine, BlendMode, Brush, Color, Compose, ExtendMode, Fill, GradientStop, LinearGradient, Mix,
+    PathElement, Point, RadialGradient, Rect, Scene, SceneBuilder, SceneFragment, Stroke,
 };
 
 use crate::pico_svg::PicoSvg;
@@ -47,40 +47,7 @@ pub fn gen_test_scene() -> Scene {
             builder.stroke(&style, transform, &brush, None, &path);
         }
         1 => {
-            let path = [
-                PathElement::MoveTo(Point::new(100.0, 100.0)),
-                PathElement::LineTo(Point::new(300.0, 100.0)),
-                PathElement::LineTo(Point::new(300.0, 300.0)),
-                PathElement::LineTo(Point::new(100.0, 300.0)),
-                PathElement::Close,
-            ];
-            let gradient = Brush::LinearGradient(LinearGradient {
-                start: Point::new(100.0, 100.0),
-                end: Point::new(300.0, 300.0),
-                extend: piet_scene::ExtendMode::Pad,
-                stops: vec![
-                    GradientStop {
-                        offset: 0.0,
-                        color: Color::rgb8(255, 0, 0),
-                    },
-                    GradientStop {
-                        offset: 0.5,
-                        color: Color::rgb8(0, 255, 0),
-                    },
-                    GradientStop {
-                        offset: 1.0,
-                        color: Color::rgb8(0, 0, 255),
-                    },
-                ]
-                .into(),
-            });
-            builder.fill(
-                Fill::NonZero,
-                Affine::scale(3.0, 3.0),
-                &gradient,
-                None,
-                &path,
-            );
+            render_blend_grid(&mut builder);
         }
         _ => {
             let xml_str =
@@ -90,6 +57,7 @@ pub fn gen_test_scene() -> Scene {
             render_svg(&mut builder, &svg, false);
         }
     }
+    builder.finish();
     scene
 }
 
@@ -151,3 +119,163 @@ fn simple_stroke(width: f32) -> Stroke<[f32; 0]> {
         scale: true,
     }
 }
+
+#[allow(unused)]
+pub fn render_blend_grid(sb: &mut SceneBuilder) {
+    const BLEND_MODES: &[Mix] = &[
+        Mix::Normal,
+        Mix::Multiply,
+        Mix::Darken,
+        Mix::Screen,
+        Mix::Lighten,
+        Mix::Overlay,
+        Mix::ColorDodge,
+        Mix::ColorBurn,
+        Mix::HardLight,
+        Mix::SoftLight,
+        Mix::Difference,
+        Mix::Exclusion,
+        Mix::Hue,
+        Mix::Saturation,
+        Mix::Color,
+        Mix::Luminosity,
+    ];
+    for (ix, &blend) in BLEND_MODES.iter().enumerate() {
+        let i = ix % 4;
+        let j = ix / 4;
+        let transform = Affine::translate(i as f32 * 225., j as f32 * 225.);
+        let square = blend_square(blend.into());
+        sb.append(&square, Some(transform));
+    }
+}
+
+#[allow(unused)]
+fn render_blend_square(sb: &mut SceneBuilder, blend: BlendMode, transform: Affine) {
+    // Inspired by https://developer.mozilla.org/en-US/docs/Web/CSS/mix-blend-mode
+    let rect = Rect::from_origin_size(Point::new(0., 0.), 200., 200.);
+    let stops = &[
+        GradientStop {
+            color: Color::rgb8(0, 0, 0),
+            offset: 0.0,
+        },
+        GradientStop {
+            color: Color::rgb8(255, 255, 255),
+            offset: 1.0,
+        },
+    ][..];
+    let linear = Brush::LinearGradient(LinearGradient {
+        start: Point::new(0.0, 0.0),
+        end: Point::new(200.0, 0.0),
+        stops: stops.into(),
+        extend: ExtendMode::Pad,
+    });
+    sb.fill(Fill::NonZero, transform, &linear, None, rect.elements());
+    const GRADIENTS: &[(f32, f32, Color)] = &[
+        (150., 0., Color::rgb8(255, 240, 64)),
+        (175., 100., Color::rgb8(255, 96, 240)),
+        (125., 200., Color::rgb8(64, 192, 255)),
+    ];
+    for (x, y, c) in GRADIENTS {
+        let mut color2 = c.clone();
+        color2.a = 0;
+        let stops = &[
+            GradientStop {
+                color: c.clone(),
+                offset: 0.0,
+            },
+            GradientStop {
+                color: color2,
+                offset: 1.0,
+            },
+        ][..];
+        let rad = Brush::RadialGradient(RadialGradient {
+            center0: Point::new(*x, *y),
+            center1: Point::new(*x, *y),
+            radius0: 0.0,
+            radius1: 100.0,
+            stops: stops.into(),
+            extend: ExtendMode::Pad,
+        });
+        sb.fill(Fill::NonZero, transform, &rad, None, rect.elements());
+    }
+    const COLORS: &[Color] = &[
+        Color::rgb8(255, 0, 0),
+        Color::rgb8(0, 255, 0),
+        Color::rgb8(0, 0, 255),
+    ];
+    sb.push_layer(Mix::Normal.into(), transform, rect.elements());
+    for (i, c) in COLORS.iter().enumerate() {
+        let stops = &[
+            GradientStop {
+                color: Color::rgb8(255, 255, 255),
+                offset: 0.0,
+            },
+            GradientStop {
+                color: c.clone(),
+                offset: 1.0,
+            },
+        ][..];
+        let linear = Brush::LinearGradient(LinearGradient {
+            start: Point::new(0.0, 0.0),
+            end: Point::new(0.0, 200.0),
+            stops: stops.into(),
+            extend: ExtendMode::Pad,
+        });
+        sb.push_layer(blend, transform, rect.elements());
+        // squash the ellipse
+        let a = transform
+            * Affine::translate(100., 100.)
+            * Affine::rotate(std::f32::consts::FRAC_PI_3 * (i * 2 + 1) as f32)
+            * Affine::scale(1.0, 0.357)
+            * Affine::translate(-100., -100.);
+        sb.fill(
+            Fill::NonZero,
+            a,
+            &linear,
+            None,
+            make_ellipse(100., 100., 90., 90.),
+        );
+        sb.pop_layer();
+    }
+    sb.pop_layer();
+}
+
+#[allow(unused)]
+fn blend_square(blend: BlendMode) -> SceneFragment {
+    let mut fragment = SceneFragment::default();
+    let mut sb = SceneBuilder::for_fragment(&mut fragment);
+    render_blend_square(&mut sb, blend, Affine::IDENTITY);
+    sb.finish();
+    fragment
+}
+
+fn make_ellipse(cx: f32, cy: f32, rx: f32, ry: f32) -> impl Iterator<Item = PathElement> + Clone {
+    let a = 0.551915024494;
+    let arx = a * rx;
+    let ary = a * ry;
+    let elements = [
+        PathElement::MoveTo(Point::new(cx + rx, cy)),
+        PathElement::CurveTo(
+            Point::new(cx + rx, cy + ary),
+            Point::new(cx + arx, cy + ry),
+            Point::new(cx, cy + ry),
+        ),
+        PathElement::CurveTo(
+            Point::new(cx - arx, cy + ry),
+            Point::new(cx - rx, cy + ary),
+            Point::new(cx - rx, cy),
+        ),
+        PathElement::CurveTo(
+            Point::new(cx - rx, cy - ary),
+            Point::new(cx - arx, cy - ry),
+            Point::new(cx, cy - ry),
+        ),
+        PathElement::CurveTo(
+            Point::new(cx + arx, cy - ry),
+            Point::new(cx + rx, cy - ary),
+            Point::new(cx + rx, cy),
+        ),
+        PathElement::Close,
+    ];
+    (0..elements.len()).map(move |i| elements[i])
+}