Merge pull request #150 from linebender/clip

New clip implementation
2025-01-10 12:41:30 +11:00 · 2022-02-21 13:23:31 -08:00 · 2022-02-21 13:23:31 -08:00 · d81e5cb4ee
parent a968f13382 3b67a4e7c1
commit d81e5cb4ee
80 changed files with 3318 additions and 1343 deletions
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@ -20,6 +20,7 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {

 #include "annotated.h"
 #include "bins.h"
+#include "drawtag.h"

 // scale factors useful for converting coordinates to bins
 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
@ -35,6 +36,47 @@ shared uint count[N_SLICE][N_TILE];
 shared Alloc sh_chunk_alloc[N_TILE];
 shared bool sh_alloc_failed;

+// Read the draw monoid entry for `element_ix` from the drawmonoid buffer.
+// Each entry is two consecutive words: path_ix followed by clip_ix.
+DrawMonoid load_draw_monoid(uint element_ix) {
+    // The allocation offset is shifted right by 2 to index the `memory` word array.
+    uint base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix;
+    uint path_ix = memory[base];
+    uint clip_ix = memory[base + 1];
+    return DrawMonoid(path_ix, clip_ix);
+}
+
+// Load bounding box computed by clip processing.
+// Reads four consecutive float-encoded words (x0, y0, x1, y1) at entry
+// `clip_ix` of the clip bbox buffer (conf.clip_bbox_alloc).
+vec4 load_clip_bbox(uint clip_ix) {
+    uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * clip_ix;
+    float x0 = uintBitsToFloat(memory[base]);
+    float y0 = uintBitsToFloat(memory[base + 1]);
+    float x1 = uintBitsToFloat(memory[base + 2]);
+    float y1 = uintBitsToFloat(memory[base + 3]);
+    vec4 bbox = vec4(x0, y0, x1, y1);
+    return bbox;
+}
+
+// Intersect two boxes laid out as (x0, y0, x1, y1): component-wise max of
+// the xy corners and min of the zw corners. No clamping is done here, so
+// the result can have zw < xy when the inputs do not overlap.
+vec4 bbox_intersect(vec4 a, vec4 b) {
+    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
+}
+
+// Load a path's bbox from the bbox buffer (conf.bbox_alloc), as written by pathseg.
+// Entries are 6 words apart; only the first four words are read here. Each
+// word is an unsigned integer that is converted to float and offset by -32768.
+vec4 load_path_bbox(uint path_ix) {
+    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    float bbox_l = float(memory[base]) - 32768.0;
+    float bbox_t = float(memory[base + 1]) - 32768.0;
+    float bbox_r = float(memory[base + 2]) - 32768.0;
+    float bbox_b = float(memory[base + 3]) - 32768.0;
+    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+// Overwrite the bbox of the annotated element at `ref` with `bbox`.
+// The four floats are written to words 1..4 of the element; word 0 (the word
+// that Annotated_tag reads as tag and flags) is left untouched.
+void store_path_bbox(AnnotatedRef ref, vec4 bbox) {
+    uint ix = ref.offset >> 2;
+    memory[ix + 1] = floatBitsToUint(bbox.x);
+    memory[ix + 2] = floatBitsToUint(bbox.y);
+    memory[ix + 3] = floatBitsToUint(bbox.z);
+    memory[ix + 4] = floatBitsToUint(bbox.w);
+}
+
 void main() {
    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;
@ -61,13 +103,27 @@ void main() {
    case Annotated_Image:
    case Annotated_BeginClip:
    case Annotated_EndClip:
-        // Note: we take advantage of the fact that these drawing elements
-        // have the bbox at the same place in their layout.
-        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
-        x0 = int(floor(clip.bbox.x * SX));
-        y0 = int(floor(clip.bbox.y * SY));
-        x1 = int(ceil(clip.bbox.z * SX));
-        y1 = int(ceil(clip.bbox.w * SY));
+        DrawMonoid draw_monoid = load_draw_monoid(element_ix);
+        uint path_ix = draw_monoid.path_ix;
+        vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
+        uint clip_ix = draw_monoid.clip_ix;
+        if (clip_ix > 0) {
+            clip_bbox = load_clip_bbox(clip_ix - 1);
+        }
+        // For clip elements, clip_bbox is the bbox of the clip path, intersected
+        // with enclosing clips.
+        // For other elements, it is the bbox of the enclosing clips.
+
+        vec4 path_bbox = load_path_bbox(path_ix);
+        vec4 bbox = bbox_intersect(path_bbox, clip_bbox);
+        // Avoid negative-size bbox (is this necessary?)
+        bbox.zw = max(bbox.xy, bbox.zw);
+        // Store clip-intersected bbox for tile_alloc.
+        store_path_bbox(ref, bbox);
+        x0 = int(floor(bbox.x * SX));
+        y0 = int(floor(bbox.y * SY));
+        x1 = int(ceil(bbox.z * SX));
+        y1 = int(ceil(bbox.w * SY));
        break;
    }

--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -22,7 +22,7 @@ rule dxil
 rule msl
  command = $spirv_cross --msl $in --output $out $msl_flags

-build gen/binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h
+build gen/binning.spv: glsl binning.comp | annotated.h bins.h drawtag.h setup.h mem.h
 build gen/binning.hlsl: hlsl gen/binning.spv
 build gen/binning.dxil: dxil gen/binning.hlsl
 build gen/binning.msl: msl gen/binning.spv
@ -119,6 +119,16 @@ build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv

-build spv: phony gen/backdrop_lg.spv gen/backdrop.spv gen/bbox_clear.spv gen/binning.spv gen/coarse.spv gen/draw_leaf.spv gen/draw_reduce.spv gen/draw_root.spv gen/kernel4.spv gen/kernel4_gray.spv gen/path_coarse.spv gen/pathseg.spv gen/pathtag_reduce.spv gen/pathtag_root.spv gen/tile_alloc.spv gen/transform_leaf.spv gen/transform_reduce.spv gen/transform_root.spv
-build dxil: phony gen/backdrop.hlsl gen/backdrop_lg.hlsl gen/bbox_clear.hlsl gen/binning.hlsl gen/coarse.hlsl gen/draw_leaf.hlsl gen/draw_reduce.hlsl gen/draw_root.hlsl gen/kernel4.hlsl gen/kernel4_gray.hlsl gen/path_coarse.hlsl gen/pathseg.hlsl gen/pathtag_reduce.hlsl gen/pathtag_root.hlsl gen/tile_alloc.hlsl gen/transform_leaf.hlsl gen/transform_reduce.hlsl gen/transform_root.hlsl
-build msl: phony gen/backdrop_lg.msl gen/backdrop.msl gen/bbox_clear.msl gen/binning.msl gen/coarse.msl gen/draw_leaf.msl gen/draw_reduce.msl gen/draw_root.msl gen/kernel4.msl gen/kernel4_gray.msl gen/path_coarse.msl gen/pathseg.msl gen/pathtag_reduce.msl gen/pathtag_root.msl gen/tile_alloc.msl gen/transform_leaf.msl gen/transform_reduce.msl gen/transform_root.msl
+build gen/clip_reduce.spv: glsl clip_reduce.comp | mem.h setup.h annotated.h
+build gen/clip_reduce.hlsl: hlsl gen/clip_reduce.spv
+build gen/clip_reduce.dxil: dxil gen/clip_reduce.hlsl
+build gen/clip_reduce.msl: msl gen/clip_reduce.spv
+
+build gen/clip_leaf.spv: glsl clip_leaf.comp | mem.h setup.h annotated.h
+build gen/clip_leaf.hlsl: hlsl gen/clip_leaf.spv
+build gen/clip_leaf.dxil: dxil gen/clip_leaf.hlsl
+build gen/clip_leaf.msl: msl gen/clip_leaf.spv
+
+build spv: phony gen/backdrop_lg.spv gen/backdrop.spv gen/bbox_clear.spv gen/binning.spv gen/clip_leaf.spv gen/clip_reduce.spv gen/coarse.spv gen/draw_leaf.spv gen/draw_reduce.spv gen/draw_root.spv gen/kernel4.spv gen/kernel4_gray.spv gen/path_coarse.spv gen/pathseg.spv gen/pathtag_reduce.spv gen/pathtag_root.spv gen/tile_alloc.spv gen/transform_leaf.spv gen/transform_reduce.spv gen/transform_root.spv
+build dxil: phony gen/backdrop.hlsl gen/backdrop_lg.hlsl gen/bbox_clear.hlsl gen/binning.hlsl gen/clip_leaf.hlsl gen/clip_reduce.hlsl gen/coarse.hlsl gen/draw_leaf.hlsl gen/draw_reduce.hlsl gen/draw_root.hlsl gen/kernel4.hlsl gen/kernel4_gray.hlsl gen/path_coarse.hlsl gen/pathseg.hlsl gen/pathtag_reduce.hlsl gen/pathtag_root.hlsl gen/tile_alloc.hlsl gen/transform_leaf.hlsl gen/transform_reduce.hlsl gen/transform_root.hlsl
+build msl: phony gen/backdrop_lg.msl gen/backdrop.msl gen/bbox_clear.msl gen/binning.msl gen/clip_leaf.msl gen/clip_reduce.msl gen/coarse.msl gen/draw_leaf.msl gen/draw_reduce.msl gen/draw_root.msl gen/kernel4.msl gen/kernel4_gray.msl gen/path_coarse.msl gen/pathseg.msl gen/pathtag_reduce.msl gen/pathtag_root.msl gen/tile_alloc.msl gen/transform_leaf.msl gen/transform_reduce.msl gen/transform_root.msl
--- a/piet-gpu/shader/clip_leaf.comp
+++ b/piet-gpu/shader/clip_leaf.comp
@ -0,0 +1,287 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The second dispatch of clip stack processing.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define LG_WG_SIZE (7 + LG_WG_FACTOR)
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE WG_SIZE
+
+layout(local_size_x = WG_SIZE) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+#include "annotated.h"
+
+// Some of this is cut'n'paste duplication with the reduce pass, and
+// arguably should be moved to a common .h file.
+// The bicyclic monoid
+
+// One entry of the clip stack buffer (conf.clip_stack_alloc), as read back
+// by load_clip_el below.
+struct ClipEl {
+    // index of parent node
+    uint parent_ix;
+    // bounding box
+    vec4 bbox;
+};
+
+// Element of the bicyclic monoid. As constructed in main(), a single clip
+// stream entry is Bic(1, 0) for a pop and Bic(0, 1) for a push, so `a`
+// counts pops and `b` counts pushes.
+struct Bic {
+    uint a;
+    uint b;
+};
+
+// Combine two Bic values, with `x` coming before `y` in the stream.
+// min(x.b, y.a) pushes of `x` are matched against pops of `y` and removed
+// from both counts.
+Bic bic_combine(Bic x, Bic y) {
+    uint m = min(x.b, y.a);
+    return Bic(x.a + y.a - m, x.b + y.b - m);
+}
+
+// Load a path's bbox from the bbox buffer (conf.bbox_alloc), as written by pathseg.
+// Entries are 6 words apart; only the first four words are read here. Each
+// word is an unsigned integer that is converted to float and offset by -32768.
+vec4 load_path_bbox(uint path_ix) {
+    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    float bbox_l = float(memory[base]) - 32768.0;
+    float bbox_t = float(memory[base + 1]) - 32768.0;
+    float bbox_r = float(memory[base + 2]) - 32768.0;
+    float bbox_b = float(memory[base + 3]) - 32768.0;
+    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+// Intersect two boxes laid out as (x0, y0, x1, y1): component-wise max of
+// the xy corners and min of the zw corners. The result is not clamped, so
+// zw may end up below xy when the inputs do not overlap.
+vec4 bbox_intersect(vec4 a, vec4 b) {
+    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
+}
+
+// Used twice in main(): first (indices 0..WG_SIZE-1) for the scan of the
+// per-partition Bics, then as the binary tree over this partition's entries.
+shared Bic sh_bic[WG_SIZE * 2 - 2];
+// parent_ix of each loaded prefix-stack entry.
+shared uint sh_stack[PARTITION_SIZE];
+// Scanned (intersected) bboxes of the prefix stack.
+shared vec4 sh_stack_bbox[PARTITION_SIZE];
+// Per-thread link produced by search_link, followed during the bbox scan.
+shared uint sh_link[PARTITION_SIZE];
+// Per-thread bbox, exchanged during the scan along parent links.
+shared vec4 sh_bbox[PARTITION_SIZE];
+
+// This is adapted directly from the stack monoid impl.
+// The return value is a uint that callers reinterpret as a signed int:
+// it is a reference within the partition if >= 0,
+// otherwise a reference to the stack (encoded as ~0u - bic.a).
+// `bic` is updated in place with the values accumulated during the search.
+uint search_link(inout Bic bic) {
+    uint ix = gl_LocalInvocationID.x;
+    uint j = 0;
+    // Upward pass: try to absorb progressively larger tree nodes that end
+    // at ix, stopping at the first one that would leave test.b > 0.
+    while (j < LG_WG_SIZE) {
+        // Start of level j of the tree built in main() (level 0 = leaves).
+        uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
+        if (((ix >> j) & 1) != 0) {
+            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
+            if (test.b > 0) {
+                break;
+            }
+            bic = test;
+            ix -= 1u << j;
+        }
+        j++;
+    }
+    if (ix > 0) {
+        // Downward pass: refine ix by absorbing smaller nodes while the
+        // combined value still has b == 0.
+        while (j > 0) {
+            j--;
+            uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
+            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
+            if (test.b == 0) {
+                bic = test;
+                ix -= 1u << j;
+            }
+        }
+    }
+    // ix is the smallest value such that reduce(ix..th).b == 0
+    if (ix > 0) {
+        return ix - 1;
+    } else {
+        // Negative when viewed as int: -1 - bic.a.
+        return ~0u - bic.a;
+    }
+}
+
+// Load the Bic stored for partition `ix` (two consecutive words: a, b)
+// from the clip Bic buffer (conf.clip_bic_alloc).
+Bic load_bic(uint ix) {
+    uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
+    return Bic(memory[base], memory[base + 1]);
+}
+
+// Load entry `ix` of the clip stack buffer (conf.clip_stack_alloc).
+// Each entry is five words: parent_ix followed by the four float-encoded
+// bbox components (x0, y0, x1, y1).
+ClipEl load_clip_el(uint ix) {
+    uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
+    uint parent_ix = memory[base];
+    float x0 = uintBitsToFloat(memory[base + 1]);
+    float y0 = uintBitsToFloat(memory[base + 2]);
+    float x1 = uintBitsToFloat(memory[base + 3]);
+    float y1 = uintBitsToFloat(memory[base + 4]);
+    vec4 bbox = vec4(x0, y0, x1, y1);
+    return ClipEl(parent_ix, bbox);
+}
+
+// Load entry `ix` of the clip stream (conf.clip_alloc). Indices at or beyond
+// conf.n_clip yield 0x80000000, which has the sign bit set and is therefore
+// not treated as a push by main().
+uint load_path_ix(uint ix) {
+    // This is one approach to a partial final block. Another would be
+    // to do a memset to the padding in the command queue.
+    if (ix < conf.n_clip) {
+        return memory[(conf.clip_alloc.offset >> 2) + ix];
+    } else {
+        // EndClip tags don't implicate further loads.
+        return 0x80000000;
+    }
+}
+
+// Write `bbox` as four float-encoded words (x, y, z, w) to entry `ix` of the
+// clip bbox buffer (conf.clip_bbox_alloc).
+void store_clip_bbox(uint ix, vec4 bbox) {
+    uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
+    memory[base] = floatBitsToUint(bbox.x);
+    memory[base + 1] = floatBitsToUint(bbox.y);
+    memory[base + 2] = floatBitsToUint(bbox.z);
+    memory[base + 3] = floatBitsToUint(bbox.w);
+}
+
+// Entry point: one invocation per clip stream entry. Rebuilds the stack of
+// clips open at the start of this partition, links each entry to its
+// predecessor, and writes one bbox per entry to the clip bbox buffer. For
+// pops it also patches the drawmonoid path_ix to match the corresponding push.
+void main() {
+    // materialize stack up to the start of this partition. This
+    // is based on the pure stack monoid, but with two additions.
+
+    // First, (this only matters if the stack goes deeper than the
+    // partition size, which might be unlikely in practice), the
+    // topmost stack element from each partition is picked, then an
+    // exclusive scan of those. Also note that if this is skipped,
+    // a scan is not needed in the reduce stage.
+
+    // Second, after the stream compaction, do a scan of the retrieved
+    // bbox values.
+    uint th = gl_LocalInvocationID.x;
+    Bic bic = Bic(0, 0);
+    // Each thread loads the Bic of one earlier partition. Since th < WG_SIZE,
+    // partitions with index >= WG_SIZE are never loaded.
+    // NOTE(review): confirm the dispatch never exceeds WG_SIZE partitions.
+    if (th < gl_WorkGroupID.x) {
+        bic = load_bic(th);
+    }
+    sh_bic[th] = bic;
+    // Reverse scan: afterwards sh_bic[th] combines partitions th..WG_SIZE-1.
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (th + (1u << i) < WG_SIZE) {
+            Bic other = sh_bic[th + (1u << i)];
+            bic = bic_combine(bic, other);
+        }
+        barrier();
+        sh_bic[th] = bic;
+    }
+    barrier();
+    uint stack_size = sh_bic[0].b;
+
+    // TODO: do bbox scan here (to unlock greater stack depth)
+
+    // binary search in stack
+    uint sp = PARTITION_SIZE - 1 - th;
+    uint ix = 0;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
+        if (sp < sh_bic[probe].b) {
+            ix = probe;
+        }
+    }
+    // ix is largest value such that sp < sh_bic[ix].b (if any)
+    uint b = sh_bic[ix].b;
+    // Large sentinel box: acts as the identity for bbox_intersect.
+    vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
+    if (sp < b) {
+        // maybe store the index here for future use?
+        ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
+        sh_stack[th] = el.parent_ix;
+        bbox = el.bbox;
+        // other element values here?
+    }
+
+    // forward scan of bbox values of prefix stack
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        sh_stack_bbox[th] = bbox;
+        barrier();
+        if (th >= (1u << i)) {
+            bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
+        }
+        barrier();
+    }
+    sh_stack_bbox[th] = bbox;
+
+    // Read input and compute bicyclic semigroup binary tree
+    uint inp = load_path_ix(gl_GlobalInvocationID.x);
+    // Entries with the sign bit clear are pushes (path indices); all others are pops.
+    bool is_push = int(inp) >= 0;
+    bic = Bic(1 - uint(is_push), uint(is_push));
+    sh_bic[th] = bic;
+    if (is_push) {
+        bbox = load_path_bbox(inp);
+    } else {
+        bbox = vec4(-1e9, -1e9, 1e9, 1e9);
+    }
+    uint inbase = 0;
+    for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
+        uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
+        barrier();
+        if (th < (1u << (LG_WG_SIZE - 1 - i))) {
+            sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
+        }
+        inbase = outbase;
+    }
+    barrier();
+    // Search for predecessor node
+    bic = Bic(0, 0);
+    uint link = search_link(bic);
+    // we use N_SEQ > 1 convention here:
+    // link >= 0 is index within partition
+    // link < 0 is reference to stack
+    // (link is a uint; the sign tests below go through int(link).)
+
+    // We want grandparent bbox for pop nodes, so follow those links.
+    sh_link[th] = link;
+    barrier();
+    uint grandparent;
+    if (int(link) >= 0) {
+        grandparent = sh_link[link];
+    } else {
+        grandparent = link - 1;
+    }
+
+    // Resolve parent
+    uint parent;
+    if (int(link) >= 0) {
+        parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
+    } else if (int(link + stack_size) >= 0) {
+        parent = sh_stack[PARTITION_SIZE + link];
+    } else {
+        parent = ~0u;
+    }
+
+    // bbox scan along parent links
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        // sh_link was already stored for first iteration
+        if (i != 0) {
+            sh_link[th] = link;
+        }
+        sh_bbox[th] = bbox;
+        barrier();
+        if (int(link) >= 0) {
+            bbox = bbox_intersect(sh_bbox[link], bbox);
+            link = sh_link[link];
+        }
+        barrier();
+    }
+    if (int(link + stack_size) >= 0) {
+        bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
+    }
+    // At this point, bbox is the reduction of bounding boxes along the tree.
+    sh_bbox[th] = bbox;
+    barrier();
+
+    uint path_ix = inp;
+    if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
+        // Is this load expensive? If so, it's loaded earlier for in-partition
+        // and is in the ClipEl for cross-partition.
+        // If not, can probably get rid of it in the stack intermediate buf.
+        path_ix = load_path_ix(parent);
+        // For a pop, ~inp is used as the element index into the drawmonoid buffer.
+        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * ~inp;
+        // Fix up drawmonoid so path_ix at EndClip matches BeginClip
+        memory[drawmonoid_out_base] = path_ix;
+
+        if (int(grandparent) >= 0) {
+            bbox = sh_bbox[grandparent];
+        } else if (int(grandparent + stack_size) >= 0) {
+            bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
+        } else {
+            bbox = vec4(-1e9, -1e9, 1e9, 1e9);
+        }
+    }
+    // NOTE(review): this store is not guarded by conf.n_clip, so padding
+    // invocations also write; presumably the clip bbox buffer is sized to a
+    // whole number of partitions - confirm.
+    store_clip_bbox(gl_GlobalInvocationID.x, bbox);
+}
--- a/piet-gpu/shader/clip_reduce.comp
+++ b/piet-gpu/shader/clip_reduce.comp
@ -0,0 +1,148 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The reduce pass for clip stack processing.
+
+// The primary input is a sequence of path ids representing paths to
+// push; a pop is encoded as a value with the high bit set (draw_leaf
+// writes the bitwise NOT of the element index).
+
+// For each path, the bounding box is loaded from the path bbox buffer
+// (bbox_alloc), though this may change.
+
+// Output is a stack monoid reduction for the partition. The Bic
+// is stored in clip_bic_alloc, and the stack slice in clip_stack_alloc.
+
+// Note: for this shader, only pushes are represented in the stack
+// monoid reduction output, so we don't have to worry about the
+// interpretation of pops.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define LG_WG_SIZE (7 + LG_WG_FACTOR)
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE WG_SIZE
+
+layout(local_size_x = WG_SIZE) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+#include "annotated.h"
+
+// The intermediate state for clip processing.
+// Written to the clip stack buffer by store_clip_el below.
+struct ClipEl {
+    // index of parent node
+    uint parent_ix;
+    // bounding box
+    vec4 bbox;
+};
+
+// The bicyclic monoid
+// As constructed in main(), a single clip stream entry is Bic(1, 0) for a
+// pop and Bic(0, 1) for a push, so `a` counts pops and `b` counts pushes.
+struct Bic {
+    uint a;
+    uint b;
+};
+
+// Combine two Bic values, with `x` coming before `y` in the stream.
+// min(x.b, y.a) pushes of `x` are matched against pops of `y` and removed
+// from both counts.
+Bic bic_combine(Bic x, Bic y) {
+    uint m = min(x.b, y.a);
+    return Bic(x.a + y.a - m, x.b + y.b - m);
+}
+
+// Per-thread Bic values for the reverse scan in main().
+shared Bic sh_bic[WG_SIZE];
+// For each compacted stack slot: the thread index of the push that fills it.
+shared uint sh_parent[WG_SIZE];
+// For each compacted stack slot: the path index of that push.
+shared uint sh_path_ix[WG_SIZE];
+// Only referenced by the bbox scan that is currently disabled with #if 0.
+shared vec4 sh_bbox[WG_SIZE];
+
+// Load a path's bbox from the bbox buffer (conf.bbox_alloc), as written by pathseg.
+// Entries are 6 words apart; only the first four words are read here. Each
+// word is an unsigned integer that is converted to float and offset by -32768.
+vec4 load_path_bbox(uint path_ix) {
+    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
+    float bbox_l = float(memory[base]) - 32768.0;
+    float bbox_t = float(memory[base + 1]) - 32768.0;
+    float bbox_r = float(memory[base + 2]) - 32768.0;
+    float bbox_b = float(memory[base + 3]) - 32768.0;
+    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+// Intersect two boxes laid out as (x0, y0, x1, y1): component-wise max of
+// the xy corners and min of the zw corners. The result is not clamped.
+// In this file it is only called from the scan disabled with #if 0.
+vec4 bbox_intersect(vec4 a, vec4 b) {
+    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
+}
+
+// Write `bic` as two consecutive words (a, b) to entry `ix` of the clip Bic
+// buffer (conf.clip_bic_alloc).
+void store_bic(uint ix, Bic bic) {
+    uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
+    memory[base] = bic.a;
+    memory[base + 1] = bic.b;
+}
+
+// Write `el` to entry `ix` of the clip stack buffer (conf.clip_stack_alloc).
+// Each entry is five words: parent_ix followed by the four float-encoded
+// bbox components.
+void store_clip_el(uint ix, ClipEl el) {
+    uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
+    memory[base] = el.parent_ix;
+    memory[base + 1] = floatBitsToUint(el.bbox.x);
+    memory[base + 2] = floatBitsToUint(el.bbox.y);
+    memory[base + 3] = floatBitsToUint(el.bbox.z);
+    memory[base + 4] = floatBitsToUint(el.bbox.w);
+}
+
+// Entry point: reduces one partition of the clip stream. Thread 0 stores the
+// partition's combined Bic; the pushes that are still unmatched at the end of
+// the partition are compacted and stored as ClipEl entries.
+void main() {
+    uint th = gl_LocalInvocationID.x;
+    // NOTE(review): this read is not guarded by conf.n_clip (clip_leaf.comp
+    // guards the equivalent read in load_path_ix); presumably the clip stream
+    // is padded to a whole partition - confirm.
+    uint inp = memory[(conf.clip_alloc.offset >> 2) + gl_GlobalInvocationID.x];
+    // Entries with the sign bit clear are pushes (path indices); all others are pops.
+    bool is_push = int(inp) >= 0;
+    // reverse scan of bicyclic semigroup
+    Bic bic = Bic(1 - uint(is_push), uint(is_push));
+    sh_bic[gl_LocalInvocationID.x] = bic;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (th + (1u << i) < WG_SIZE) {
+            Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)];
+            bic = bic_combine(bic, other);
+        }
+        barrier();
+        sh_bic[th] = bic;
+    }
+    // After the scan, thread 0 holds the reduction of the whole partition.
+    if (th == 0) {
+        store_bic(gl_WorkGroupID.x, bic);
+    }
+    barrier();
+    // Number of pushes left unmatched by the end of the partition.
+    uint size = sh_bic[0].b;
+    // Reduction of everything after this thread's entry (identity for the last thread).
+    bic = Bic(0, 0);
+    if (th + 1 < WG_SIZE) {
+        bic = sh_bic[th + 1];
+    }
+    // A push stays on the stack if nothing after it has an unmatched pop.
+    if (is_push && bic.a == 0) {
+        // Slot counted from the bottom of the compacted stack.
+        uint local_ix = size - bic.b - 1;
+        sh_parent[local_ix] = th;
+        sh_path_ix[local_ix] = inp;
+    }
+    barrier();
+    // Do forward scan of bounding box intersection
+    // bbox and path_ix are only assigned, and only used, for th < size.
+    vec4 bbox;
+    uint path_ix;
+    if (th < size) {
+        path_ix = sh_path_ix[th];
+        bbox = load_path_bbox(path_ix);
+    }
+    // Not necessary if depth is bounded by wg size
+#if 0
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        // We gate so we never access uninit data, but it might
+        // be more efficient to avoid the conditionals.
+        if (th < size) {
+            sh_bbox[th] = bbox;
+        }
+        barrier();
+        if (th < size && th >= (1u << i)) {
+            bbox = bbox_intersect(sh_bbox[th - (1u << i)], bbox);
+        }
+        barrier();
+    }
+#endif
+    if (th < size) {
+        // Convert the in-partition thread index of the push to a global index.
+        uint parent_ix = sh_parent[th] + gl_WorkGroupID.x * PARTITION_SIZE;
+        ClipEl el = ClipEl(parent_ix, bbox);
+        store_clip_el(gl_GlobalInvocationID.x, el);
+    }
+}
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@ -136,9 +136,6 @@ void main() {
    // currently in a clip for which the entire tile has an alpha of zero, and
    // the value is the depth after the "begin clip" of that element.
    uint clip_zero_depth = 0;
-    // State for the "clip one" optimization. If bit `i` is set, then that means
-    // that the clip pushed at depth `i` has an alpha of all one.
-    uint clip_one_mask = 0;

    // I'm sure we can figure out how to do this with at least one fewer register...
    // Items up to rd_ix have been read from sh_elements
@ -227,9 +224,8 @@ void main() {
        case Annotated_LinGradient:
        case Annotated_BeginClip:
        case Annotated_EndClip:
-            // We have one "path" for each element, even if the element isn't
-            // actually a path (currently EndClip, but images etc in the future).
-            uint path_ix = element_ix;
+            uint drawmonoid_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix;
+            uint path_ix = memory[drawmonoid_base];
            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
            uint stride = path.bbox.z - path.bbox.x;
            sh_tile_stride[th_ix] = stride;
@ -283,15 +279,15 @@ void main() {
            uint x = sh_tile_x0[el_ix] + seq_ix % width;
            uint y = sh_tile_y0[el_ix] + seq_ix / width;
            bool include_tile = false;
-            if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
-                include_tile = true;
-            } else if (mem_ok) {
+            if (mem_ok) {
                Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
                                      TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
-                // Include the path in the tile if
-                // - the tile contains at least a segment (tile offset non-zero)
-                // - the tile is completely covered (backdrop non-zero)
-                include_tile = tile.tile.offset != 0 || tile.backdrop != 0;
+                bool is_clip = tag == Annotated_BeginClip || tag == Annotated_EndClip;
+                // Always include the tile if it contains a path segment.
+                // For draws, include the tile if it is solid.
+                // For clips, include the tile if it is empty - this way, logic
+                // below will suppress the drawing of inner elements.
+                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip;
            }
            if (include_tile) {
                uint el_slice = el_ix / 32;
@ -378,33 +374,26 @@ void main() {
                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    if (tile.tile.offset == 0 && tile.backdrop == 0) {
                        clip_zero_depth = clip_depth + 1;
-                    } else if (tile.tile.offset == 0 && clip_depth < 32) {
-                        clip_one_mask |= (1u << clip_depth);
                    } else {
-                        AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                            break;
                        }
-                        write_fill(cmd_alloc, cmd_ref, tag.flags, tile, begin_clip.linewidth);
                        Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
-                        if (clip_depth < 32) {
-                            clip_one_mask &= ~(1u << clip_depth);
-                        }
                    }
                    clip_depth++;
                    break;
                case Annotated_EndClip:
+                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                                     TileRef(sh_tile_base[element_ref_ix] +
+                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    clip_depth--;
-                    if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) {
-                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                            break;
-                        }
-                        Cmd_Solid_write(cmd_alloc, cmd_ref);
-                        cmd_ref.offset += 4;
-                        Cmd_EndClip_write(cmd_alloc, cmd_ref);
-                        cmd_ref.offset += 4;
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        break;
                    }
+                    write_fill(cmd_alloc, cmd_ref, MODE_NONZERO, tile, 0.0);
+                    Cmd_EndClip_write(cmd_alloc, cmd_ref);
+                    cmd_ref.offset += 4;
                    break;
                }
            } else {
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@ -72,9 +72,14 @@ void main() {
    }
    uint out_ix = gl_GlobalInvocationID.x * N_ROWS;
    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 2;
+    uint clip_out_base = conf.clip_alloc.offset >> 2;
    AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + out_ix * Annotated_size);
    for (uint i = 0; i < N_ROWS; i++) {
-        Monoid m = combine_tag_monoid(row, local[i]);
+        Monoid m = row;
+        if (i > 0) {
+            m = combine_tag_monoid(m, local[i - 1]);
+        }
+        // m now holds exclusive scan of draw monoid
        memory[out_base + i * 2] = m.path_ix;
        memory[out_base + i * 2 + 1] = m.clip_ix;

@ -83,8 +88,9 @@ void main() {
        // later stages read scene + bbox etc.
        ElementRef this_ref = Element_index(ref, i);
        tag_word = Element_tag(this_ref).tag;
-        if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage) {
-            uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * (m.path_ix - 1);
+        if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage ||
+            tag_word == Element_BeginClip) {
+            uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * m.path_ix;
            float bbox_l = float(memory[bbox_offset]) - 32768.0;
            float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
            float bbox_r = float(memory[bbox_offset + 2]) - 32768.0;
@ -142,21 +148,27 @@ void main() {
                anno_img.offset = fill_img.offset;
                Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
                break;
+            case Element_BeginClip:
+                AnnoBeginClip anno_begin_clip;
+                anno_begin_clip.bbox = bbox;
+                anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
+                Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip);
+                break;
            }
-        } else if (tag_word == Element_BeginClip) {
-            Clip begin_clip = Element_BeginClip_read(this_ref);
-            AnnoBeginClip anno_begin_clip;
-            // This is the absolute bbox, it's been transformed during encoding.
-            anno_begin_clip.bbox = begin_clip.bbox;
-            anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
-            Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip);
        } else if (tag_word == Element_EndClip) {
-            Clip end_clip = Element_EndClip_read(this_ref);
            AnnoEndClip anno_end_clip;
-            // This bbox is expected to be the same as the begin one.
-            anno_end_clip.bbox = end_clip.bbox;
+            // The actual bbox will be reconstructed from clip stream output.
+            anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9);
            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
        }
+        // Generate clip stream.
+        if (tag_word == Element_BeginClip || tag_word == Element_EndClip) {
+            uint path_ix = ~(out_ix + i);
+            if (tag_word == Element_BeginClip) {
+                path_ix = m.path_ix;
+            }
+            memory[clip_out_base + m.clip_ix] = path_ix;
+        }
        out_ref.offset += Annotated_size;
    }
 }
--- a/piet-gpu/shader/gen/backdrop.hlsl
+++ b/piet-gpu/shader/gen/backdrop.hlsl
@ -44,8 +44,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/backdrop.msl
+++ b/piet-gpu/shader/gen/backdrop.msl
@ -63,8 +63,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/backdrop.spv
+++ b/piet-gpu/shader/gen/backdrop.spv
--- a/piet-gpu/shader/gen/backdrop_lg.hlsl
+++ b/piet-gpu/shader/gen/backdrop_lg.hlsl
@ -44,8 +44,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/backdrop_lg.msl
+++ b/piet-gpu/shader/gen/backdrop_lg.msl
@ -63,8 +63,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/backdrop_lg.spv
+++ b/piet-gpu/shader/gen/backdrop_lg.spv
--- a/piet-gpu/shader/gen/bbox_clear.dxil
+++ b/piet-gpu/shader/gen/bbox_clear.dxil
--- a/piet-gpu/shader/gen/bbox_clear.hlsl
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@ -17,8 +17,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -39,7 +44,7 @@ struct SPIRV_Cross_Input
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x;
-    if (ix < _21.Load(52))
+    if (ix < _21.Load(68))
    {
        uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix);
        _45.Store(out_ix * 4 + 8, 65535u);
--- a/piet-gpu/shader/gen/bbox_clear.msl
+++ b/piet-gpu/shader/gen/bbox_clear.msl
@ -22,8 +22,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/bbox_clear.spv
+++ b/piet-gpu/shader/gen/bbox_clear.spv
--- a/piet-gpu/shader/gen/binning.dxil
+++ b/piet-gpu/shader/gen/binning.dxil
--- a/piet-gpu/shader/gen/binning.hlsl
+++ b/piet-gpu/shader/gen/binning.hlsl
@ -9,16 +9,6 @@ struct MallocResult
    bool failed;
 };

-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-};
-
 struct AnnotatedRef
 {
    uint offset;
@ -40,6 +30,12 @@ struct BinInstance
    uint element_ix;
 };

+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
 struct Config
 {
    uint n_elements;
@ -54,8 +50,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -64,8 +65,8 @@ struct Config

 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

-RWByteAddressBuffer _84 : register(u0, space0);
-ByteAddressBuffer _253 : register(t1, space0);
+RWByteAddressBuffer _94 : register(u0, space0);
+ByteAddressBuffer _202 : register(t1, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -93,7 +94,7 @@ uint read_mem(Alloc alloc, uint offset)
    {
        return 0u;
    }
-    uint v = _84.Load(offset * 4 + 8);
+    uint v = _94.Load(offset * 4 + 8);
    return v;
 }

@ -102,36 +103,53 @@ AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _221 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _221;
+    AnnotatedTag _181 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _181;
 }

-AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
+DrawMonoid load_draw_monoid(uint element_ix)
+{
+    uint base = (_202.Load(44) >> uint(2)) + (2u * element_ix);
+    uint path_ix = _94.Load(base * 4 + 8);
+    uint clip_ix = _94.Load((base + 1u) * 4 + 8);
+    DrawMonoid _222 = { path_ix, clip_ix };
+    return _222;
+}
+
+float4 load_clip_bbox(uint clip_ix)
+{
+    uint base = (_202.Load(60) >> uint(2)) + (4u * clip_ix);
+    float x0 = asfloat(_94.Load(base * 4 + 8));
+    float y0 = asfloat(_94.Load((base + 1u) * 4 + 8));
+    float x1 = asfloat(_94.Load((base + 2u) * 4 + 8));
+    float y1 = asfloat(_94.Load((base + 3u) * 4 + 8));
+    float4 bbox = float4(x0, y0, x1, y1);
+    return bbox;
+}
+
+float4 load_path_bbox(uint path_ix)
+{
+    uint base = (_202.Load(40) >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(_94.Load(base * 4 + 8)) - 32768.0f;
+    float bbox_t = float(_94.Load((base + 1u) * 4 + 8)) - 32768.0f;
+    float bbox_r = float(_94.Load((base + 2u) * 4 + 8)) - 32768.0f;
+    float bbox_b = float(_94.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+float4 bbox_intersect(float4 a, float4 b)
+{
+    return float4(max(a.xy, b.xy), min(a.zw, b.zw));
+}
+
+void store_path_bbox(AnnotatedRef ref, float4 bbox)
 {
    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    AnnoEndClip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    return s;
-}
-
-AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoEndClipRef _228 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoEndClipRef param_1 = _228;
-    return AnnoEndClip_read(param, param_1);
+    _94.Store((ix + 1u) * 4 + 8, asuint(bbox.x));
+    _94.Store((ix + 2u) * 4 + 8, asuint(bbox.y));
+    _94.Store((ix + 3u) * 4 + 8, asuint(bbox.z));
+    _94.Store((ix + 4u) * 4 + 8, asuint(bbox.w));
 }

 Alloc new_alloc(uint offset, uint size, bool mem_ok)
@ -143,22 +161,22 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok)

 MallocResult malloc(uint size)
 {
-    uint _90;
-    _84.InterlockedAdd(0, size, _90);
-    uint offset = _90;
-    uint _97;
-    _84.GetDimensions(_97);
-    _97 = (_97 - 8) / 4;
+    uint _100;
+    _94.InterlockedAdd(0, size, _100);
+    uint offset = _100;
+    uint _107;
+    _94.GetDimensions(_107);
+    _107 = (_107 - 8) / 4;
    MallocResult r;
-    r.failed = (offset + size) > uint(int(_97) * 4);
+    r.failed = (offset + size) > uint(int(_107) * 4);
    uint param = offset;
    uint param_1 = size;
    bool param_2 = !r.failed;
    r.alloc = new_alloc(param, param_1, param_2);
    if (r.failed)
    {
-        uint _119;
-        _84.InterlockedMax(4, 1u, _119);
+        uint _129;
+        _94.InterlockedMax(4, 1u, _129);
        return r;
    }
    return r;
@ -172,7 +190,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
    {
        return;
    }
-    _84.Store(offset * 4 + 8, val);
+    _94.Store(offset * 4 + 8, val);
 }

 void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s)
@ -186,7 +204,7 @@ void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s)

 void comp_main()
 {
-    uint my_n_elements = _253.Load(0);
+    uint my_n_elements = _202.Load(0);
    uint my_partition = gl_WorkGroupID.x;
    for (uint i = 0u; i < 8u; i++)
    {
@ -198,15 +216,15 @@ void comp_main()
    }
    GroupMemoryBarrierWithGroupSync();
    uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
-    AnnotatedRef _308 = { _253.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _308;
+    AnnotatedRef _415 = { _202.Load(32) + (element_ix * 40u) };
+    AnnotatedRef ref = _415;
    uint tag = 0u;
    if (element_ix < my_n_elements)
    {
-        Alloc _318;
-        _318.offset = _253.Load(32);
+        Alloc _425;
+        _425.offset = _202.Load(32);
        Alloc param;
-        param.offset = _318.offset;
+        param.offset = _425.offset;
        AnnotatedRef param_1 = ref;
        tag = Annotated_tag(param, param_1).tag;
    }
@ -222,21 +240,38 @@ void comp_main()
        case 4u:
        case 5u:
        {
-            Alloc _336;
-            _336.offset = _253.Load(32);
-            Alloc param_2;
-            param_2.offset = _336.offset;
-            AnnotatedRef param_3 = ref;
-            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
-            x0 = int(floor(clip.bbox.x * 0.00390625f));
-            y0 = int(floor(clip.bbox.y * 0.00390625f));
-            x1 = int(ceil(clip.bbox.z * 0.00390625f));
-            y1 = int(ceil(clip.bbox.w * 0.00390625f));
+            uint param_2 = element_ix;
+            DrawMonoid draw_monoid = load_draw_monoid(param_2);
+            uint path_ix = draw_monoid.path_ix;
+            float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+            uint clip_ix = draw_monoid.clip_ix;
+            if (clip_ix > 0u)
+            {
+                uint param_3 = clip_ix - 1u;
+                clip_bbox = load_clip_bbox(param_3);
+            }
+            uint param_4 = path_ix;
+            float4 path_bbox = load_path_bbox(param_4);
+            float4 param_5 = path_bbox;
+            float4 param_6 = clip_bbox;
+            float4 bbox = bbox_intersect(param_5, param_6);
+            float4 _473 = bbox;
+            float4 _475 = bbox;
+            float2 _477 = max(_473.xy, _475.zw);
+            bbox.z = _477.x;
+            bbox.w = _477.y;
+            AnnotatedRef param_7 = ref;
+            float4 param_8 = bbox;
+            store_path_bbox(param_7, param_8);
+            x0 = int(floor(bbox.x * 0.00390625f));
+            y0 = int(floor(bbox.y * 0.00390625f));
+            x1 = int(ceil(bbox.z * 0.00390625f));
+            y1 = int(ceil(bbox.w * 0.00390625f));
            break;
        }
    }
-    uint width_in_bins = ((_253.Load(8) + 16u) - 1u) / 16u;
-    uint height_in_bins = ((_253.Load(12) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_202.Load(8) + 16u) - 1u) / 16u;
+    uint height_in_bins = ((_202.Load(12) + 16u) - 1u) / 16u;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
@ -251,8 +286,8 @@ void comp_main()
    uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
    while (y < y1)
    {
-        uint _437;
-        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _437);
+        uint _581;
+        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _581);
        x++;
        if (x == x1)
        {
@ -267,15 +302,15 @@ void comp_main()
        element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x])));
        count[i_1][gl_LocalInvocationID.x] = element_count;
    }
-    uint param_4 = 0u;
-    uint param_5 = 0u;
-    bool param_6 = true;
-    Alloc chunk_alloc = new_alloc(param_4, param_5, param_6);
+    uint param_9 = 0u;
+    uint param_10 = 0u;
+    bool param_11 = true;
+    Alloc chunk_alloc = new_alloc(param_9, param_10, param_11);
    if (element_count != 0u)
    {
-        uint param_7 = element_count * 4u;
-        MallocResult _487 = malloc(param_7);
-        MallocResult chunk = _487;
+        uint param_12 = element_count * 4u;
+        MallocResult _631 = malloc(param_12);
+        MallocResult chunk = _631;
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed)
@ -283,32 +318,32 @@ void comp_main()
            sh_alloc_failed = true;
        }
    }
-    uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc _516;
-    _516.offset = _253.Load(20);
-    Alloc param_8;
-    param_8.offset = _516.offset;
-    uint param_9 = out_ix;
-    uint param_10 = element_count;
-    write_mem(param_8, param_9, param_10);
-    Alloc _528;
-    _528.offset = _253.Load(20);
-    Alloc param_11;
-    param_11.offset = _528.offset;
-    uint param_12 = out_ix + 1u;
-    uint param_13 = chunk_alloc.offset;
-    write_mem(param_11, param_12, param_13);
+    uint out_ix = (_202.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc _660;
+    _660.offset = _202.Load(20);
+    Alloc param_13;
+    param_13.offset = _660.offset;
+    uint param_14 = out_ix;
+    uint param_15 = element_count;
+    write_mem(param_13, param_14, param_15);
+    Alloc _672;
+    _672.offset = _202.Load(20);
+    Alloc param_16;
+    param_16.offset = _672.offset;
+    uint param_17 = out_ix + 1u;
+    uint param_18 = chunk_alloc.offset;
+    write_mem(param_16, param_17, param_18);
    GroupMemoryBarrierWithGroupSync();
-    bool _543;
+    bool _687;
    if (!sh_alloc_failed)
    {
-        _543 = _84.Load(4) != 0u;
+        _687 = _94.Load(4) != 0u;
    }
    else
    {
-        _543 = sh_alloc_failed;
+        _687 = sh_alloc_failed;
    }
-    if (_543)
+    if (_687)
    {
        return;
    }
@ -327,12 +362,12 @@ void comp_main()
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + (idx * 4u);
-            BinInstanceRef _605 = { out_offset };
-            BinInstance _607 = { element_ix };
-            Alloc param_14 = out_alloc;
-            BinInstanceRef param_15 = _605;
-            BinInstance param_16 = _607;
-            BinInstance_write(param_14, param_15, param_16);
+            BinInstanceRef _749 = { out_offset };
+            BinInstance _751 = { element_ix };
+            Alloc param_19 = out_alloc;
+            BinInstanceRef param_20 = _749;
+            BinInstance param_21 = _751;
+            BinInstance_write(param_19, param_20, param_21);
        }
        x++;
        if (x == x1)
--- a/piet-gpu/shader/gen/binning.msl
+++ b/piet-gpu/shader/gen/binning.msl
@ -18,16 +18,6 @@ struct MallocResult
    bool failed;
 };

-struct AnnoEndClipRef
-{
-    uint offset;
-};
-
-struct AnnoEndClip
-{
-    float4 bbox;
-};
-
 struct AnnotatedRef
 {
    uint offset;
@ -49,6 +39,12 @@ struct BinInstance
    uint element_ix;
 };

+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
 struct Memory
 {
    uint mem_offset;
@ -75,8 +71,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -97,7 +98,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }

 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_84, constant uint& v_84BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_94, constant uint& v_94BufferSize)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -105,46 +106,66 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
    {
        return 0u;
    }
-    uint v = v_84.memory[offset];
+    uint v = v_94.memory[offset];
    return v;
 }

 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_94, constant uint& v_94BufferSize)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_84, v_84BufferSize);
+    uint tag_and_flags = read_mem(param, param_1, v_94, v_94BufferSize);
    return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }

 static inline __attribute__((always_inline))
-AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
+DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
 {
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_84, v_84BufferSize);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_84, v_84BufferSize);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_84, v_84BufferSize);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_84, v_84BufferSize);
-    AnnoEndClip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    return s;
+    uint base = (v_202.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
+    uint path_ix = v_94.memory[base];
+    uint clip_ix = v_94.memory[base + 1u];
+    return DrawMonoid{ path_ix, clip_ix };
 }

 static inline __attribute__((always_inline))
-AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
+float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
 {
-    Alloc param = a;
-    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
-    return AnnoEndClip_read(param, param_1, v_84, v_84BufferSize);
+    uint base = (v_202.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
+    float x0 = as_type<float>(v_94.memory[base]);
+    float y0 = as_type<float>(v_94.memory[base + 1u]);
+    float x1 = as_type<float>(v_94.memory[base + 2u]);
+    float y1 = as_type<float>(v_94.memory[base + 3u]);
+    float4 bbox = float4(x0, y0, x1, y1);
+    return bbox;
+}
+
+static inline __attribute__((always_inline))
+float4 load_path_bbox(thread const uint& path_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
+{
+    uint base = (v_202.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(v_94.memory[base]) - 32768.0;
+    float bbox_t = float(v_94.memory[base + 1u]) - 32768.0;
+    float bbox_r = float(v_94.memory[base + 2u]) - 32768.0;
+    float bbox_b = float(v_94.memory[base + 3u]) - 32768.0;
+    float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+static inline __attribute__((always_inline))
+float4 bbox_intersect(thread const float4& a, thread const float4& b)
+{
+    return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw));
+}
+
+static inline __attribute__((always_inline))
+void store_path_bbox(thread const AnnotatedRef& ref, thread const float4& bbox, device Memory& v_94, constant uint& v_94BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    v_94.memory[ix + 1u] = as_type<uint>(bbox.x);
+    v_94.memory[ix + 2u] = as_type<uint>(bbox.y);
+    v_94.memory[ix + 3u] = as_type<uint>(bbox.z);
+    v_94.memory[ix + 4u] = as_type<uint>(bbox.w);
 }

 static inline __attribute__((always_inline))
@ -156,26 +177,26 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }

 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_84, constant uint& v_84BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_94, constant uint& v_94BufferSize)
 {
-    uint _90 = atomic_fetch_add_explicit((device atomic_uint*)&v_84.mem_offset, size, memory_order_relaxed);
-    uint offset = _90;
+    uint _100 = atomic_fetch_add_explicit((device atomic_uint*)&v_94.mem_offset, size, memory_order_relaxed);
+    uint offset = _100;
    MallocResult r;
-    r.failed = (offset + size) > uint(int((v_84BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_94BufferSize - 8) / 4) * 4);
    uint param = offset;
    uint param_1 = size;
    bool param_2 = !r.failed;
    r.alloc = new_alloc(param, param_1, param_2);
    if (r.failed)
    {
-        uint _119 = atomic_fetch_max_explicit((device atomic_uint*)&v_84.mem_error, 1u, memory_order_relaxed);
+        uint _129 = atomic_fetch_max_explicit((device atomic_uint*)&v_94.mem_error, 1u, memory_order_relaxed);
        return r;
    }
    return r;
 }

 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_84, constant uint& v_84BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_94, constant uint& v_94BufferSize)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -183,27 +204,27 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
    {
        return;
    }
-    v_84.memory[offset] = val;
+    v_94.memory[offset] = val;
 }

 static inline __attribute__((always_inline))
-void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_84, constant uint& v_84BufferSize)
+void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_94, constant uint& v_94BufferSize)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = s.element_ix;
-    write_mem(param, param_1, param_2, v_84, v_84BufferSize);
+    write_mem(param, param_1, param_2, v_94, v_94BufferSize);
 }

-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_84 [[buffer(0)]], const device ConfigBuf& _253 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_94 [[buffer(0)]], const device ConfigBuf& v_202 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
    threadgroup uint bitmaps[8][256];
    threadgroup short sh_alloc_failed;
    threadgroup uint count[8][256];
    threadgroup Alloc sh_chunk_alloc[256];
-    constant uint& v_84BufferSize = spvBufferSizeConstants[0];
-    uint my_n_elements = _253.conf.n_elements;
+    constant uint& v_94BufferSize = spvBufferSizeConstants[0];
+    uint my_n_elements = v_202.conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;
    for (uint i = 0u; i < 8u; i++)
    {
@ -215,14 +236,14 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef{ _253.conf.anno_alloc.offset + (element_ix * 40u) };
+    AnnotatedRef ref = AnnotatedRef{ v_202.conf.anno_alloc.offset + (element_ix * 40u) };
    uint tag = 0u;
    if (element_ix < my_n_elements)
    {
        Alloc param;
-        param.offset = _253.conf.anno_alloc.offset;
+        param.offset = v_202.conf.anno_alloc.offset;
        AnnotatedRef param_1 = ref;
-        tag = Annotated_tag(param, param_1, v_84, v_84BufferSize).tag;
+        tag = Annotated_tag(param, param_1, v_94, v_94BufferSize).tag;
    }
    int x0 = 0;
    int y0 = 0;
@ -236,19 +257,38 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
        case 4u:
        case 5u:
        {
-            Alloc param_2;
-            param_2.offset = _253.conf.anno_alloc.offset;
-            AnnotatedRef param_3 = ref;
-            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_84, v_84BufferSize);
-            x0 = int(floor(clip.bbox.x * 0.00390625));
-            y0 = int(floor(clip.bbox.y * 0.00390625));
-            x1 = int(ceil(clip.bbox.z * 0.00390625));
-            y1 = int(ceil(clip.bbox.w * 0.00390625));
+            uint param_2 = element_ix;
+            DrawMonoid draw_monoid = load_draw_monoid(param_2, v_94, v_94BufferSize, v_202);
+            uint path_ix = draw_monoid.path_ix;
+            float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+            uint clip_ix = draw_monoid.clip_ix;
+            if (clip_ix > 0u)
+            {
+                uint param_3 = clip_ix - 1u;
+                clip_bbox = load_clip_bbox(param_3, v_94, v_94BufferSize, v_202);
+            }
+            uint param_4 = path_ix;
+            float4 path_bbox = load_path_bbox(param_4, v_94, v_94BufferSize, v_202);
+            float4 param_5 = path_bbox;
+            float4 param_6 = clip_bbox;
+            float4 bbox = bbox_intersect(param_5, param_6);
+            float4 _473 = bbox;
+            float4 _475 = bbox;
+            float2 _477 = fast::max(_473.xy, _475.zw);
+            bbox.z = _477.x;
+            bbox.w = _477.y;
+            AnnotatedRef param_7 = ref;
+            float4 param_8 = bbox;
+            store_path_bbox(param_7, param_8, v_94, v_94BufferSize);
+            x0 = int(floor(bbox.x * 0.00390625));
+            y0 = int(floor(bbox.y * 0.00390625));
+            x1 = int(ceil(bbox.z * 0.00390625));
+            y1 = int(ceil(bbox.w * 0.00390625));
            break;
        }
    }
-    uint width_in_bins = ((_253.conf.width_in_tiles + 16u) - 1u) / 16u;
-    uint height_in_bins = ((_253.conf.height_in_tiles + 16u) - 1u) / 16u;
+    uint width_in_bins = ((v_202.conf.width_in_tiles + 16u) - 1u) / 16u;
+    uint height_in_bins = ((v_202.conf.height_in_tiles + 16u) - 1u) / 16u;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
@ -263,7 +303,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
    uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
    while (y < y1)
    {
-        uint _437 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
+        uint _581 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
        x++;
        if (x == x1)
        {
@ -278,15 +318,15 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
        element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
        count[i_1][gl_LocalInvocationID.x] = element_count;
    }
-    uint param_4 = 0u;
-    uint param_5 = 0u;
-    bool param_6 = true;
-    Alloc chunk_alloc = new_alloc(param_4, param_5, param_6);
+    uint param_9 = 0u;
+    uint param_10 = 0u;
+    bool param_11 = true;
+    Alloc chunk_alloc = new_alloc(param_9, param_10, param_11);
    if (element_count != 0u)
    {
-        uint param_7 = element_count * 4u;
-        MallocResult _487 = malloc(param_7, v_84, v_84BufferSize);
-        MallocResult chunk = _487;
+        uint param_12 = element_count * 4u;
+        MallocResult _631 = malloc(param_12, v_94, v_94BufferSize);
+        MallocResult chunk = _631;
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed)
@ -294,28 +334,28 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
            sh_alloc_failed = short(true);
        }
    }
-    uint out_ix = (_253.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc param_8;
-    param_8.offset = _253.conf.bin_alloc.offset;
-    uint param_9 = out_ix;
-    uint param_10 = element_count;
-    write_mem(param_8, param_9, param_10, v_84, v_84BufferSize);
-    Alloc param_11;
-    param_11.offset = _253.conf.bin_alloc.offset;
-    uint param_12 = out_ix + 1u;
-    uint param_13 = chunk_alloc.offset;
-    write_mem(param_11, param_12, param_13, v_84, v_84BufferSize);
+    uint out_ix = (v_202.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc param_13;
+    param_13.offset = v_202.conf.bin_alloc.offset;
+    uint param_14 = out_ix;
+    uint param_15 = element_count;
+    write_mem(param_13, param_14, param_15, v_94, v_94BufferSize);
+    Alloc param_16;
+    param_16.offset = v_202.conf.bin_alloc.offset;
+    uint param_17 = out_ix + 1u;
+    uint param_18 = chunk_alloc.offset;
+    write_mem(param_16, param_17, param_18, v_94, v_94BufferSize);
    threadgroup_barrier(mem_flags::mem_threadgroup);
-    bool _543;
+    bool _687;
    if (!bool(sh_alloc_failed))
    {
-        _543 = v_84.mem_error != 0u;
+        _687 = v_94.mem_error != 0u;
    }
    else
    {
-        _543 = bool(sh_alloc_failed);
+        _687 = bool(sh_alloc_failed);
    }
-    if (_543)
+    if (_687)
    {
        return;
    }
@ -334,10 +374,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + (idx * 4u);
-            Alloc param_14 = out_alloc;
-            BinInstanceRef param_15 = BinInstanceRef{ out_offset };
-            BinInstance param_16 = BinInstance{ element_ix };
-            BinInstance_write(param_14, param_15, param_16, v_84, v_84BufferSize);
+            Alloc param_19 = out_alloc;
+            BinInstanceRef param_20 = BinInstanceRef{ out_offset };
+            BinInstance param_21 = BinInstance{ element_ix };
+            BinInstance_write(param_19, param_20, param_21, v_94, v_94BufferSize);
        }
        x++;
        if (x == x1)
--- a/piet-gpu/shader/gen/binning.spv
+++ b/piet-gpu/shader/gen/binning.spv
--- a/piet-gpu/shader/gen/clip_leaf.dxil
+++ b/piet-gpu/shader/gen/clip_leaf.dxil
--- a/piet-gpu/shader/gen/clip_leaf.hlsl
+++ b/piet-gpu/shader/gen/clip_leaf.hlsl
@ -0,0 +1,367 @@
+struct Bic // Counter pair folded by bic_combine(); comp_main seeds each input as { 1 - is_push, is_push }.
+{
+    uint a; // seeded as 1 for a non-push input, 0 for a push
+    uint b; // seeded as 1 for a push input, 0 otherwise
+};
+
+struct ClipEl // Five-word record decoded by load_clip_el().
+{
+    uint parent_ix; // first word of the record
+    float4 bbox; // next four words reinterpreted as floats, in the order (x0, y0, x1, y1)
+};
+
+struct Alloc // Single-offset handle; every user shifts it right by 2 before indexing words, so it looks like a byte offset.
+{
+    uint offset;
+};
+
+struct Config // The raw _80.Load(<byte>) calls in this file line up with these fields, assuming 4 bytes per field and no padding.
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc; // byte 40, read by load_path_bbox
+    Alloc drawmonoid_alloc; // byte 44, read by comp_main
+    Alloc clip_alloc; // byte 48, read by load_path_ix
+    Alloc clip_bic_alloc; // byte 52, read by load_bic
+    Alloc clip_stack_alloc; // byte 56, read by load_clip_el
+    Alloc clip_bbox_alloc; // byte 60, read by store_clip_bbox
+    uint n_trans;
+    uint n_path;
+    uint n_clip; // byte 72, used as the bound in load_path_ix and comp_main
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // matches [numthreads(256, 1, 1)] on main
+
+static const Bic _393 = { 0u, 0u }; // zero Bic used to initialise and reset `bic` in comp_main
+
+ByteAddressBuffer _80 : register(t1, space0); // read-only buffer; presumably holds Config (see offsets noted on that struct)
+RWByteAddressBuffer _96 : register(u0, space0); // read-write buffer addressed as word_index * 4 + 8
+
+static uint3 gl_WorkGroupID; // copied from stage_input in main
+static uint3 gl_LocalInvocationID; // copied from stage_input in main
+static uint3 gl_GlobalInvocationID; // copied from stage_input in main
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Bic sh_bic[510]; // 256 per-thread entries followed by the reduction levels of 128, 64, ..., 2 entries built in comp_main
+groupshared uint sh_stack[256]; // parent_ix of the ClipEl each thread loaded (written only when sp < b)
+groupshared float4 sh_stack_bbox[256]; // running intersection of the loaded ClipEl boxes
+groupshared uint sh_link[256]; // per-thread result of search_link, later overwritten while following links
+groupshared float4 sh_bbox[256]; // per-thread box exchanged while following links
+
+Bic load_bic(uint ix) // Reads the Bic stored as two consecutive words at index ix of the region whose offset sits at byte 52 of _80.
+{
+    uint base = (_80.Load(52) >> uint(2)) + (2u * ix); // offset >> 2 gives a word index; two words per Bic
+    Bic _286 = { _96.Load(base * 4 + 8), _96.Load((base + 1u) * 4 + 8) }; // word index back to bytes; the +8 appears to skip an 8-byte header in _96
+    return _286;
+}
+
+Bic bic_combine(Bic x, Bic y) // Combines x followed by y: sums both counters after cancelling min(x.b, y.a) from each.
+{
+    uint m = min(x.b, y.a); // amount of x.b that is cancelled by y.a
+    Bic _72 = { (x.a + y.a) - m, (x.b + y.b) - m };
+    return _72;
+}
+
+ClipEl load_clip_el(uint ix) // Reads the five-word ClipEl at index ix of the region whose offset sits at byte 56 of _80.
+{
+    uint base = (_80.Load(56) >> uint(2)) + (5u * ix); // five words per record
+    uint parent_ix = _96.Load(base * 4 + 8);
+    float x0 = asfloat(_96.Load((base + 1u) * 4 + 8)); // the four box words are stored as raw float bits
+    float y0 = asfloat(_96.Load((base + 2u) * 4 + 8));
+    float x1 = asfloat(_96.Load((base + 3u) * 4 + 8));
+    float y1 = asfloat(_96.Load((base + 4u) * 4 + 8));
+    float4 bbox = float4(x0, y0, x1, y1);
+    ClipEl _335 = { parent_ix, bbox };
+    return _335;
+}
+
+float4 bbox_intersect(float4 a, float4 b) // Intersects two boxes laid out as (x0, y0, x1, y1): max of the xy corners, min of the zw corners.
+{
+    return float4(max(a.xy, b.xy), min(a.zw, b.zw)); // no check that the result is non-empty
+}
+
+uint load_path_ix(uint ix) // Returns input word ix from the region at byte 48 of _80, or 0x80000000 when ix is not below the count at byte 72.
+{
+    if (ix < _80.Load(72))
+    {
+        return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8);
+    }
+    else
+    {
+        return 2147483648u; // 0x80000000: negative as an int, so comp_main treats it as a non-push input
+    }
+}
+
+float4 load_path_bbox(uint path_ix) // Reads the box of path path_ix from the region at byte 40 of _80 and converts it to floats.
+{
+    uint base = (_80.Load(40) >> uint(2)) + (6u * path_ix); // six words per path record; only the first four are read here
+    float bbox_l = float(_96.Load(base * 4 + 8)) - 32768.0f; // values are stored as unsigned integers biased by 32768
+    float bbox_t = float(_96.Load((base + 1u) * 4 + 8)) - 32768.0f;
+    float bbox_r = float(_96.Load((base + 2u) * 4 + 8)) - 32768.0f;
+    float bbox_b = float(_96.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+uint search_link(inout Bic bic) // Walks the sh_bic reduction tree leftwards from this thread; returns a local index, or 0xFFFFFFFF - bic.a if the walk reaches index 0.
+{
+    uint ix = gl_LocalInvocationID.x;
+    uint j = 0u;
+    while (j < 8u) // upward pass over tree levels 0..7
+    {
+        uint base = 512u - (2u << (8u - j)); // first sh_bic slot of level j (0, 256, 384, ...)
+        if (((ix >> j) & 1u) != 0u) // a left sibling exists at this level
+        {
+            Bic param = sh_bic[(base + (ix >> j)) - 1u]; // node covering the 1 << j entries just before ix
+            Bic param_1 = bic;
+            Bic test = bic_combine(param, param_1);
+            if (test.b > 0u)
+            {
+                break; // that node leaves b > 0 after combining, so stop climbing at this level
+            }
+            bic = test;
+            ix -= (1u << j); // node absorbed; move left past it
+        }
+        j++;
+    }
+    if (ix > 0u)
+    {
+        while (j > 0u) // downward pass through the lower levels
+        {
+            j--;
+            uint base_1 = 512u - (2u << (8u - j));
+            Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
+            Bic param_3 = bic;
+            Bic test_1 = bic_combine(param_2, param_3);
+            if (test_1.b == 0u) // absorb only nodes that keep b at zero
+            {
+                bic = test_1;
+                ix -= (1u << j);
+            }
+        }
+    }
+    if (ix > 0u)
+    {
+        return ix - 1u; // local index of the entry immediately left of the absorbed range
+    }
+    else
+    {
+        return 4294967295u - bic.a; // negative when read as int; comp_main tests this with int(link) >= 0
+    }
+}
+
+void store_clip_bbox(uint ix, float4 bbox) // Writes bbox as four raw float words at index ix of the region whose offset sits at byte 60 of _80.
+{
+    uint base = (_80.Load(60) >> uint(2)) + (4u * ix); // four words per box
+    _96.Store(base * 4 + 8, asuint(bbox.x));
+    _96.Store((base + 1u) * 4 + 8, asuint(bbox.y));
+    _96.Store((base + 2u) * 4 + 8, asuint(bbox.z));
+    _96.Store((base + 3u) * 4 + 8, asuint(bbox.w));
+}
+
+void comp_main() // Kernel body: each invocation computes an intersected box and stores it via store_clip_bbox; non-push inputs also write one drawmonoid word.
+{
+    uint th = gl_LocalInvocationID.x;
+    Bic bic = _393;
+    if (th < gl_WorkGroupID.x) // only Bics with an index below this workgroup's id are loaded; the rest stay zero
+    {
+        uint param = th;
+        bic = load_bic(param);
+    }
+    sh_bic[th] = bic;
+    for (uint i = 0u; i < 8u; i++) // 8 doubling steps; afterwards sh_bic[th] combines entries th..255 in order
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if ((th + (1u << i)) < 256u)
+        {
+            Bic other = sh_bic[th + (1u << i)];
+            Bic param_1 = bic;
+            Bic param_2 = other;
+            bic = bic_combine(param_1, param_2);
+        }
+        GroupMemoryBarrierWithGroupSync(); // all reads of the previous values finish before they are overwritten
+        sh_bic[th] = bic;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    uint stack_size = sh_bic[0].b; // b of the combination of all loaded Bics
+    uint sp = 255u - th; // thread 255 gets sp == 0
+    uint ix = 0u;
+    for (uint i_1 = 0u; i_1 < 8u; i_1++) // binary search with steps 128, 64, ..., 1, keeping probes where sp < sh_bic[probe].b
+    {
+        uint probe = ix + (128u >> i_1);
+        if (sp < sh_bic[probe].b)
+        {
+            ix = probe;
+        }
+    }
+    uint b = sh_bic[ix].b;
+    float4 bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); // very large box, used when no ClipEl is loaded
+    if (sp < b)
+    {
+        uint param_3 = (((ix * 256u) + b) - sp) - 1u; // ClipEl index: 256 slots per ix, offset b - sp - 1 within them
+        ClipEl el = load_clip_el(param_3);
+        sh_stack[th] = el.parent_ix;
+        bbox = el.bbox;
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++) // doubling scan; afterwards bbox is the intersection of the initial boxes of threads 0..th
+    {
+        sh_stack_bbox[th] = bbox;
+        GroupMemoryBarrierWithGroupSync();
+        if (th >= (1u << i_2))
+        {
+            float4 param_4 = sh_stack_bbox[th - (1u << i_2)];
+            float4 param_5 = bbox;
+            bbox = bbox_intersect(param_4, param_5);
+        }
+        GroupMemoryBarrierWithGroupSync();
+    }
+    sh_stack_bbox[th] = bbox;
+    uint param_6 = gl_GlobalInvocationID.x;
+    uint inp = load_path_ix(param_6); // this invocation's input word (0x80000000 when out of range)
+    bool is_push = int(inp) >= 0; // non-negative words are treated as pushes carrying a path index
+    Bic _559 = { 1u - uint(is_push), uint(is_push) };
+    bic = _559;
+    sh_bic[th] = bic; // level 0 of the reduction tree read by search_link
+    if (is_push)
+    {
+        uint param_7 = inp;
+        bbox = load_path_bbox(param_7);
+    }
+    else
+    {
+        bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+    }
+    uint inbase = 0u;
+    for (uint i_3 = 0u; i_3 < 7u; i_3++) // builds the upper tree levels: pass i_3 writes 1 << (7 - i_3) pairwise combinations at outbase
+    {
+        uint outbase = 512u - (1u << (8u - i_3));
+        GroupMemoryBarrierWithGroupSync();
+        if (th < (1u << (7u - i_3)))
+        {
+            Bic param_8 = sh_bic[inbase + (th * 2u)];
+            Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
+            sh_bic[outbase + th] = bic_combine(param_8, param_9);
+        }
+        inbase = outbase;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    bic = _393;
+    Bic param_10 = bic;
+    uint _618 = search_link(param_10); // starts the search from a zero Bic
+    bic = param_10;
+    uint link = _618;
+    sh_link[th] = link;
+    GroupMemoryBarrierWithGroupSync();
+    uint grandparent;
+    if (int(link) >= 0)
+    {
+        grandparent = sh_link[link]; // link is a local index, so follow its link once more
+    }
+    else
+    {
+        grandparent = link - 1u; // negative link: step one further down
+    }
+    uint parent;
+    if (int(link) >= 0)
+    {
+        parent = (gl_WorkGroupID.x * 256u) + link; // local index converted to a global input index
+    }
+    else
+    {
+        if (int(link + stack_size) >= 0) // the negative link still falls within stack_size entries
+        {
+            parent = sh_stack[256u + link]; // unsigned wrap-around: link == 0xFFFFFFFF selects slot 255
+        }
+        else
+        {
+            parent = 4294967295u; // no parent; load_path_ix maps this to 0x80000000
+        }
+    }
+    for (uint i_4 = 0u; i_4 < 8u; i_4++) // 8 passes; each intersects with the box at `link`, then replaces link by that entry's link
+    {
+        if (i_4 != 0u)
+        {
+            sh_link[th] = link; // pass 0 reuses the value stored before the loop
+        }
+        sh_bbox[th] = bbox;
+        GroupMemoryBarrierWithGroupSync();
+        if (int(link) >= 0)
+        {
+            float4 param_11 = sh_bbox[link];
+            float4 param_12 = bbox;
+            bbox = bbox_intersect(param_11, param_12);
+            link = sh_link[link];
+        }
+        GroupMemoryBarrierWithGroupSync();
+    }
+    if (int(link + stack_size) >= 0) // the remaining negative link points into the loaded stack
+    {
+        float4 param_13 = sh_stack_bbox[256u + link];
+        float4 param_14 = bbox;
+        bbox = bbox_intersect(param_13, param_14);
+    }
+    sh_bbox[th] = bbox;
+    GroupMemoryBarrierWithGroupSync();
+    uint path_ix = inp;
+    bool _717 = !is_push;
+    bool _725;
+    if (_717) // expanded form of (!is_push && gl_GlobalInvocationID.x < n_clip)
+    {
+        _725 = gl_GlobalInvocationID.x < _80.Load(72);
+    }
+    else
+    {
+        _725 = _717;
+    }
+    if (_725)
+    {
+        uint param_15 = parent;
+        path_ix = load_path_ix(param_15); // the input word found at the parent index
+        uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (2u * (~inp)); // ~inp is used as the record index; two words per record
+        _96.Store(drawmonoid_out_base * 4 + 8, path_ix); // only the first word of the record is written here
+        if (int(grandparent) >= 0)
+        {
+            bbox = sh_bbox[grandparent];
+        }
+        else
+        {
+            if (int(grandparent + stack_size) >= 0)
+            {
+                bbox = sh_stack_bbox[256u + grandparent];
+            }
+            else
+            {
+                bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+            }
+        }
+    }
+    uint param_16 = gl_GlobalInvocationID.x;
+    float4 param_17 = bbox;
+    store_clip_bbox(param_16, param_17); // executed by every invocation, with no range check on the index
+}
+
+[numthreads(256, 1, 1)] // 256 threads per group, matching gl_WorkGroupSize
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system-value inputs into the file-scope statics, then runs comp_main.
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
--- a/piet-gpu/shader/gen/clip_leaf.msl
+++ b/piet-gpu/shader/gen/clip_leaf.msl
@ -0,0 +1,366 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Bic // Counter pair folded by bic_combine(); main0 seeds each input as Bic{ 1 - is_push, is_push }.
+{
+    uint a; // seeded as 1 for a non-push input, 0 for a push
+    uint b; // seeded as 1 for a push input, 0 otherwise
+};
+
+struct ClipEl // Five-word record decoded by load_clip_el().
+{
+    uint parent_ix; // first word of the record
+    float4 bbox; // next four words reinterpreted as floats, in the order (x0, y0, x1, y1)
+};
+
+struct Alloc // Single-offset handle; every user shifts it right by 2 before indexing Memory::memory, so it looks like a byte offset.
+{
+    uint offset;
+};
+
+struct Config // Configuration read through ConfigBuf; only the fields annotated below are used in this file.
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc; // read by load_path_bbox
+    Alloc drawmonoid_alloc; // written through in main0
+    Alloc clip_alloc; // read by load_path_ix
+    Alloc clip_bic_alloc; // read by load_bic
+    Alloc clip_stack_alloc; // read by load_clip_el
+    Alloc clip_bbox_alloc; // written through by store_clip_bbox
+    uint n_trans;
+    uint n_path;
+    uint n_clip; // bound used in load_path_ix and main0
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf // Buffer wrapper around Config; bound at [[buffer(1)]] as a const device reference in main0.
+{
+    Config conf;
+};
+
+struct Memory // Read-write buffer bound at [[buffer(0)]] in main0: two header words followed by the word array.
+{
+    uint mem_offset; // not accessed in this file
+    uint mem_error; // not accessed in this file
+    uint memory[1]; // indexed well past 1 by the helpers, so effectively an unsized array over the rest of the buffer
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 threads per threadgroup; the 256-entry threadgroup arrays in main0 rely on this
+
+static inline __attribute__((always_inline)) // load_bic: reads the Bic stored as two consecutive words at index ix of clip_bic_alloc.
+Bic load_bic(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+    uint base = (v_80.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); // offset >> 2 gives a word index; two words per Bic
+    return Bic{ v_96.memory[base], v_96.memory[base + 1u] };
+}
+
+static inline __attribute__((always_inline)) // bic_combine: combines x followed by y, cancelling min(x.b, y.a) from both summed counters.
+Bic bic_combine(thread const Bic& x, thread const Bic& y)
+{
+    uint m = min(x.b, y.a); // amount of x.b that is cancelled by y.a
+    return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
+}
+
+static inline __attribute__((always_inline)) // load_clip_el: reads the five-word ClipEl at index ix of clip_stack_alloc.
+ClipEl load_clip_el(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+    uint base = (v_80.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); // five words per record
+    uint parent_ix = v_96.memory[base];
+    float x0 = as_type<float>(v_96.memory[base + 1u]); // the four box words are stored as raw float bits
+    float y0 = as_type<float>(v_96.memory[base + 2u]);
+    float x1 = as_type<float>(v_96.memory[base + 3u]);
+    float y1 = as_type<float>(v_96.memory[base + 4u]);
+    float4 bbox = float4(x0, y0, x1, y1);
+    return ClipEl{ parent_ix, bbox };
+}
+
+static inline __attribute__((always_inline)) // bbox_intersect: intersects two (x0, y0, x1, y1) boxes; no check that the result is non-empty.
+float4 bbox_intersect(thread const float4& a, thread const float4& b)
+{
+    return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); // max of the xy corners, min of the zw corners
+}
+
+static inline __attribute__((always_inline)) // load_path_ix: returns input word ix of clip_alloc, or 0x80000000 when ix >= n_clip.
+uint load_path_ix(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+    if (ix < v_80.conf.n_clip)
+    {
+        return v_96.memory[(v_80.conf.clip_alloc.offset >> uint(2)) + ix];
+    }
+    else
+    {
+        return 2147483648u; // 0x80000000: negative as an int, so main0 treats it as a non-push input
+    }
+}
+
+static inline __attribute__((always_inline)) // load_path_bbox: reads the box of path path_ix from bbox_alloc and converts it to floats.
+float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96)
+{
+    uint base = (v_80.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix); // six words per path record; only the first four are read here
+    float bbox_l = float(v_96.memory[base]) - 32768.0; // values are stored as unsigned integers biased by 32768
+    float bbox_t = float(v_96.memory[base + 1u]) - 32768.0;
+    float bbox_r = float(v_96.memory[base + 2u]) - 32768.0;
+    float bbox_b = float(v_96.memory[base + 3u]) - 32768.0;
+    float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+static inline __attribute__((always_inline)) // search_link: walks the sh_bic reduction tree leftwards from this thread; returns a local index, or 0xFFFFFFFF - bic.a if the walk reaches index 0.
+uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[510])
+{
+    uint ix = gl_LocalInvocationID.x;
+    uint j = 0u;
+    while (j < 8u) // upward pass over tree levels 0..7
+    {
+        uint base = 512u - (2u << (8u - j)); // first sh_bic slot of level j (0, 256, 384, ...)
+        if (((ix >> j) & 1u) != 0u) // a left sibling exists at this level
+        {
+            Bic param = sh_bic[(base + (ix >> j)) - 1u]; // node covering the 1 << j entries just before ix
+            Bic param_1 = bic;
+            Bic test = bic_combine(param, param_1);
+            if (test.b > 0u)
+            {
+                break; // that node leaves b > 0 after combining, so stop climbing at this level
+            }
+            bic = test;
+            ix -= (1u << j); // node absorbed; move left past it
+        }
+        j++;
+    }
+    if (ix > 0u)
+    {
+        while (j > 0u) // downward pass through the lower levels
+        {
+            j--;
+            uint base_1 = 512u - (2u << (8u - j));
+            Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
+            Bic param_3 = bic;
+            Bic test_1 = bic_combine(param_2, param_3);
+            if (test_1.b == 0u) // absorb only nodes that keep b at zero
+            {
+                bic = test_1;
+                ix -= (1u << j);
+            }
+        }
+    }
+    if (ix > 0u)
+    {
+        return ix - 1u; // local index of the entry immediately left of the absorbed range
+    }
+    else
+    {
+        return 4294967295u - bic.a; // negative when read as int; main0 tests this with int(link) >= 0
+    }
+}
+
+static inline __attribute__((always_inline)) // store_clip_bbox: writes bbox as four raw float words at index ix of clip_bbox_alloc.
+void store_clip_bbox(thread const uint& ix, thread const float4& bbox, const device ConfigBuf& v_80, device Memory& v_96)
+{
+    uint base = (v_80.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * ix); // four words per box
+    v_96.memory[base] = as_type<uint>(bbox.x);
+    v_96.memory[base + 1u] = as_type<uint>(bbox.y);
+    v_96.memory[base + 2u] = as_type<uint>(bbox.z);
+    v_96.memory[base + 3u] = as_type<uint>(bbox.w);
+}
+
+kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_80 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) // Kernel: each invocation computes an intersected box and stores it via store_clip_bbox; non-push inputs also write one drawmonoid word.
+{
+    threadgroup Bic sh_bic[510]; // 256 per-thread entries followed by reduction levels of 128, 64, ..., 2 entries
+    threadgroup uint sh_stack[256]; // parent_ix of the ClipEl each thread loaded (written only when sp < b)
+    threadgroup float4 sh_stack_bbox[256]; // running intersection of the loaded ClipEl boxes
+    threadgroup uint sh_link[256]; // per-thread result of search_link, later overwritten while following links
+    threadgroup float4 sh_bbox[256]; // per-thread box exchanged while following links
+    uint th = gl_LocalInvocationID.x;
+    Bic bic = Bic{ 0u, 0u };
+    if (th < gl_WorkGroupID.x) // only Bics with an index below this threadgroup's id are loaded; the rest stay zero
+    {
+        uint param = th;
+        bic = load_bic(param, v_80, v_96);
+    }
+    sh_bic[th] = bic;
+    for (uint i = 0u; i < 8u; i++) // 8 doubling steps; afterwards sh_bic[th] combines entries th..255 in order
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if ((th + (1u << i)) < 256u)
+        {
+            Bic other = sh_bic[th + (1u << i)];
+            Bic param_1 = bic;
+            Bic param_2 = other;
+            bic = bic_combine(param_1, param_2);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup); // all reads of the previous values finish before they are overwritten
+        sh_bic[th] = bic;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint stack_size = sh_bic[0].b; // b of the combination of all loaded Bics
+    uint sp = 255u - th; // thread 255 gets sp == 0
+    uint ix = 0u;
+    for (uint i_1 = 0u; i_1 < 8u; i_1++) // binary search with steps 128, 64, ..., 1, keeping probes where sp < sh_bic[probe].b
+    {
+        uint probe = ix + (128u >> i_1);
+        if (sp < sh_bic[probe].b)
+        {
+            ix = probe;
+        }
+    }
+    uint b = sh_bic[ix].b;
+    float4 bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); // very large box, used when no ClipEl is loaded
+    if (sp < b)
+    {
+        uint param_3 = (((ix * 256u) + b) - sp) - 1u; // ClipEl index: 256 slots per ix, offset b - sp - 1 within them
+        ClipEl el = load_clip_el(param_3, v_80, v_96);
+        sh_stack[th] = el.parent_ix;
+        bbox = el.bbox;
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++) // doubling scan; afterwards bbox is the intersection of the initial boxes of threads 0..th
+    {
+        sh_stack_bbox[th] = bbox;
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (th >= (1u << i_2))
+        {
+            float4 param_4 = sh_stack_bbox[th - (1u << i_2)];
+            float4 param_5 = bbox;
+            bbox = bbox_intersect(param_4, param_5);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    sh_stack_bbox[th] = bbox;
+    uint param_6 = gl_GlobalInvocationID.x;
+    uint inp = load_path_ix(param_6, v_80, v_96); // this invocation's input word (0x80000000 when out of range)
+    bool is_push = int(inp) >= 0; // non-negative words are treated as pushes carrying a path index
+    bic = Bic{ 1u - uint(is_push), uint(is_push) };
+    sh_bic[th] = bic; // level 0 of the reduction tree read by search_link
+    if (is_push)
+    {
+        uint param_7 = inp;
+        bbox = load_path_bbox(param_7, v_80, v_96);
+    }
+    else
+    {
+        bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+    }
+    uint inbase = 0u;
+    for (uint i_3 = 0u; i_3 < 7u; i_3++) // builds the upper tree levels: pass i_3 writes 1 << (7 - i_3) pairwise combinations at outbase
+    {
+        uint outbase = 512u - (1u << (8u - i_3));
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (th < (1u << (7u - i_3)))
+        {
+            Bic param_8 = sh_bic[inbase + (th * 2u)];
+            Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
+            sh_bic[outbase + th] = bic_combine(param_8, param_9);
+        }
+        inbase = outbase;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    bic = Bic{ 0u, 0u };
+    Bic param_10 = bic;
+    uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic); // starts the search from a zero Bic
+    bic = param_10;
+    uint link = _618;
+    sh_link[th] = link;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint grandparent;
+    if (int(link) >= 0)
+    {
+        grandparent = sh_link[link]; // link is a local index, so follow its link once more
+    }
+    else
+    {
+        grandparent = link - 1u; // negative link: step one further down
+    }
+    uint parent;
+    if (int(link) >= 0)
+    {
+        parent = (gl_WorkGroupID.x * 256u) + link; // local index converted to a global input index
+    }
+    else
+    {
+        if (int(link + stack_size) >= 0) // the negative link still falls within stack_size entries
+        {
+            parent = sh_stack[256u + link]; // unsigned wrap-around: link == 0xFFFFFFFF selects slot 255
+        }
+        else
+        {
+            parent = 4294967295u; // no parent; load_path_ix maps this to 0x80000000
+        }
+    }
+    for (uint i_4 = 0u; i_4 < 8u; i_4++) // 8 passes; each intersects with the box at `link`, then replaces link by that entry's link
+    {
+        if (i_4 != 0u)
+        {
+            sh_link[th] = link; // pass 0 reuses the value stored before the loop
+        }
+        sh_bbox[th] = bbox;
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (int(link) >= 0)
+        {
+            float4 param_11 = sh_bbox[link];
+            float4 param_12 = bbox;
+            bbox = bbox_intersect(param_11, param_12);
+            link = sh_link[link];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    if (int(link + stack_size) >= 0) // the remaining negative link points into the loaded stack
+    {
+        float4 param_13 = sh_stack_bbox[256u + link];
+        float4 param_14 = bbox;
+        bbox = bbox_intersect(param_13, param_14);
+    }
+    sh_bbox[th] = bbox;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint path_ix = inp;
+    bool _717 = !is_push;
+    bool _725;
+    if (_717) // expanded form of (!is_push && gl_GlobalInvocationID.x < n_clip)
+    {
+        _725 = gl_GlobalInvocationID.x < v_80.conf.n_clip;
+    }
+    else
+    {
+        _725 = _717;
+    }
+    if (_725)
+    {
+        uint param_15 = parent;
+        path_ix = load_path_ix(param_15, v_80, v_96); // the input word found at the parent index
+        uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * (~inp)); // ~inp is used as the record index; two words per record
+        v_96.memory[drawmonoid_out_base] = path_ix; // only the first word of the record is written here
+        if (int(grandparent) >= 0)
+        {
+            bbox = sh_bbox[grandparent];
+        }
+        else
+        {
+            if (int(grandparent + stack_size) >= 0)
+            {
+                bbox = sh_stack_bbox[256u + grandparent];
+            }
+            else
+            {
+                bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+            }
+        }
+    }
+    uint param_16 = gl_GlobalInvocationID.x;
+    float4 param_17 = bbox;
+    store_clip_bbox(param_16, param_17, v_80, v_96); // executed by every invocation, with no range check on the index
+}
+
--- a/piet-gpu/shader/gen/clip_leaf.spv
+++ b/piet-gpu/shader/gen/clip_leaf.spv
--- a/piet-gpu/shader/gen/clip_reduce.dxil
+++ b/piet-gpu/shader/gen/clip_reduce.dxil
--- a/piet-gpu/shader/gen/clip_reduce.hlsl
+++ b/piet-gpu/shader/gen/clip_reduce.hlsl
@ -0,0 +1,177 @@
+struct Bic // Counter pair folded by bic_combine(); comp_main seeds each input as { 1 - is_push, is_push }.
+{
+    uint a; // seeded as 1 for a non-push input, 0 for a push
+    uint b; // seeded as 1 for a push input, 0 otherwise
+};
+
+struct ClipEl // Five-word record written by store_clip_el().
+{
+    uint parent_ix; // first word of the record
+    float4 bbox; // next four words, stored as raw float bits in x, y, z, w order
+};
+
+struct Alloc // Single-offset handle; every user shifts it right by 2 before indexing words, so it looks like a byte offset.
+{
+    uint offset;
+};
+
+struct Config // The raw _64.Load(<byte>) calls in this file line up with these fields, assuming 4 bytes per field and no padding.
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc; // byte 40, read by load_path_bbox
+    Alloc drawmonoid_alloc;
+    Alloc clip_alloc; // byte 48, read by comp_main
+    Alloc clip_bic_alloc; // byte 52, read by store_bic
+    Alloc clip_stack_alloc; // byte 56, read by store_clip_el
+    Alloc clip_bbox_alloc;
+    uint n_trans;
+    uint n_path;
+    uint n_clip;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); // matches [numthreads(256, 1, 1)] on main
+
+static const Bic _267 = { 0u, 0u }; // zero Bic used to reset `bic` in comp_main
+
+ByteAddressBuffer _64 : register(t1, space0); // read-only buffer; presumably holds Config (see offsets noted on that struct)
+RWByteAddressBuffer _80 : register(u0, space0); // read-write buffer addressed as word_index * 4 + 8
+
+static uint3 gl_WorkGroupID; // copied from stage_input in main
+static uint3 gl_LocalInvocationID; // copied from stage_input in main
+static uint3 gl_GlobalInvocationID; // copied from stage_input in main
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Bic sh_bic[256]; // per-thread Bics, turned into a suffix combination by comp_main
+groupshared uint sh_parent[256]; // local thread index of each surviving push
+groupshared uint sh_path_ix[256]; // input word of each surviving push
+groupshared float4 sh_bbox[256]; // declared but not referenced by any function in this file
+
+Bic bic_combine(Bic x, Bic y) // Combines x followed by y: sums both counters after cancelling min(x.b, y.a) from each.
+{
+    uint m = min(x.b, y.a); // amount of x.b that is cancelled by y.a
+    Bic _56 = { (x.a + y.a) - m, (x.b + y.b) - m };
+    return _56;
+}
+
+void store_bic(uint ix, Bic bic) // Writes bic as two consecutive words at index ix of the region whose offset sits at byte 52 of _64.
+{
+    uint base = (_64.Load(52) >> uint(2)) + (2u * ix); // offset >> 2 gives a word index; two words per Bic
+    _80.Store(base * 4 + 8, bic.a); // word index back to bytes; the +8 appears to skip an 8-byte header in _80
+    _80.Store((base + 1u) * 4 + 8, bic.b);
+}
+
+float4 load_path_bbox(uint path_ix) // Reads the box of path path_ix from the region at byte 40 of _64 and converts it to floats.
+{
+    uint base = (_64.Load(40) >> uint(2)) + (6u * path_ix); // six words per path record; only the first four are read here
+    float bbox_l = float(_80.Load(base * 4 + 8)) - 32768.0f; // values are stored as unsigned integers biased by 32768
+    float bbox_t = float(_80.Load((base + 1u) * 4 + 8)) - 32768.0f;
+    float bbox_r = float(_80.Load((base + 2u) * 4 + 8)) - 32768.0f;
+    float bbox_b = float(_80.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+void store_clip_el(uint ix, ClipEl el) // Writes el as five words at index ix of the region whose offset sits at byte 56 of _64.
+{
+    uint base = (_64.Load(56) >> uint(2)) + (5u * ix); // five words per record
+    _80.Store(base * 4 + 8, el.parent_ix);
+    _80.Store((base + 1u) * 4 + 8, asuint(el.bbox.x)); // box components are stored as raw float bits
+    _80.Store((base + 2u) * 4 + 8, asuint(el.bbox.y));
+    _80.Store((base + 3u) * 4 + 8, asuint(el.bbox.z));
+    _80.Store((base + 4u) * 4 + 8, asuint(el.bbox.w));
+}
+
+void comp_main() // Kernel body: reduces the workgroup's 256 inputs to one Bic, then writes a ClipEl for each push left unmatched within the workgroup.
+{
+    uint th = gl_LocalInvocationID.x;
+    uint inp = _80.Load(((_64.Load(48) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 8); // input word for this invocation; no range check against n_clip here
+    bool is_push = int(inp) >= 0; // non-negative words are treated as pushes carrying a path index
+    Bic _207 = { 1u - uint(is_push), uint(is_push) };
+    Bic bic = _207;
+    sh_bic[gl_LocalInvocationID.x] = bic;
+    for (uint i = 0u; i < 8u; i++) // 8 doubling steps; afterwards sh_bic[th] combines entries th..255 in order
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if ((th + (1u << i)) < 256u)
+        {
+            Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)];
+            Bic param = bic;
+            Bic param_1 = other;
+            bic = bic_combine(param, param_1);
+        }
+        GroupMemoryBarrierWithGroupSync(); // all reads of the previous values finish before they are overwritten
+        sh_bic[th] = bic;
+    }
+    if (th == 0u) // thread 0 holds the combination of all 256 entries
+    {
+        uint param_2 = gl_WorkGroupID.x;
+        Bic param_3 = bic;
+        store_bic(param_2, param_3); // one Bic per workgroup, indexed by workgroup id
+    }
+    GroupMemoryBarrierWithGroupSync();
+    uint size = sh_bic[0].b; // b of the whole-workgroup combination
+    bic = _267;
+    if ((th + 1u) < 256u)
+    {
+        bic = sh_bic[th + 1u]; // combination of everything after this thread; stays zero for thread 255
+    }
+    bool _283;
+    if (is_push) // expanded form of (is_push && bic.a == 0u)
+    {
+        _283 = bic.a == 0u;
+    }
+    else
+    {
+        _283 = is_push;
+    }
+    if (_283) // a push for which the later entries have a == 0
+    {
+        uint local_ix = (size - bic.b) - 1u; // slot counted from the start: size minus the b of the later entries, minus one
+        sh_parent[local_ix] = th;
+        sh_path_ix[local_ix] = inp;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    float4 bbox; // assigned only when th < size, which is also the only case in which it is read
+    if (th < size)
+    {
+        uint path_ix = sh_path_ix[th];
+        uint param_4 = path_ix;
+        bbox = load_path_bbox(param_4);
+    }
+    if (th < size)
+    {
+        uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); // local thread index converted to a global input index
+        ClipEl _331 = { parent_ix, bbox };
+        ClipEl el = _331;
+        uint param_5 = gl_GlobalInvocationID.x;
+        ClipEl param_6 = el;
+        store_clip_el(param_5, param_6); // record index is the global invocation id
+    }
+}
+
+[numthreads(256, 1, 1)] // 256 threads per group, matching gl_WorkGroupSize
+void main(SPIRV_Cross_Input stage_input) // Entry point: copies the system-value inputs into the file-scope statics, then runs comp_main.
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
--- a/piet-gpu/shader/gen/clip_reduce.msl
+++ b/piet-gpu/shader/gen/clip_reduce.msl
@ -0,0 +1,173 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct Bic // Counter pair folded by bic_combine(); main0 seeds each input as Bic{ 1 - is_push, is_push }.
+{
+    uint a; // seeded as 1 for a non-push input, 0 for a push
+    uint b; // seeded as 1 for a push input, 0 otherwise
+};
+
+struct ClipEl // Five-word record written by store_clip_el().
+{
+    uint parent_ix; // first word of the record
+    float4 bbox; // next four words, stored as raw float bits in x, y, z, w order
+};
+
+struct Alloc // Single-offset handle; every user shifts it right by 2 before indexing Memory::memory, so it looks like a byte offset.
+{
+    uint offset;
+};
+
+struct Config // Configuration read through ConfigBuf; only the fields annotated below are used in this file.
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc; // read by load_path_bbox
+    Alloc drawmonoid_alloc;
+    Alloc clip_alloc; // read by main0
+    Alloc clip_bic_alloc; // written through by store_bic
+    Alloc clip_stack_alloc; // written through by store_clip_el
+    Alloc clip_bbox_alloc;
+    uint n_trans;
+    uint n_path;
+    uint n_clip;
+    uint trans_offset;
+    uint linewidth_offset;
+    uint pathtag_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf // Buffer wrapper around Config; bound at [[buffer(1)]] as a const device reference in main0.
+{
+    Config conf;
+};
+
+struct Memory // Read-write buffer bound at [[buffer(0)]] in main0: two header words followed by the word array.
+{
+    uint mem_offset; // not accessed in this file
+    uint mem_error; // not accessed in this file
+    uint memory[1]; // indexed well past 1 by the helpers, so effectively an unsized array over the rest of the buffer
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); // 256 threads per threadgroup; the 256-entry threadgroup arrays in main0 rely on this
+
+static inline __attribute__((always_inline)) // bic_combine: combines x followed by y, cancelling min(x.b, y.a) from both summed counters.
+Bic bic_combine(thread const Bic& x, thread const Bic& y)
+{
+    uint m = min(x.b, y.a); // amount of x.b that is cancelled by y.a
+    return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
+}
+
+static inline __attribute__((always_inline)) // store_bic: writes bic as two consecutive words at index ix of clip_bic_alloc.
+void store_bic(thread const uint& ix, thread const Bic& bic, const device ConfigBuf& v_64, device Memory& v_80)
+{
+    uint base = (v_64.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); // offset >> 2 gives a word index; two words per Bic
+    v_80.memory[base] = bic.a;
+    v_80.memory[base + 1u] = bic.b;
+}
+
+static inline __attribute__((always_inline)) // load_path_bbox: reads the box of path path_ix from bbox_alloc and converts it to floats.
+float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_64, device Memory& v_80)
+{
+    uint base = (v_64.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix); // six words per path record; only the first four are read here
+    float bbox_l = float(v_80.memory[base]) - 32768.0; // values are stored as unsigned integers biased by 32768
+    float bbox_t = float(v_80.memory[base + 1u]) - 32768.0;
+    float bbox_r = float(v_80.memory[base + 2u]) - 32768.0;
+    float bbox_b = float(v_80.memory[base + 3u]) - 32768.0;
+    float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
+    return bbox;
+}
+
+static inline __attribute__((always_inline)) // store_clip_el: writes el as five words at index ix of clip_stack_alloc.
+void store_clip_el(thread const uint& ix, thread const ClipEl& el, const device ConfigBuf& v_64, device Memory& v_80)
+{
+    uint base = (v_64.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); // five words per record
+    v_80.memory[base] = el.parent_ix;
+    v_80.memory[base + 1u] = as_type<uint>(el.bbox.x); // box components are stored as raw float bits
+    v_80.memory[base + 2u] = as_type<uint>(el.bbox.y);
+    v_80.memory[base + 3u] = as_type<uint>(el.bbox.z);
+    v_80.memory[base + 4u] = as_type<uint>(el.bbox.w);
+}
+
+kernel void main0(device Memory& v_80 [[buffer(0)]], const device ConfigBuf& v_64 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) // Kernel: reduces the threadgroup's 256 inputs to one Bic, then writes a ClipEl for each push left unmatched within the threadgroup.
+{
+    threadgroup Bic sh_bic[256]; // per-thread Bics, turned into a suffix combination below
+    threadgroup uint sh_parent[256]; // local thread index of each surviving push
+    threadgroup uint sh_path_ix[256]; // input word of each surviving push
+    threadgroup float4 sh_bbox[256]; // declared but never referenced in this kernel
+    uint th = gl_LocalInvocationID.x;
+    uint inp = v_80.memory[(v_64.conf.clip_alloc.offset >> uint(2)) + gl_GlobalInvocationID.x]; // input word for this invocation; no range check against n_clip here
+    bool is_push = int(inp) >= 0; // non-negative words are treated as pushes carrying a path index
+    Bic bic = Bic{ 1u - uint(is_push), uint(is_push) };
+    sh_bic[gl_LocalInvocationID.x] = bic;
+    for (uint i = 0u; i < 8u; i++) // 8 doubling steps; afterwards sh_bic[th] combines entries th..255 in order
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if ((th + (1u << i)) < 256u)
+        {
+            Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)];
+            Bic param = bic;
+            Bic param_1 = other;
+            bic = bic_combine(param, param_1);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup); // all reads of the previous values finish before they are overwritten
+        sh_bic[th] = bic;
+    }
+    if (th == 0u) // thread 0 holds the combination of all 256 entries
+    {
+        uint param_2 = gl_WorkGroupID.x;
+        Bic param_3 = bic;
+        store_bic(param_2, param_3, v_64, v_80); // one Bic per threadgroup, indexed by threadgroup id
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    uint size = sh_bic[0].b; // b of the whole-threadgroup combination
+    bic = Bic{ 0u, 0u };
+    if ((th + 1u) < 256u)
+    {
+        bic = sh_bic[th + 1u]; // combination of everything after this thread; stays zero for thread 255
+    }
+    bool _283;
+    if (is_push) // expanded form of (is_push && bic.a == 0u)
+    {
+        _283 = bic.a == 0u;
+    }
+    else
+    {
+        _283 = is_push;
+    }
+    if (_283) // a push for which the later entries have a == 0
+    {
+        uint local_ix = (size - bic.b) - 1u; // slot counted from the start: size minus the b of the later entries, minus one
+        sh_parent[local_ix] = th;
+        sh_path_ix[local_ix] = inp;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    float4 bbox; // assigned only when th < size, which is also the only case in which it is read
+    if (th < size)
+    {
+        uint path_ix = sh_path_ix[th];
+        uint param_4 = path_ix;
+        bbox = load_path_bbox(param_4, v_64, v_80);
+    }
+    if (th < size)
+    {
+        uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); // local thread index converted to a global input index
+        ClipEl el = ClipEl{ parent_ix, bbox };
+        uint param_5 = gl_GlobalInvocationID.x;
+        ClipEl param_6 = el;
+        store_clip_el(param_5, param_6, v_64, v_80); // record index is the global invocation id
+    }
+}
+
--- a/piet-gpu/shader/gen/clip_reduce.spv
+++ b/piet-gpu/shader/gen/clip_reduce.spv
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@ -49,17 +49,6 @@ struct AnnoLinGradient
    float line_c;
 };

-struct AnnoBeginClipRef
-{
-    uint offset;
-};
-
-struct AnnoBeginClip
-{
-    float4 bbox;
-    float linewidth;
-};
-
 struct AnnotatedRef
 {
    uint offset;
@ -193,8 +182,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -203,8 +197,8 @@ struct Config

 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

-RWByteAddressBuffer _296 : register(u0, space0);
-ByteAddressBuffer _1249 : register(t1, space0);
+RWByteAddressBuffer _283 : register(u0, space0);
+ByteAddressBuffer _1169 : register(t1, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -227,8 +221,8 @@ groupshared uint sh_tile_count[256];

 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _373 = { a.offset + offset };
-    return _373;
+    Alloc _360 = { a.offset + offset };
+    return _360;
 }

 bool touch_mem(Alloc alloc, uint offset)
@ -244,7 +238,7 @@ uint read_mem(Alloc alloc, uint offset)
    {
        return 0u;
    }
-    uint v = _296.Load(offset * 4 + 8);
+    uint v = _283.Load(offset * 4 + 8);
    return v;
 }

@ -257,8 +251,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok)

 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _754 = { ref.offset + (index * 4u) };
-    return _754;
+    BinInstanceRef _674 = { ref.offset + (index * 4u) };
+    return _674;
 }

 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@ -277,8 +271,8 @@ AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _706 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _706;
+    AnnotatedTag _636 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _636;
 }

 Path Path_read(Alloc a, PathRef ref)
@ -295,8 +289,8 @@ Path Path_read(Alloc a, PathRef ref)
    uint raw2 = read_mem(param_4, param_5);
    Path s;
    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _814 = { raw2 };
-    s.tiles = _814;
+    TileRef _734 = { raw2 };
+    s.tiles = _734;
    return s;
 }

@ -306,11 +300,11 @@ void write_tile_alloc(uint el_ix, Alloc a)

 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _1135;
-    _296.GetDimensions(_1135);
-    _1135 = (_1135 - 8) / 4;
+    uint _1055;
+    _283.GetDimensions(_1055);
+    _1055 = (_1055 - 8) / 4;
    uint param = 0u;
-    uint param_1 = uint(int(_1135) * 4);
+    uint param_1 = uint(int(_1055) * 4);
    bool param_2 = mem_ok;
    return new_alloc(param, param_1, param_2);
 }
@ -324,9 +318,9 @@ Tile Tile_read(Alloc a, TileRef ref)
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
    uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _839 = { raw0 };
+    TileSegRef _759 = { raw0 };
    Tile s;
-    s.tile = _839;
+    s.tile = _759;
    s.backdrop = int(raw1);
    return s;
 }
@ -361,30 +355,30 @@ AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref)

 AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref)
 {
-    AnnoColorRef _712 = { ref.offset + 4u };
+    AnnoColorRef _642 = { ref.offset + 4u };
    Alloc param = a;
-    AnnoColorRef param_1 = _712;
+    AnnoColorRef param_1 = _642;
    return AnnoColor_read(param, param_1);
 }

 MallocResult malloc(uint size)
 {
-    uint _302;
-    _296.InterlockedAdd(0, size, _302);
-    uint offset = _302;
-    uint _309;
-    _296.GetDimensions(_309);
-    _309 = (_309 - 8) / 4;
+    uint _289;
+    _283.InterlockedAdd(0, size, _289);
+    uint offset = _289;
+    uint _296;
+    _283.GetDimensions(_296);
+    _296 = (_296 - 8) / 4;
    MallocResult r;
-    r.failed = (offset + size) > uint(int(_309) * 4);
+    r.failed = (offset + size) > uint(int(_296) * 4);
    uint param = offset;
    uint param_1 = size;
    bool param_2 = !r.failed;
    r.alloc = new_alloc(param, param_1, param_2);
    if (r.failed)
    {
-        uint _331;
-        _296.InterlockedMax(4, 1u, _331);
+        uint _318;
+        _283.InterlockedMax(4, 1u, _318);
        return r;
    }
    return r;
@ -398,7 +392,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
    {
        return;
    }
-    _296.Store(offset * 4 + 8, val);
+    _283.Store(offset * 4 + 8, val);
 }

 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@ -416,9 +410,9 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 10u;
    write_mem(param, param_1, param_2);
-    CmdJumpRef _1128 = { ref.offset + 4u };
+    CmdJumpRef _1048 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdJumpRef param_4 = _1128;
+    CmdJumpRef param_4 = _1048;
    CmdJump param_5 = s;
    CmdJump_write(param_3, param_4, param_5);
 }
@ -430,21 +424,21 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
        return true;
    }
    uint param = 1024u;
-    MallocResult _1156 = malloc(param);
-    MallocResult new_cmd = _1156;
+    MallocResult _1076 = malloc(param);
+    MallocResult new_cmd = _1076;
    if (new_cmd.failed)
    {
        return false;
    }
-    CmdJump _1166 = { new_cmd.alloc.offset };
-    CmdJump jump = _1166;
+    CmdJump _1086 = { new_cmd.alloc.offset };
+    CmdJump jump = _1086;
    Alloc param_1 = cmd_alloc;
    CmdRef param_2 = cmd_ref;
    CmdJump param_3 = jump;
    Cmd_Jump_write(param_1, param_2, param_3);
    cmd_alloc = new_cmd.alloc;
-    CmdRef _1178 = { cmd_alloc.offset };
-    cmd_ref = _1178;
+    CmdRef _1098 = { cmd_alloc.offset };
+    cmd_ref = _1098;
    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
    return true;
 }
@ -473,9 +467,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 1u;
    write_mem(param, param_1, param_2);
-    CmdFillRef _1012 = { ref.offset + 4u };
+    CmdFillRef _932 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdFillRef param_4 = _1012;
+    CmdFillRef param_4 = _932;
    CmdFill param_5 = s;
    CmdFill_write(param_3, param_4, param_5);
 }
@ -507,9 +501,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 2u;
    write_mem(param, param_1, param_2);
-    CmdStrokeRef _1030 = { ref.offset + 4u };
+    CmdStrokeRef _950 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdStrokeRef param_4 = _1030;
+    CmdStrokeRef param_4 = _950;
    CmdStroke param_5 = s;
    CmdStroke_write(param_3, param_4, param_5);
 }
@ -521,8 +515,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float
    {
        if (tile.tile.offset != 0u)
        {
-            CmdFill _1202 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _1202;
+            CmdFill _1122 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _1122;
            Alloc param_1 = alloc;
            CmdRef param_2 = cmd_ref;
            CmdFill param_3 = cmd_fill;
@ -539,8 +533,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float
    }
    else
    {
-        CmdStroke _1232 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _1232;
+        CmdStroke _1152 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _1152;
        Alloc param_6 = alloc;
        CmdRef param_7 = cmd_ref;
        CmdStroke param_8 = cmd_stroke;
@ -564,9 +558,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 5u;
    write_mem(param, param_1, param_2);
-    CmdColorRef _1056 = { ref.offset + 4u };
+    CmdColorRef _976 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdColorRef param_4 = _1056;
+    CmdColorRef param_4 = _976;
    CmdColor param_5 = s;
    CmdColor_write(param_3, param_4, param_5);
 }
@ -613,9 +607,9 @@ AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref)

 AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref)
 {
-    AnnoLinGradientRef _722 = { ref.offset + 4u };
+    AnnoLinGradientRef _652 = { ref.offset + 4u };
    Alloc param = a;
-    AnnoLinGradientRef param_1 = _722;
+    AnnoLinGradientRef param_1 = _652;
    return AnnoLinGradient_read(param, param_1);
 }

@ -646,9 +640,9 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 6u;
    write_mem(param, param_1, param_2);
-    CmdLinGradRef _1074 = { ref.offset + 4u };
+    CmdLinGradRef _994 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdLinGradRef param_4 = _1074;
+    CmdLinGradRef param_4 = _994;
    CmdLinGrad param_5 = s;
    CmdLinGrad_write(param_3, param_4, param_5);
 }
@ -687,9 +681,9 @@ AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref)

 AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref)
 {
-    AnnoImageRef _732 = { ref.offset + 4u };
+    AnnoImageRef _662 = { ref.offset + 4u };
    Alloc param = a;
-    AnnoImageRef param_1 = _732;
+    AnnoImageRef param_1 = _662;
    return AnnoImage_read(param, param_1);
 }

@ -712,45 +706,13 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 7u;
    write_mem(param, param_1, param_2);
-    CmdImageRef _1092 = { ref.offset + 4u };
+    CmdImageRef _1012 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdImageRef param_4 = _1092;
+    CmdImageRef param_4 = _1012;
    CmdImage param_5 = s;
    CmdImage_write(param_3, param_4, param_5);
 }

-AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1);
-    Alloc param_2 = a;
-    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3);
-    Alloc param_4 = a;
-    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5);
-    Alloc param_6 = a;
-    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7);
-    Alloc param_8 = a;
-    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9);
-    AnnoBeginClip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    s.linewidth = asfloat(raw4);
-    return s;
-}
-
-AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref)
-{
-    AnnoBeginClipRef _742 = { ref.offset + 4u };
-    Alloc param = a;
-    AnnoBeginClipRef param_1 = _742;
-    return AnnoBeginClip_read(param, param_1);
-}
-
 void Cmd_BeginClip_write(Alloc a, CmdRef ref)
 {
    Alloc param = a;
@ -777,44 +739,43 @@ void Cmd_End_write(Alloc a, CmdRef ref)

 void comp_main()
 {
-    uint width_in_bins = ((_1249.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_1169.Load(8) + 16u) - 1u) / 16u;
    uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
    uint partition_ix = 0u;
-    uint n_partitions = ((_1249.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1169.Load(0) + 256u) - 1u) / 256u;
    uint th_ix = gl_LocalInvocationID.x;
    uint bin_tile_x = 16u * gl_WorkGroupID.x;
    uint bin_tile_y = 16u * gl_WorkGroupID.y;
    uint tile_x = gl_LocalInvocationID.x % 16u;
    uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _1314;
-    _1314.offset = _1249.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1169.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _1234;
+    _1234.offset = _1169.Load(24);
    Alloc param;
-    param.offset = _1314.offset;
+    param.offset = _1234.offset;
    uint param_1 = this_tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1323 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1323;
+    CmdRef _1243 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1243;
    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
    uint clip_depth = 0u;
    uint clip_zero_depth = 0u;
-    uint clip_one_mask = 0u;
    uint rd_ix = 0u;
    uint wr_ix = 0u;
    uint part_start_ix = 0u;
    uint ready_ix = 0u;
-    bool mem_ok = _296.Load(4) == 0u;
+    bool mem_ok = _283.Load(4) == 0u;
    Alloc param_3;
    Alloc param_5;
-    uint _1529;
+    uint _1448;
    uint element_ix;
    AnnotatedRef ref;
    Alloc param_14;
    Alloc param_16;
    uint tile_count;
    Alloc param_23;
-    uint _1841;
+    uint _1770;
    Alloc param_29;
    Tile tile_1;
    AnnoColor fill;
@ -822,41 +783,40 @@ void comp_main()
    Alloc param_52;
    CmdLinGrad cmd_lin;
    Alloc param_69;
-    Alloc param_86;
    while (true)
    {
        for (uint i = 0u; i < 8u; i++)
        {
            sh_bitmaps[i][th_ix] = 0u;
        }
-        bool _1581;
+        bool _1500;
        for (;;)
        {
            if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
            {
                part_start_ix = ready_ix;
                uint count = 0u;
-                bool _1379 = th_ix < 256u;
-                bool _1387;
-                if (_1379)
+                bool _1298 = th_ix < 256u;
+                bool _1306;
+                if (_1298)
                {
-                    _1387 = (partition_ix + th_ix) < n_partitions;
+                    _1306 = (partition_ix + th_ix) < n_partitions;
                }
                else
                {
-                    _1387 = _1379;
+                    _1306 = _1298;
                }
-                if (_1387)
+                if (_1306)
                {
-                    uint in_ix = (_1249.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1404;
-                    _1404.offset = _1249.Load(20);
-                    param_3.offset = _1404.offset;
+                    uint in_ix = (_1169.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1323;
+                    _1323.offset = _1169.Load(20);
+                    param_3.offset = _1323.offset;
                    uint param_4 = in_ix;
                    count = read_mem(param_3, param_4);
-                    Alloc _1415;
-                    _1415.offset = _1249.Load(20);
-                    param_5.offset = _1415.offset;
+                    Alloc _1334;
+                    _1334.offset = _1169.Load(20);
+                    param_5.offset = _1334.offset;
                    uint param_6 = in_ix + 1u;
                    uint offset = read_mem(param_5, param_6);
                    uint param_7 = offset;
@ -902,16 +862,16 @@ void comp_main()
                }
                if (part_ix > 0u)
                {
-                    _1529 = sh_part_count[part_ix - 1u];
+                    _1448 = sh_part_count[part_ix - 1u];
                }
                else
                {
-                    _1529 = part_start_ix;
+                    _1448 = part_start_ix;
                }
-                ix -= _1529;
+                ix -= _1448;
                Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1548 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1548;
+                BinInstanceRef _1467 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1467;
                BinInstanceRef param_10 = inst_ref;
                uint param_11 = ix;
                Alloc param_12 = bin_alloc;
@ -921,16 +881,16 @@ void comp_main()
            }
            GroupMemoryBarrierWithGroupSync();
            wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1571 = (wr_ix - rd_ix) < 256u;
-            if (_1571)
+            bool _1490 = (wr_ix - rd_ix) < 256u;
+            if (_1490)
            {
-                _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1500 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
            }
            else
            {
-                _1581 = _1571;
+                _1500 = _1490;
            }
-            if (_1581)
+            if (_1500)
            {
                continue;
            }
@ -943,11 +903,11 @@ void comp_main()
        if ((th_ix + rd_ix) < wr_ix)
        {
            element_ix = sh_elements[th_ix];
-            AnnotatedRef _1602 = { _1249.Load(32) + (element_ix * 40u) };
-            ref = _1602;
-            Alloc _1605;
-            _1605.offset = _1249.Load(32);
-            param_14.offset = _1605.offset;
+            AnnotatedRef _1521 = { _1169.Load(32) + (element_ix * 40u) };
+            ref = _1521;
+            Alloc _1524;
+            _1524.offset = _1169.Load(32);
+            param_14.offset = _1524.offset;
            AnnotatedRef param_15 = ref;
            tag = Annotated_tag(param_14, param_15).tag;
        }
@ -959,12 +919,13 @@ void comp_main()
            case 4u:
            case 5u:
            {
-                uint path_ix = element_ix;
-                PathRef _1624 = { _1249.Load(16) + (path_ix * 12u) };
-                Alloc _1627;
-                _1627.offset = _1249.Load(16);
-                param_16.offset = _1627.offset;
-                PathRef param_17 = _1624;
+                uint drawmonoid_base = (_1169.Load(44) >> uint(2)) + (2u * element_ix);
+                uint path_ix = _283.Load(drawmonoid_base * 4 + 8);
+                PathRef _1553 = { _1169.Load(16) + (path_ix * 12u) };
+                Alloc _1556;
+                _1556.offset = _1169.Load(16);
+                param_16.offset = _1556.offset;
+                PathRef param_17 = _1553;
                Path path = Path_read(param_16, param_17);
                uint stride = path.bbox.z - path.bbox.x;
                sh_tile_stride[th_ix] = stride;
@ -1019,59 +980,53 @@ void comp_main()
                    el_ix = probe_1;
                }
            }
-            AnnotatedRef _1826 = { _1249.Load(32) + (sh_elements[el_ix] * 40u) };
-            AnnotatedRef ref_1 = _1826;
-            Alloc _1830;
-            _1830.offset = _1249.Load(32);
-            param_23.offset = _1830.offset;
+            AnnotatedRef _1755 = { _1169.Load(32) + (sh_elements[el_ix] * 40u) };
+            AnnotatedRef ref_1 = _1755;
+            Alloc _1759;
+            _1759.offset = _1169.Load(32);
+            param_23.offset = _1759.offset;
            AnnotatedRef param_24 = ref_1;
            uint tag_1 = Annotated_tag(param_23, param_24).tag;
            if (el_ix > 0u)
            {
-                _1841 = sh_tile_count[el_ix - 1u];
+                _1770 = sh_tile_count[el_ix - 1u];
            }
            else
            {
-                _1841 = 0u;
+                _1770 = 0u;
            }
-            uint seq_ix = ix_1 - _1841;
+            uint seq_ix = ix_1 - _1770;
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + (seq_ix % width);
            uint y = sh_tile_y0[el_ix] + (seq_ix / width);
            bool include_tile = false;
-            if ((tag_1 == 4u) || (tag_1 == 5u))
+            if (mem_ok)
            {
-                include_tile = true;
-            }
-            else
-            {
-                if (mem_ok)
+                uint param_25 = el_ix;
+                bool param_26 = mem_ok;
+                TileRef _1822 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                Alloc param_27 = read_tile_alloc(param_25, param_26);
+                TileRef param_28 = _1822;
+                Tile tile = Tile_read(param_27, param_28);
+                bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
+                bool _1834 = tile.tile.offset != 0u;
+                bool _1843;
+                if (!_1834)
                {
-                    uint param_25 = el_ix;
-                    bool param_26 = mem_ok;
-                    TileRef _1901 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                    Alloc param_27 = read_tile_alloc(param_25, param_26);
-                    TileRef param_28 = _1901;
-                    Tile tile = Tile_read(param_27, param_28);
-                    bool _1907 = tile.tile.offset != 0u;
-                    bool _1914;
-                    if (!_1907)
-                    {
-                        _1914 = tile.backdrop != 0;
-                    }
-                    else
-                    {
-                        _1914 = _1907;
-                    }
-                    include_tile = _1914;
+                    _1843 = (tile.backdrop == 0) == is_clip;
                }
+                else
+                {
+                    _1843 = _1834;
+                }
+                include_tile = _1843;
            }
            if (include_tile)
            {
                uint el_slice = el_ix / 32u;
                uint el_mask = 1u << (el_ix & 31u);
-                uint _1934;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1934);
+                uint _1863;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1863);
            }
        }
        GroupMemoryBarrierWithGroupSync();
@ -1095,11 +1050,11 @@ void comp_main()
            uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
            uint element_ix_1 = sh_elements[element_ref_ix];
            bitmap &= (bitmap - 1u);
-            AnnotatedRef _1988 = { _1249.Load(32) + (element_ix_1 * 40u) };
-            ref = _1988;
-            Alloc _1993;
-            _1993.offset = _1249.Load(32);
-            param_29.offset = _1993.offset;
+            AnnotatedRef _1917 = { _1169.Load(32) + (element_ix_1 * 40u) };
+            ref = _1917;
+            Alloc _1922;
+            _1922.offset = _1169.Load(32);
+            param_29.offset = _1922.offset;
            AnnotatedRef param_30 = ref;
            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30);
            if (clip_zero_depth == 0u)
@ -1110,23 +1065,23 @@ void comp_main()
                    {
                        uint param_31 = element_ref_ix;
                        bool param_32 = mem_ok;
-                        TileRef _2029 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _1958 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                        Alloc param_33 = read_tile_alloc(param_31, param_32);
-                        TileRef param_34 = _2029;
+                        TileRef param_34 = _1958;
                        tile_1 = Tile_read(param_33, param_34);
-                        Alloc _2036;
-                        _2036.offset = _1249.Load(32);
-                        param_35.offset = _2036.offset;
+                        Alloc _1965;
+                        _1965.offset = _1169.Load(32);
+                        param_35.offset = _1965.offset;
                        AnnotatedRef param_36 = ref;
                        fill = Annotated_Color_read(param_35, param_36);
                        Alloc param_37 = cmd_alloc;
                        CmdRef param_38 = cmd_ref;
                        uint param_39 = cmd_limit;
-                        bool _2048 = alloc_cmd(param_37, param_38, param_39);
+                        bool _1977 = alloc_cmd(param_37, param_38, param_39);
                        cmd_alloc = param_37;
                        cmd_ref = param_38;
                        cmd_limit = param_39;
-                        if (!_2048)
+                        if (!_1977)
                        {
                            break;
                        }
@ -1137,10 +1092,10 @@ void comp_main()
                        float param_44 = fill.linewidth;
                        write_fill(param_40, param_41, param_42, param_43, param_44);
                        cmd_ref = param_41;
-                        CmdColor _2072 = { fill.rgba_color };
+                        CmdColor _2001 = { fill.rgba_color };
                        Alloc param_45 = cmd_alloc;
                        CmdRef param_46 = cmd_ref;
-                        CmdColor param_47 = _2072;
+                        CmdColor param_47 = _2001;
                        Cmd_Color_write(param_45, param_46, param_47);
                        cmd_ref.offset += 8u;
                        break;
@ -1149,23 +1104,23 @@ void comp_main()
                    {
                        uint param_48 = element_ref_ix;
                        bool param_49 = mem_ok;
-                        TileRef _2101 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2030 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                        Alloc param_50 = read_tile_alloc(param_48, param_49);
-                        TileRef param_51 = _2101;
+                        TileRef param_51 = _2030;
                        tile_1 = Tile_read(param_50, param_51);
-                        Alloc _2108;
-                        _2108.offset = _1249.Load(32);
-                        param_52.offset = _2108.offset;
+                        Alloc _2037;
+                        _2037.offset = _1169.Load(32);
+                        param_52.offset = _2037.offset;
                        AnnotatedRef param_53 = ref;
                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53);
                        Alloc param_54 = cmd_alloc;
                        CmdRef param_55 = cmd_ref;
                        uint param_56 = cmd_limit;
-                        bool _2120 = alloc_cmd(param_54, param_55, param_56);
+                        bool _2049 = alloc_cmd(param_54, param_55, param_56);
                        cmd_alloc = param_54;
                        cmd_ref = param_55;
                        cmd_limit = param_56;
-                        if (!_2120)
+                        if (!_2049)
                        {
                            break;
                        }
@ -1191,23 +1146,23 @@ void comp_main()
                    {
                        uint param_65 = element_ref_ix;
                        bool param_66 = mem_ok;
-                        TileRef _2185 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2114 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                        Alloc param_67 = read_tile_alloc(param_65, param_66);
-                        TileRef param_68 = _2185;
+                        TileRef param_68 = _2114;
                        tile_1 = Tile_read(param_67, param_68);
-                        Alloc _2192;
-                        _2192.offset = _1249.Load(32);
-                        param_69.offset = _2192.offset;
+                        Alloc _2121;
+                        _2121.offset = _1169.Load(32);
+                        param_69.offset = _2121.offset;
                        AnnotatedRef param_70 = ref;
                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70);
                        Alloc param_71 = cmd_alloc;
                        CmdRef param_72 = cmd_ref;
                        uint param_73 = cmd_limit;
-                        bool _2204 = alloc_cmd(param_71, param_72, param_73);
+                        bool _2133 = alloc_cmd(param_71, param_72, param_73);
                        cmd_alloc = param_71;
                        cmd_ref = param_72;
                        cmd_limit = param_73;
-                        if (!_2204)
+                        if (!_2133)
                        {
                            break;
                        }
@ -1218,10 +1173,10 @@ void comp_main()
                        float param_78 = fill_img.linewidth;
                        write_fill(param_74, param_75, param_76, param_77, param_78);
                        cmd_ref = param_75;
-                        CmdImage _2230 = { fill_img.index, fill_img.offset };
+                        CmdImage _2159 = { fill_img.index, fill_img.offset };
                        Alloc param_79 = cmd_alloc;
                        CmdRef param_80 = cmd_ref;
-                        CmdImage param_81 = _2230;
+                        CmdImage param_81 = _2159;
                        Cmd_Image_write(param_79, param_80, param_81);
                        cmd_ref.offset += 12u;
                        break;
@ -1230,103 +1185,76 @@ void comp_main()
                    {
                        uint param_82 = element_ref_ix;
                        bool param_83 = mem_ok;
-                        TileRef _2259 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2188 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                        Alloc param_84 = read_tile_alloc(param_82, param_83);
-                        TileRef param_85 = _2259;
+                        TileRef param_85 = _2188;
                        tile_1 = Tile_read(param_84, param_85);
-                        bool _2265 = tile_1.tile.offset == 0u;
-                        bool _2271;
-                        if (_2265)
+                        bool _2194 = tile_1.tile.offset == 0u;
+                        bool _2200;
+                        if (_2194)
                        {
-                            _2271 = tile_1.backdrop == 0;
+                            _2200 = tile_1.backdrop == 0;
                        }
                        else
                        {
-                            _2271 = _2265;
+                            _2200 = _2194;
                        }
-                        if (_2271)
+                        if (_2200)
                        {
                            clip_zero_depth = clip_depth + 1u;
                        }
                        else
                        {
-                            if ((tile_1.tile.offset == 0u) && (clip_depth < 32u))
+                            Alloc param_86 = cmd_alloc;
+                            CmdRef param_87 = cmd_ref;
+                            uint param_88 = cmd_limit;
+                            bool _2212 = alloc_cmd(param_86, param_87, param_88);
+                            cmd_alloc = param_86;
+                            cmd_ref = param_87;
+                            cmd_limit = param_88;
+                            if (!_2212)
                            {
-                                clip_one_mask |= (1u << clip_depth);
-                            }
-                            else
-                            {
-                                Alloc _2293;
-                                _2293.offset = _1249.Load(32);
-                                param_86.offset = _2293.offset;
-                                AnnotatedRef param_87 = ref;
-                                AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87);
-                                Alloc param_88 = cmd_alloc;
-                                CmdRef param_89 = cmd_ref;
-                                uint param_90 = cmd_limit;
-                                bool _2305 = alloc_cmd(param_88, param_89, param_90);
-                                cmd_alloc = param_88;
-                                cmd_ref = param_89;
-                                cmd_limit = param_90;
-                                if (!_2305)
-                                {
-                                    break;
-                                }
-                                Alloc param_91 = cmd_alloc;
-                                CmdRef param_92 = cmd_ref;
-                                uint param_93 = tag_2.flags;
-                                Tile param_94 = tile_1;
-                                float param_95 = begin_clip.linewidth;
-                                write_fill(param_91, param_92, param_93, param_94, param_95);
-                                cmd_ref = param_92;
-                                Alloc param_96 = cmd_alloc;
-                                CmdRef param_97 = cmd_ref;
-                                Cmd_BeginClip_write(param_96, param_97);
-                                cmd_ref.offset += 4u;
-                                if (clip_depth < 32u)
-                                {
-                                    clip_one_mask &= (~(1u << clip_depth));
-                                }
+                                break;
                            }
+                            Alloc param_89 = cmd_alloc;
+                            CmdRef param_90 = cmd_ref;
+                            Cmd_BeginClip_write(param_89, param_90);
+                            cmd_ref.offset += 4u;
                        }
                        clip_depth++;
                        break;
                    }
                    case 5u:
                    {
+                        uint param_91 = element_ref_ix;
+                        bool param_92 = mem_ok;
+                        TileRef _2249 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        Alloc param_93 = read_tile_alloc(param_91, param_92);
+                        TileRef param_94 = _2249;
+                        tile_1 = Tile_read(param_93, param_94);
                        clip_depth--;
-                        bool _2351 = clip_depth >= 32u;
-                        bool _2360;
-                        if (!_2351)
+                        Alloc param_95 = cmd_alloc;
+                        CmdRef param_96 = cmd_ref;
+                        uint param_97 = cmd_limit;
+                        bool _2261 = alloc_cmd(param_95, param_96, param_97);
+                        cmd_alloc = param_95;
+                        cmd_ref = param_96;
+                        cmd_limit = param_97;
+                        if (!_2261)
                        {
-                            _2360 = (clip_one_mask & (1u << clip_depth)) == 0u;
-                        }
-                        else
-                        {
-                            _2360 = _2351;
-                        }
-                        if (_2360)
-                        {
-                            Alloc param_98 = cmd_alloc;
-                            CmdRef param_99 = cmd_ref;
-                            uint param_100 = cmd_limit;
-                            bool _2369 = alloc_cmd(param_98, param_99, param_100);
-                            cmd_alloc = param_98;
-                            cmd_ref = param_99;
-                            cmd_limit = param_100;
-                            if (!_2369)
-                            {
-                                break;
-                            }
-                            Alloc param_101 = cmd_alloc;
-                            CmdRef param_102 = cmd_ref;
-                            Cmd_Solid_write(param_101, param_102);
-                            cmd_ref.offset += 4u;
-                            Alloc param_103 = cmd_alloc;
-                            CmdRef param_104 = cmd_ref;
-                            Cmd_EndClip_write(param_103, param_104);
-                            cmd_ref.offset += 4u;
+                            break;
                        }
+                        Alloc param_98 = cmd_alloc;
+                        CmdRef param_99 = cmd_ref;
+                        uint param_100 = 0u;
+                        Tile param_101 = tile_1;
+                        float param_102 = 0.0f;
+                        write_fill(param_98, param_99, param_100, param_101, param_102);
+                        cmd_ref = param_99;
+                        Alloc param_103 = cmd_alloc;
+                        CmdRef param_104 = cmd_ref;
+                        Cmd_EndClip_write(param_103, param_104);
+                        cmd_ref.offset += 4u;
                        break;
                    }
                }
@ -1359,17 +1287,17 @@ void comp_main()
            break;
        }
    }
-    bool _2432 = (bin_tile_x + tile_x) < _1249.Load(8);
-    bool _2441;
-    if (_2432)
+    bool _2326 = (bin_tile_x + tile_x) < _1169.Load(8);
+    bool _2335;
+    if (_2326)
    {
-        _2441 = (bin_tile_y + tile_y) < _1249.Load(12);
+        _2335 = (bin_tile_y + tile_y) < _1169.Load(12);
    }
    else
    {
-        _2441 = _2432;
+        _2335 = _2326;
    }
-    if (_2441)
+    if (_2335)
    {
        Alloc param_105 = cmd_alloc;
        CmdRef param_106 = cmd_ref;
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
--- a/piet-gpu/shader/gen/draw_leaf.dxil
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@ -41,16 +41,6 @@ struct FillImage
    int2 offset;
 };

-struct ClipRef
-{
-    uint offset;
-};
-
-struct Clip
-{
-    float4 bbox;
-};
-
 struct ElementTag
 {
    uint tag;
@ -143,8 +133,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -153,14 +148,14 @@ struct Config

 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

-static const DrawMonoid _418 = { 0u, 0u };
-static const DrawMonoid _442 = { 1u, 0u };
-static const DrawMonoid _444 = { 1u, 1u };
+static const DrawMonoid _348 = { 0u, 0u };
+static const DrawMonoid _372 = { 1u, 0u };
+static const DrawMonoid _374 = { 1u, 1u };

-RWByteAddressBuffer _201 : register(u0, space0);
-ByteAddressBuffer _225 : register(t2, space0);
-ByteAddressBuffer _1004 : register(t3, space0);
-ByteAddressBuffer _1038 : register(t1, space0);
+RWByteAddressBuffer _187 : register(u0, space0);
+ByteAddressBuffer _211 : register(t2, space0);
+ByteAddressBuffer _934 : register(t3, space0);
+ByteAddressBuffer _968 : register(t1, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -176,9 +171,9 @@ groupshared DrawMonoid sh_scratch[256];

 ElementTag Element_tag(ElementRef ref)
 {
-    uint tag_and_flags = _225.Load((ref.offset >> uint(2)) * 4 + 0);
-    ElementTag _375 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _375;
+    uint tag_and_flags = _211.Load((ref.offset >> uint(2)) * 4 + 0);
+    ElementTag _321 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _321;
 }

 DrawMonoid map_tag(uint tag_word)
@ -189,24 +184,24 @@ DrawMonoid map_tag(uint tag_word)
        case 5u:
        case 6u:
        {
-            return _442;
+            return _372;
        }
        case 9u:
        case 10u:
        {
-            return _444;
+            return _374;
        }
        default:
        {
-            return _418;
+            return _348;
        }
    }
 }

 ElementRef Element_index(ElementRef ref, uint index)
 {
-    ElementRef _214 = { ref.offset + (index * 36u) };
-    return _214;
+    ElementRef _200 = { ref.offset + (index * 36u) };
+    return _200;
 }

 DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
@ -219,13 +214,13 @@ DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)

 DrawMonoid tag_monoid_identity()
 {
-    return _418;
+    return _348;
 }

 FillColor FillColor_read(FillColorRef ref)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
+    uint raw0 = _211.Load((ix + 0u) * 4 + 0);
    FillColor s;
    s.rgba_color = raw0;
    return s;
@ -233,8 +228,8 @@ FillColor FillColor_read(FillColorRef ref)

 FillColor Element_FillColor_read(ElementRef ref)
 {
-    FillColorRef _381 = { ref.offset + 4u };
-    FillColorRef param = _381;
+    FillColorRef _327 = { ref.offset + 4u };
+    FillColorRef param = _327;
    return FillColor_read(param);
 }

@ -251,7 +246,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
    {
        return;
    }
-    _201.Store(offset * 4 + 8, val);
+    _187.Store(offset * 4 + 8, val);
 }

 void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s)
@ -289,9 +284,9 @@ void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 1u;
    write_mem(param, param_1, param_2);
-    AnnoColorRef _805 = { ref.offset + 4u };
+    AnnoColorRef _735 = { ref.offset + 4u };
    Alloc param_3 = a;
-    AnnoColorRef param_4 = _805;
+    AnnoColorRef param_4 = _735;
    AnnoColor param_5 = s;
    AnnoColor_write(param_3, param_4, param_5);
 }
@ -299,11 +294,11 @@ void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s)
 FillLinGradient FillLinGradient_read(FillLinGradientRef ref)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _225.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _225.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _225.Load((ix + 3u) * 4 + 0);
-    uint raw4 = _225.Load((ix + 4u) * 4 + 0);
+    uint raw0 = _211.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _211.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _211.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _211.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _211.Load((ix + 4u) * 4 + 0);
    FillLinGradient s;
    s.index = raw0;
    s.p0 = float2(asfloat(raw1), asfloat(raw2));
@ -313,8 +308,8 @@ FillLinGradient FillLinGradient_read(FillLinGradientRef ref)

 FillLinGradient Element_FillLinGradient_read(ElementRef ref)
 {
-    FillLinGradientRef _389 = { ref.offset + 4u };
-    FillLinGradientRef param = _389;
+    FillLinGradientRef _335 = { ref.offset + 4u };
+    FillLinGradientRef param = _335;
    return FillLinGradient_read(param);
 }

@ -365,9 +360,9 @@ void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinG
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 2u;
    write_mem(param, param_1, param_2);
-    AnnoLinGradientRef _826 = { ref.offset + 4u };
+    AnnoLinGradientRef _756 = { ref.offset + 4u };
    Alloc param_3 = a;
-    AnnoLinGradientRef param_4 = _826;
+    AnnoLinGradientRef param_4 = _756;
    AnnoLinGradient param_5 = s;
    AnnoLinGradient_write(param_3, param_4, param_5);
 }
@ -375,8 +370,8 @@ void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinG
 FillImage FillImage_read(FillImageRef ref)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _225.Load((ix + 1u) * 4 + 0);
+    uint raw0 = _211.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _211.Load((ix + 1u) * 4 + 0);
    FillImage s;
    s.index = raw0;
    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@ -385,8 +380,8 @@ FillImage FillImage_read(FillImageRef ref)

 FillImage Element_FillImage_read(ElementRef ref)
 {
-    FillImageRef _397 = { ref.offset + 4u };
-    FillImageRef param = _397;
+    FillImageRef _343 = { ref.offset + 4u };
+    FillImageRef param = _343;
    return FillImage_read(param);
 }

@ -429,32 +424,13 @@ void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 3u;
    write_mem(param, param_1, param_2);
-    AnnoImageRef _847 = { ref.offset + 4u };
+    AnnoImageRef _777 = { ref.offset + 4u };
    Alloc param_3 = a;
-    AnnoImageRef param_4 = _847;
+    AnnoImageRef param_4 = _777;
    AnnoImage param_5 = s;
    AnnoImage_write(param_3, param_4, param_5);
 }

-Clip Clip_read(ClipRef ref)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = _225.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _225.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _225.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _225.Load((ix + 3u) * 4 + 0);
-    Clip s;
-    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
-    return s;
-}
-
-Clip Element_BeginClip_read(ElementRef ref)
-{
-    ClipRef _405 = { ref.offset + 4u };
-    ClipRef param = _405;
-    return Clip_read(param);
-}
-
 void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s)
 {
    uint ix = ref.offset >> uint(2);
@ -486,20 +462,13 @@ void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginC
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 4u;
    write_mem(param, param_1, param_2);
-    AnnoBeginClipRef _868 = { ref.offset + 4u };
+    AnnoBeginClipRef _798 = { ref.offset + 4u };
    Alloc param_3 = a;
-    AnnoBeginClipRef param_4 = _868;
+    AnnoBeginClipRef param_4 = _798;
    AnnoBeginClip param_5 = s;
    AnnoBeginClip_write(param_3, param_4, param_5);
 }

-Clip Element_EndClip_read(ElementRef ref)
-{
-    ClipRef _413 = { ref.offset + 4u };
-    ClipRef param = _413;
-    return Clip_read(param);
-}
-
 void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s)
 {
    uint ix = ref.offset >> uint(2);
@ -527,9 +496,9 @@ void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 5u;
    write_mem(param, param_1, param_2);
-    AnnoEndClipRef _886 = { ref.offset + 4u };
+    AnnoEndClipRef _816 = { ref.offset + 4u };
    Alloc param_3 = a;
-    AnnoEndClipRef param_4 = _886;
+    AnnoEndClipRef param_4 = _816;
    AnnoEndClip param_5 = s;
    AnnoEndClip_write(param_3, param_4, param_5);
 }
@ -537,8 +506,8 @@ void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s)
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef _904 = { ix * 36u };
-    ElementRef ref = _904;
+    ElementRef _834 = { ix * 36u };
+    ElementRef ref = _834;
    ElementRef param = ref;
    uint tag_word = Element_tag(param).tag;
    uint param_1 = tag_word;
@ -575,11 +544,11 @@ void comp_main()
    DrawMonoid row = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0u)
    {
-        DrawMonoid _1010;
-        _1010.path_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
-        _1010.clip_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
-        row.path_ix = _1010.path_ix;
-        row.clip_ix = _1010.clip_ix;
+        DrawMonoid _940;
+        _940.path_ix = _934.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
+        _940.clip_ix = _934.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
+        row.path_ix = _940.path_ix;
+        row.clip_ix = _940.clip_ix;
    }
    if (gl_LocalInvocationID.x > 0u)
    {
@ -588,9 +557,10 @@ void comp_main()
        row = combine_tag_monoid(param_10, param_11);
    }
    uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_1038.Load(44) >> uint(2)) + (out_ix * 2u);
-    AnnotatedRef _1054 = { _1038.Load(32) + (out_ix * 40u) };
-    AnnotatedRef out_ref = _1054;
+    uint out_base = (_968.Load(44) >> uint(2)) + (out_ix * 2u);
+    uint clip_out_base = _968.Load(48) >> uint(2);
+    AnnotatedRef _989 = { _968.Load(32) + (out_ix * 40u) };
+    AnnotatedRef out_ref = _989;
    float4 mat;
    float2 translate;
    AnnoColor anno_fill;
@ -600,39 +570,43 @@ void comp_main()
    AnnoImage anno_img;
    Alloc param_28;
    AnnoBeginClip anno_begin_clip;
-    Alloc param_33;
+    Alloc param_32;
    AnnoEndClip anno_end_clip;
-    Alloc param_38;
+    Alloc param_36;
    for (uint i_2 = 0u; i_2 < 8u; i_2++)
    {
-        DrawMonoid param_12 = row;
-        DrawMonoid param_13 = local[i_2];
-        DrawMonoid m = combine_tag_monoid(param_12, param_13);
-        _201.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
-        _201.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
+        DrawMonoid m = row;
+        if (i_2 > 0u)
+        {
+            DrawMonoid param_12 = m;
+            DrawMonoid param_13 = local[i_2 - 1u];
+            m = combine_tag_monoid(param_12, param_13);
+        }
+        _187.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
+        _187.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
        ElementRef param_14 = ref;
        uint param_15 = i_2;
        ElementRef this_ref = Element_index(param_14, param_15);
        ElementRef param_16 = this_ref;
        tag_word = Element_tag(param_16).tag;
-        if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u))
+        if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
        {
-            uint bbox_offset = (_1038.Load(40) >> uint(2)) + (6u * (m.path_ix - 1u));
-            float bbox_l = float(_201.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_201.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_201.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_201.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_968.Load(40) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_187.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_187.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_187.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_187.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
            float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_201.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_187.Load((bbox_offset + 4u) * 4 + 8));
            uint fill_mode = uint(linewidth >= 0.0f);
            if ((linewidth >= 0.0f) || (tag_word == 5u))
            {
-                uint trans_ix = _201.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_1038.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_201.Load(t * 4 + 8), _201.Load((t + 1u) * 4 + 8), _201.Load((t + 2u) * 4 + 8), _201.Load((t + 3u) * 4 + 8)));
+                uint trans_ix = _187.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_968.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_187.Load(t * 4 + 8), _187.Load((t + 1u) * 4 + 8), _187.Load((t + 2u) * 4 + 8), _187.Load((t + 3u) * 4 + 8)));
                if (tag_word == 5u)
                {
-                    translate = asfloat(uint2(_201.Load((t + 4u) * 4 + 8), _201.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_187.Load((t + 4u) * 4 + 8), _187.Load((t + 5u) * 4 + 8)));
                }
            }
            if (linewidth >= 0.0f)
@ -649,9 +623,9 @@ void comp_main()
                    anno_fill.bbox = bbox;
                    anno_fill.linewidth = linewidth;
                    anno_fill.rgba_color = fill.rgba_color;
-                    Alloc _1257;
-                    _1257.offset = _1038.Load(32);
-                    param_18.offset = _1257.offset;
+                    Alloc _1203;
+                    _1203.offset = _968.Load(32);
+                    param_18.offset = _1203.offset;
                    AnnotatedRef param_19 = out_ref;
                    uint param_20 = fill_mode;
                    AnnoColor param_21 = anno_fill;
@ -674,9 +648,9 @@ void comp_main()
                    anno_lin.line_x = line_x;
                    anno_lin.line_y = line_y;
                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    Alloc _1353;
-                    _1353.offset = _1038.Load(32);
-                    param_23.offset = _1353.offset;
+                    Alloc _1299;
+                    _1299.offset = _968.Load(32);
+                    param_23.offset = _1299.offset;
                    AnnotatedRef param_24 = out_ref;
                    uint param_25 = fill_mode;
                    AnnoLinGradient param_26 = anno_lin;
@ -691,48 +665,51 @@ void comp_main()
                    anno_img.linewidth = linewidth;
                    anno_img.index = fill_img.index;
                    anno_img.offset = fill_img.offset;
-                    Alloc _1381;
-                    _1381.offset = _1038.Load(32);
-                    param_28.offset = _1381.offset;
+                    Alloc _1327;
+                    _1327.offset = _968.Load(32);
+                    param_28.offset = _1327.offset;
                    AnnotatedRef param_29 = out_ref;
                    uint param_30 = fill_mode;
                    AnnoImage param_31 = anno_img;
                    Annotated_Image_write(param_28, param_29, param_30, param_31);
                    break;
                }
+                case 9u:
+                {
+                    anno_begin_clip.bbox = bbox;
+                    anno_begin_clip.linewidth = 0.0f;
+                    Alloc _1344;
+                    _1344.offset = _968.Load(32);
+                    param_32.offset = _1344.offset;
+                    AnnotatedRef param_33 = out_ref;
+                    uint param_34 = 0u;
+                    AnnoBeginClip param_35 = anno_begin_clip;
+                    Annotated_BeginClip_write(param_32, param_33, param_34, param_35);
+                    break;
+                }
            }
        }
        else
        {
+            if (tag_word == 10u)
+            {
+                anno_end_clip.bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
+                Alloc _1368;
+                _1368.offset = _968.Load(32);
+                param_36.offset = _1368.offset;
+                AnnotatedRef param_37 = out_ref;
+                AnnoEndClip param_38 = anno_end_clip;
+                Annotated_EndClip_write(param_36, param_37, param_38);
+            }
+        }
+        if ((tag_word == 9u) || (tag_word == 10u))
+        {
+            uint path_ix = ~(out_ix + i_2);
            if (tag_word == 9u)
            {
-                ElementRef param_32 = this_ref;
-                Clip begin_clip = Element_BeginClip_read(param_32);
-                anno_begin_clip.bbox = begin_clip.bbox;
-                anno_begin_clip.linewidth = 0.0f;
-                Alloc _1410;
-                _1410.offset = _1038.Load(32);
-                param_33.offset = _1410.offset;
-                AnnotatedRef param_34 = out_ref;
-                uint param_35 = 0u;
-                AnnoBeginClip param_36 = anno_begin_clip;
-                Annotated_BeginClip_write(param_33, param_34, param_35, param_36);
-            }
-            else
-            {
-                if (tag_word == 10u)
-                {
-                    ElementRef param_37 = this_ref;
-                    Clip end_clip = Element_EndClip_read(param_37);
-                    anno_end_clip.bbox = end_clip.bbox;
-                    Alloc _1435;
-                    _1435.offset = _1038.Load(32);
-                    param_38.offset = _1435.offset;
-                    AnnotatedRef param_39 = out_ref;
-                    AnnoEndClip param_40 = anno_end_clip;
-                    Annotated_EndClip_write(param_38, param_39, param_40);
-                }
+                path_ix = m.path_ix;
            }
+            _187.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
        }
        out_ref.offset += 40u;
    }
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@ -87,16 +87,6 @@ struct FillImage
    int2 offset;
 };

-struct ClipRef
-{
-    uint offset;
-};
-
-struct Clip
-{
-    float4 bbox;
-};
-
 struct ElementTag
 {
    uint tag;
@ -217,8 +207,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -233,9 +228,9 @@ struct ConfigBuf
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);

 static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_225)
+ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_211)
 {
-    uint tag_and_flags = v_225.scene[ref.offset >> uint(2)];
+    uint tag_and_flags = v_211.scene[ref.offset >> uint(2)];
    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }

@ -284,20 +279,20 @@ DrawMonoid tag_monoid_identity()
 }

 static inline __attribute__((always_inline))
-FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_225)
+FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_211)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_225.scene[ix + 0u];
+    uint raw0 = v_211.scene[ix + 0u];
    FillColor s;
    s.rgba_color = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_211)
 {
    FillColorRef param = FillColorRef{ ref.offset + 4u };
-    return FillColor_read(param, v_225);
+    return FillColor_read(param, v_211);
 }

 static inline __attribute__((always_inline))
@ -307,7 +302,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }

 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_201)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_187)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -315,61 +310,61 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
    {
        return;
    }
-    v_201.memory[offset] = val;
+    v_187.memory[offset] = val;
 }

 static inline __attribute__((always_inline))
-void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_201)
+void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_187)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_201);
+    write_mem(param_3, param_4, param_5, v_187);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_201);
+    write_mem(param_6, param_7, param_8, v_187);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_201);
+    write_mem(param_9, param_10, param_11, v_187);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_201);
+    write_mem(param_12, param_13, param_14, v_187);
    Alloc param_15 = a;
    uint param_16 = ix + 5u;
    uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17, v_201);
+    write_mem(param_15, param_16, param_17, v_187);
 }

 static inline __attribute__((always_inline))
-void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_201)
+void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_187)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u };
    AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5, v_201);
+    AnnoColor_write(param_3, param_4, param_5, v_187);
 }

 static inline __attribute__((always_inline))
-FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_225)
+FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_211)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_225.scene[ix + 0u];
-    uint raw1 = v_225.scene[ix + 1u];
-    uint raw2 = v_225.scene[ix + 2u];
-    uint raw3 = v_225.scene[ix + 3u];
-    uint raw4 = v_225.scene[ix + 4u];
+    uint raw0 = v_211.scene[ix + 0u];
+    uint raw1 = v_211.scene[ix + 1u];
+    uint raw2 = v_211.scene[ix + 2u];
+    uint raw3 = v_211.scene[ix + 3u];
+    uint raw4 = v_211.scene[ix + 4u];
    FillLinGradient s;
    s.index = raw0;
    s.p0 = float2(as_type<float>(raw1), as_type<float>(raw2));
@ -378,73 +373,73 @@ FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const
 }

 static inline __attribute__((always_inline))
-FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_211)
 {
    FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u };
-    return FillLinGradient_read(param, v_225);
+    return FillLinGradient_read(param, v_211);
 }

 static inline __attribute__((always_inline))
-void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_201)
+void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_187)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_201);
+    write_mem(param_3, param_4, param_5, v_187);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_201);
+    write_mem(param_6, param_7, param_8, v_187);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_201);
+    write_mem(param_9, param_10, param_11, v_187);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_201);
+    write_mem(param_12, param_13, param_14, v_187);
    Alloc param_15 = a;
    uint param_16 = ix + 5u;
    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_201);
+    write_mem(param_15, param_16, param_17, v_187);
    Alloc param_18 = a;
    uint param_19 = ix + 6u;
    uint param_20 = as_type<uint>(s.line_x);
-    write_mem(param_18, param_19, param_20, v_201);
+    write_mem(param_18, param_19, param_20, v_187);
    Alloc param_21 = a;
    uint param_22 = ix + 7u;
    uint param_23 = as_type<uint>(s.line_y);
-    write_mem(param_21, param_22, param_23, v_201);
+    write_mem(param_21, param_22, param_23, v_187);
    Alloc param_24 = a;
    uint param_25 = ix + 8u;
    uint param_26 = as_type<uint>(s.line_c);
-    write_mem(param_24, param_25, param_26, v_201);
+    write_mem(param_24, param_25, param_26, v_187);
 }

 static inline __attribute__((always_inline))
-void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_201)
+void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_187)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u };
    AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5, v_201);
+    AnnoLinGradient_write(param_3, param_4, param_5, v_187);
 }

 static inline __attribute__((always_inline))
-FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_225)
+FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_211)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_225.scene[ix + 0u];
-    uint raw1 = v_225.scene[ix + 1u];
+    uint raw0 = v_211.scene[ix + 0u];
+    uint raw1 = v_211.scene[ix + 1u];
    FillImage s;
    s.index = raw0;
    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@ -452,167 +447,140 @@ FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf&
 }

 static inline __attribute__((always_inline))
-FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_225)
+FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_211)
 {
    FillImageRef param = FillImageRef{ ref.offset + 4u };
-    return FillImage_read(param, v_225);
+    return FillImage_read(param, v_211);
 }

 static inline __attribute__((always_inline))
-void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_201)
+void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_187)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_201);
+    write_mem(param_3, param_4, param_5, v_187);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_201);
+    write_mem(param_6, param_7, param_8, v_187);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_201);
+    write_mem(param_9, param_10, param_11, v_187);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_201);
+    write_mem(param_12, param_13, param_14, v_187);
    Alloc param_15 = a;
    uint param_16 = ix + 5u;
    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_201);
+    write_mem(param_15, param_16, param_17, v_187);
    Alloc param_18 = a;
    uint param_19 = ix + 6u;
    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_18, param_19, param_20, v_201);
+    write_mem(param_18, param_19, param_20, v_187);
 }

 static inline __attribute__((always_inline))
-void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_201)
+void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_187)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u };
    AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5, v_201);
+    AnnoImage_write(param_3, param_4, param_5, v_187);
 }

 static inline __attribute__((always_inline))
-Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_225)
-{
-    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_225.scene[ix + 0u];
-    uint raw1 = v_225.scene[ix + 1u];
-    uint raw2 = v_225.scene[ix + 2u];
-    uint raw3 = v_225.scene[ix + 3u];
-    Clip s;
-    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
-    return s;
-}
-
-static inline __attribute__((always_inline))
-Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_225)
-{
-    ClipRef param = ClipRef{ ref.offset + 4u };
-    return Clip_read(param, v_225);
-}
-
-static inline __attribute__((always_inline))
-void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_201)
+void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_187)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_201);
+    write_mem(param_3, param_4, param_5, v_187);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_201);
+    write_mem(param_6, param_7, param_8, v_187);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_201);
+    write_mem(param_9, param_10, param_11, v_187);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_201);
+    write_mem(param_12, param_13, param_14, v_187);
 }

 static inline __attribute__((always_inline))
-void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_201)
+void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_187)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u };
    AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5, v_201);
+    AnnoBeginClip_write(param_3, param_4, param_5, v_187);
 }

 static inline __attribute__((always_inline))
-Clip Element_EndClip_read(thread const ElementRef& ref, const device SceneBuf& v_225)
-{
-    ClipRef param = ClipRef{ ref.offset + 4u };
-    return Clip_read(param, v_225);
-}
-
-static inline __attribute__((always_inline))
-void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_201)
+void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_201);
+    write_mem(param_3, param_4, param_5, v_187);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_201);
+    write_mem(param_6, param_7, param_8, v_187);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_201);
+    write_mem(param_9, param_10, param_11, v_187);
 }

 static inline __attribute__((always_inline))
-void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_201)
+void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_201);
+    write_mem(param, param_1, param_2, v_187);
    Alloc param_3 = a;
    AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u };
    AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5, v_201);
+    AnnoEndClip_write(param_3, param_4, param_5, v_187);
 }

-kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1038 [[buffer(1)]], const device SceneBuf& v_225 [[buffer(2)]], const device ParentBuf& _1004 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _968 [[buffer(1)]], const device SceneBuf& v_211 [[buffer(2)]], const device ParentBuf& _934 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
    threadgroup DrawMonoid sh_scratch[256];
    uint ix = gl_GlobalInvocationID.x * 8u;
    ElementRef ref = ElementRef{ ix * 36u };
    ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_225).tag;
+    uint tag_word = Element_tag(param, v_211).tag;
    uint param_1 = tag_word;
    DrawMonoid agg = map_tag(param_1);
    spvUnsafeArray<DrawMonoid, 8> local;
@ -622,7 +590,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1
        ElementRef param_2 = ref;
        uint param_3 = i;
        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_225).tag;
+        tag_word = Element_tag(param_4, v_211).tag;
        uint param_5 = tag_word;
        DrawMonoid param_6 = agg;
        DrawMonoid param_7 = map_tag(param_5);
@ -647,9 +615,9 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1
    DrawMonoid row = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0u)
    {
-        uint _1007 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _1004.parent[_1007].path_ix;
-        row.clip_ix = _1004.parent[_1007].clip_ix;
+        uint _937 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _934.parent[_937].path_ix;
+        row.clip_ix = _934.parent[_937].clip_ix;
    }
    if (gl_LocalInvocationID.x > 0u)
    {
@ -658,8 +626,9 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1
        row = combine_tag_monoid(param_10, param_11);
    }
    uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_1038.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
-    AnnotatedRef out_ref = AnnotatedRef{ _1038.conf.anno_alloc.offset + (out_ix * 40u) };
+    uint out_base = (_968.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
+    uint clip_out_base = _968.conf.clip_alloc.offset >> uint(2);
+    AnnotatedRef out_ref = AnnotatedRef{ _968.conf.anno_alloc.offset + (out_ix * 40u) };
    float4 mat;
    float2 translate;
    AnnoColor anno_fill;
@ -669,39 +638,43 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1
    AnnoImage anno_img;
    Alloc param_28;
    AnnoBeginClip anno_begin_clip;
-    Alloc param_33;
+    Alloc param_32;
    AnnoEndClip anno_end_clip;
-    Alloc param_38;
+    Alloc param_36;
    for (uint i_2 = 0u; i_2 < 8u; i_2++)
    {
-        DrawMonoid param_12 = row;
-        DrawMonoid param_13 = local[i_2];
-        DrawMonoid m = combine_tag_monoid(param_12, param_13);
-        v_201.memory[out_base + (i_2 * 2u)] = m.path_ix;
-        v_201.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
+        DrawMonoid m = row;
+        if (i_2 > 0u)
+        {
+            DrawMonoid param_12 = m;
+            DrawMonoid param_13 = local[i_2 - 1u];
+            m = combine_tag_monoid(param_12, param_13);
+        }
+        v_187.memory[out_base + (i_2 * 2u)] = m.path_ix;
+        v_187.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
        ElementRef param_14 = ref;
        uint param_15 = i_2;
        ElementRef this_ref = Element_index(param_14, param_15);
        ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16, v_225).tag;
-        if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u))
+        tag_word = Element_tag(param_16, v_211).tag;
+        if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
        {
-            uint bbox_offset = (_1038.conf.bbox_alloc.offset >> uint(2)) + (6u * (m.path_ix - 1u));
-            float bbox_l = float(v_201.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(v_201.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(v_201.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(v_201.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_968.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(v_187.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(v_187.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(v_187.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(v_187.memory[bbox_offset + 3u]) - 32768.0;
            float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(v_201.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(v_187.memory[bbox_offset + 4u]);
            uint fill_mode = uint(linewidth >= 0.0);
            if ((linewidth >= 0.0) || (tag_word == 5u))
            {
-                uint trans_ix = v_201.memory[bbox_offset + 5u];
-                uint t = (_1038.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(v_201.memory[t], v_201.memory[t + 1u], v_201.memory[t + 2u], v_201.memory[t + 3u]));
+                uint trans_ix = v_187.memory[bbox_offset + 5u];
+                uint t = (_968.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(v_187.memory[t], v_187.memory[t + 1u], v_187.memory[t + 2u], v_187.memory[t + 3u]));
                if (tag_word == 5u)
                {
-                    translate = as_type<float2>(uint2(v_201.memory[t + 4u], v_201.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(v_187.memory[t + 4u], v_187.memory[t + 5u]));
                }
            }
            if (linewidth >= 0.0)
@ -714,21 +687,21 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1
                case 4u:
                {
                    ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17, v_225);
+                    FillColor fill = Element_FillColor_read(param_17, v_211);
                    anno_fill.bbox = bbox;
                    anno_fill.linewidth = linewidth;
                    anno_fill.rgba_color = fill.rgba_color;
-                    param_18.offset = _1038.conf.anno_alloc.offset;
+                    param_18.offset = _968.conf.anno_alloc.offset;
                    AnnotatedRef param_19 = out_ref;
                    uint param_20 = fill_mode;
                    AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_201);
+                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_187);
                    break;
                }
                case 5u:
                {
                    ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_225);
+                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_211);
                    anno_lin.bbox = bbox;
                    anno_lin.linewidth = linewidth;
                    anno_lin.index = lin.index;
@ -741,57 +714,60 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1
                    anno_lin.line_x = line_x;
                    anno_lin.line_y = line_y;
                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    param_23.offset = _1038.conf.anno_alloc.offset;
+                    param_23.offset = _968.conf.anno_alloc.offset;
                    AnnotatedRef param_24 = out_ref;
                    uint param_25 = fill_mode;
                    AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_201);
+                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_187);
                    break;
                }
                case 6u:
                {
                    ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27, v_225);
+                    FillImage fill_img = Element_FillImage_read(param_27, v_211);
                    anno_img.bbox = bbox;
                    anno_img.linewidth = linewidth;
                    anno_img.index = fill_img.index;
                    anno_img.offset = fill_img.offset;
-                    param_28.offset = _1038.conf.anno_alloc.offset;
+                    param_28.offset = _968.conf.anno_alloc.offset;
                    AnnotatedRef param_29 = out_ref;
                    uint param_30 = fill_mode;
                    AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_201);
+                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_187);
+                    break;
+                }
+                case 9u:
+                {
+                    anno_begin_clip.bbox = bbox;
+                    anno_begin_clip.linewidth = 0.0;
+                    param_32.offset = _968.conf.anno_alloc.offset;
+                    AnnotatedRef param_33 = out_ref;
+                    uint param_34 = 0u;
+                    AnnoBeginClip param_35 = anno_begin_clip;
+                    Annotated_BeginClip_write(param_32, param_33, param_34, param_35, v_187);
                    break;
                }
            }
        }
        else
        {
+            if (tag_word == 10u)
+            {
+                anno_end_clip.bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
+                param_36.offset = _968.conf.anno_alloc.offset;
+                AnnotatedRef param_37 = out_ref;
+                AnnoEndClip param_38 = anno_end_clip;
+                Annotated_EndClip_write(param_36, param_37, param_38, v_187);
+            }
+        }
+        if ((tag_word == 9u) || (tag_word == 10u))
+        {
+            uint path_ix = ~(out_ix + i_2);
            if (tag_word == 9u)
            {
-                ElementRef param_32 = this_ref;
-                Clip begin_clip = Element_BeginClip_read(param_32, v_225);
-                anno_begin_clip.bbox = begin_clip.bbox;
-                anno_begin_clip.linewidth = 0.0;
-                param_33.offset = _1038.conf.anno_alloc.offset;
-                AnnotatedRef param_34 = out_ref;
-                uint param_35 = 0u;
-                AnnoBeginClip param_36 = anno_begin_clip;
-                Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_201);
-            }
-            else
-            {
-                if (tag_word == 10u)
-                {
-                    ElementRef param_37 = this_ref;
-                    Clip end_clip = Element_EndClip_read(param_37, v_225);
-                    anno_end_clip.bbox = end_clip.bbox;
-                    param_38.offset = _1038.conf.anno_alloc.offset;
-                    AnnotatedRef param_39 = out_ref;
-                    AnnoEndClip param_40 = anno_end_clip;
-                    Annotated_EndClip_write(param_38, param_39, param_40, v_201);
-                }
+                path_ix = m.path_ix;
            }
+            v_187.memory[clip_out_base + m.clip_ix] = path_ix;
        }
        out_ref.offset += 40u;
    }
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@ -36,8 +36,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@ -66,8 +66,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/draw_reduce.spv
+++ b/piet-gpu/shader/gen/draw_reduce.spv
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@ -117,8 +117,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -457,7 +462,6 @@ void comp_main()
    TileSegRef tile_seg_ref;
    float area[8];
    uint blend_stack[128][8];
-    float blend_alpha_stack[128][8];
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -640,7 +644,6 @@ void comp_main()
                    float4 param_34 = float4(rgba[k_11]);
                    uint _1390 = packsRGB(param_34);
                    blend_stack[d_2][k_11] = _1390;
-                    blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f);
                    rgba[k_11] = 0.0f.xxxx;
                }
                clip_depth++;
@ -655,7 +658,7 @@ void comp_main()
                    uint d_3 = min(clip_depth, 127u);
                    uint param_35 = blend_stack[d_3][k_12];
                    float4 bg = unpacksRGB(param_35);
-                    float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
+                    float4 fg_1 = rgba[k_12] * area[k_12];
                    rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1;
                }
                cmd_ref.offset += 4u;
@ -665,8 +668,8 @@ void comp_main()
            {
                Alloc param_36 = cmd_alloc;
                CmdRef param_37 = cmd_ref;
-                CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref };
-                cmd_ref = _1469;
+                CmdRef _1453 = { Cmd_Jump_read(param_36, param_37).new_ref };
+                cmd_ref = _1453;
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@ -175,8 +175,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -507,7 +512,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
-    spvUnsafeArray<spvUnsafeArray<float, 8>, 128> blend_alpha_stack;
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -687,7 +691,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    float4 param_34 = float4(rgba[k_11]);
                    uint _1390 = packsRGB(param_34);
                    blend_stack[d_2][k_11] = _1390;
-                    blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0);
                    rgba[k_11] = float4(0.0);
                }
                clip_depth++;
@ -702,7 +705,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    uint d_3 = min(clip_depth, 127u);
                    uint param_35 = blend_stack[d_3][k_12];
                    float4 bg = unpacksRGB(param_35);
-                    float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
+                    float4 fg_1 = rgba[k_12] * area[k_12];
                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
                }
                cmd_ref.offset += 4u;
--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@ -117,8 +117,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -457,7 +462,6 @@ void comp_main()
    TileSegRef tile_seg_ref;
    float area[8];
    uint blend_stack[128][8];
-    float blend_alpha_stack[128][8];
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -640,7 +644,6 @@ void comp_main()
                    float4 param_34 = float4(rgba[k_11]);
                    uint _1390 = packsRGB(param_34);
                    blend_stack[d_2][k_11] = _1390;
-                    blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f);
                    rgba[k_11] = 0.0f.xxxx;
                }
                clip_depth++;
@ -655,7 +658,7 @@ void comp_main()
                    uint d_3 = min(clip_depth, 127u);
                    uint param_35 = blend_stack[d_3][k_12];
                    float4 bg = unpacksRGB(param_35);
-                    float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
+                    float4 fg_1 = rgba[k_12] * area[k_12];
                    rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1;
                }
                cmd_ref.offset += 4u;
@ -665,8 +668,8 @@ void comp_main()
            {
                Alloc param_36 = cmd_alloc;
                CmdRef param_37 = cmd_ref;
-                CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref };
-                cmd_ref = _1469;
+                CmdRef _1453 = { Cmd_Jump_read(param_36, param_37).new_ref };
+                cmd_ref = _1453;
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@ -175,8 +175,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -507,7 +512,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
-    spvUnsafeArray<spvUnsafeArray<float, 8>, 128> blend_alpha_stack;
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -687,7 +691,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    float4 param_34 = float4(rgba[k_11]);
                    uint _1390 = packsRGB(param_34);
                    blend_stack[d_2][k_11] = _1390;
-                    blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0);
                    rgba[k_11] = float4(0.0);
                }
                clip_depth++;
@ -702,7 +705,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    uint d_3 = min(clip_depth, 127u);
                    uint param_35 = blend_stack[d_3][k_12];
                    float4 bg = unpacksRGB(param_35);
-                    float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
+                    float4 fg_1 = rgba[k_12] * area[k_12];
                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
                }
                cmd_ref.offset += 4u;
--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
--- a/piet-gpu/shader/gen/path_coarse.hlsl
+++ b/piet-gpu/shader/gen/path_coarse.hlsl
@ -86,8 +86,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/path_coarse.msl
+++ b/piet-gpu/shader/gen/path_coarse.msl
@ -146,8 +146,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/path_coarse.spv
+++ b/piet-gpu/shader/gen/path_coarse.spv
--- a/piet-gpu/shader/gen/pathseg.dxil
+++ b/piet-gpu/shader/gen/pathseg.dxil
--- a/piet-gpu/shader/gen/pathseg.hlsl
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@ -64,8 +64,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -80,7 +85,7 @@ static const Monoid _567 = { 0.0f.xxxx, 0u };
 RWByteAddressBuffer _111 : register(u0, space0);
 ByteAddressBuffer _574 : register(t2, space0);
 ByteAddressBuffer _639 : register(t1, space0);
-ByteAddressBuffer _709 : register(t3, space0);
+ByteAddressBuffer _710 : register(t3, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -356,7 +361,7 @@ uint round_up(float x)
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x * 4u;
-    uint tag_word = _574.Load(((_639.Load(64) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
+    uint tag_word = _574.Load(((_639.Load(84) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
    uint param = tag_word;
    TagMonoid local_tm = reduce_tag(param);
    sh_tag[gl_LocalInvocationID.x] = local_tm;
@ -377,17 +382,17 @@ void comp_main()
    TagMonoid tm = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0u)
    {
-        TagMonoid _715;
-        _715.trans_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 0);
-        _715.linewidth_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 4);
-        _715.pathseg_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 8);
-        _715.path_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 12);
-        _715.pathseg_offset = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 16);
-        tm.trans_ix = _715.trans_ix;
-        tm.linewidth_ix = _715.linewidth_ix;
-        tm.pathseg_ix = _715.pathseg_ix;
-        tm.path_ix = _715.path_ix;
-        tm.pathseg_offset = _715.pathseg_offset;
+        TagMonoid _716;
+        _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0);
+        _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4);
+        _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8);
+        _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12);
+        _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16);
+        tm.trans_ix = _716.trans_ix;
+        tm.linewidth_ix = _716.linewidth_ix;
+        tm.pathseg_ix = _716.pathseg_ix;
+        tm.path_ix = _716.path_ix;
+        tm.pathseg_offset = _716.pathseg_offset;
    }
    if (gl_LocalInvocationID.x > 0u)
    {
@ -395,14 +400,14 @@ void comp_main()
        TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
        tm = combine_tag_monoid(param_3, param_4);
    }
-    uint ps_ix = (_639.Load(68) >> uint(2)) + tm.pathseg_offset;
-    uint lw_ix = (_639.Load(60) >> uint(2)) + tm.linewidth_ix;
+    uint ps_ix = (_639.Load(88) >> uint(2)) + tm.pathseg_offset;
+    uint lw_ix = (_639.Load(80) >> uint(2)) + tm.linewidth_ix;
    uint save_path_ix = tm.path_ix;
    uint trans_ix = tm.trans_ix;
-    TransformSegRef _770 = { _639.Load(36) + (trans_ix * 24u) };
-    TransformSegRef trans_ref = _770;
-    PathSegRef _780 = { _639.Load(28) + (tm.pathseg_ix * 52u) };
-    PathSegRef ps_ref = _780;
+    TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) };
+    TransformSegRef trans_ref = _771;
+    PathSegRef _781 = { _639.Load(28) + (tm.pathseg_ix * 52u) };
+    PathSegRef ps_ref = _781;
    float linewidth[4];
    uint save_trans_ix[4];
    float2 p0;
@ -455,9 +460,9 @@ void comp_main()
                    }
                }
            }
-            Alloc _876;
-            _876.offset = _639.Load(36);
-            param_13.offset = _876.offset;
+            Alloc _877;
+            _877.offset = _639.Load(36);
+            param_13.offset = _877.offset;
            TransformSegRef param_14 = trans_ref;
            TransformSeg transform = TransformSeg_read(param_13, param_14);
            p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate;
@ -466,25 +471,25 @@ void comp_main()
            if (seg_type >= 2u)
            {
                p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _946 = bbox;
-                float2 _949 = min(_946.xy, p2);
-                bbox.x = _949.x;
-                bbox.y = _949.y;
-                float4 _954 = bbox;
-                float2 _957 = max(_954.zw, p2);
-                bbox.z = _957.x;
-                bbox.w = _957.y;
+                float4 _947 = bbox;
+                float2 _950 = min(_947.xy, p2);
+                bbox.x = _950.x;
+                bbox.y = _950.y;
+                float4 _955 = bbox;
+                float2 _958 = max(_955.zw, p2);
+                bbox.z = _958.x;
+                bbox.w = _958.y;
                if (seg_type == 3u)
                {
                    p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _982 = bbox;
-                    float2 _985 = min(_982.xy, p3);
-                    bbox.x = _985.x;
-                    bbox.y = _985.y;
-                    float4 _990 = bbox;
-                    float2 _993 = max(_990.zw, p3);
-                    bbox.z = _993.x;
-                    bbox.w = _993.y;
+                    float4 _983 = bbox;
+                    float2 _986 = min(_983.xy, p3);
+                    bbox.x = _986.x;
+                    bbox.y = _986.y;
+                    float4 _991 = bbox;
+                    float2 _994 = max(_991.zw, p3);
+                    bbox.z = _994.x;
+                    bbox.w = _994.y;
                }
                else
                {
@ -515,9 +520,9 @@ void comp_main()
            cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1;
            cubic.stroke = stroke;
            uint fill_mode = uint(linewidth[i_1] >= 0.0f);
-            Alloc _1088;
-            _1088.offset = _639.Load(28);
-            param_15.offset = _1088.offset;
+            Alloc _1089;
+            _1089.offset = _639.Load(28);
+            param_15.offset = _1089.offset;
            PathSegRef param_16 = ps_ref;
            uint param_17 = fill_mode;
            PathCubic param_18 = cubic;
@ -574,17 +579,17 @@ void comp_main()
        Monoid param_24 = local[i_4];
        Monoid m = combine_monoid(param_23, param_24);
        bool do_atomic = false;
-        bool _1263 = i_4 == 3u;
-        bool _1269;
-        if (_1263)
+        bool _1264 = i_4 == 3u;
+        bool _1270;
+        if (_1264)
        {
-            _1269 = gl_LocalInvocationID.x == 255u;
+            _1270 = gl_LocalInvocationID.x == 255u;
        }
        else
        {
-            _1269 = _1263;
+            _1270 = _1264;
        }
-        if (_1269)
+        if (_1270)
        {
            do_atomic = true;
        }
@ -612,30 +617,30 @@ void comp_main()
        }
        if (do_atomic)
        {
-            bool _1334 = m.bbox.z > m.bbox.x;
-            bool _1343;
-            if (!_1334)
+            bool _1335 = m.bbox.z > m.bbox.x;
+            bool _1344;
+            if (!_1335)
            {
-                _1343 = m.bbox.w > m.bbox.y;
+                _1344 = m.bbox.w > m.bbox.y;
            }
            else
            {
-                _1343 = _1334;
+                _1344 = _1335;
            }
-            if (_1343)
+            if (_1344)
            {
                float param_29 = m.bbox.x;
-                uint _1352;
-                _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1352);
+                uint _1353;
+                _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353);
                float param_30 = m.bbox.y;
-                uint _1360;
-                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1360);
+                uint _1361;
+                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361);
                float param_31 = m.bbox.z;
-                uint _1368;
-                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1368);
+                uint _1369;
+                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369);
                float param_32 = m.bbox.w;
-                uint _1376;
-                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1376);
+                uint _1377;
+                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377);
            }
            bbox_out_ix += 6u;
        }
--- a/piet-gpu/shader/gen/pathseg.msl
+++ b/piet-gpu/shader/gen/pathseg.msl
@ -129,8 +129,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -430,7 +435,7 @@ uint round_up(thread const float& x)
    return uint(fast::min(65535.0, ceil(x) + 32768.0));
 }

-kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _709 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
    threadgroup TagMonoid sh_tag[256];
    threadgroup Monoid sh_scratch[256];
@ -456,12 +461,12 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6
    TagMonoid tm = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0u)
    {
-        uint _712 = gl_WorkGroupID.x - 1u;
-        tm.trans_ix = _709.parent[_712].trans_ix;
-        tm.linewidth_ix = _709.parent[_712].linewidth_ix;
-        tm.pathseg_ix = _709.parent[_712].pathseg_ix;
-        tm.path_ix = _709.parent[_712].path_ix;
-        tm.pathseg_offset = _709.parent[_712].pathseg_offset;
+        uint _713 = gl_WorkGroupID.x - 1u;
+        tm.trans_ix = _710.parent[_713].trans_ix;
+        tm.linewidth_ix = _710.parent[_713].linewidth_ix;
+        tm.pathseg_ix = _710.parent[_713].pathseg_ix;
+        tm.path_ix = _710.parent[_713].path_ix;
+        tm.pathseg_offset = _710.parent[_713].pathseg_offset;
    }
    if (gl_LocalInvocationID.x > 0u)
    {
@ -536,25 +541,25 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6
            if (seg_type >= 2u)
            {
                p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _946 = bbox;
-                float2 _949 = fast::min(_946.xy, p2);
-                bbox.x = _949.x;
-                bbox.y = _949.y;
-                float4 _954 = bbox;
-                float2 _957 = fast::max(_954.zw, p2);
-                bbox.z = _957.x;
-                bbox.w = _957.y;
+                float4 _947 = bbox;
+                float2 _950 = fast::min(_947.xy, p2);
+                bbox.x = _950.x;
+                bbox.y = _950.y;
+                float4 _955 = bbox;
+                float2 _958 = fast::max(_955.zw, p2);
+                bbox.z = _958.x;
+                bbox.w = _958.y;
                if (seg_type == 3u)
                {
                    p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _982 = bbox;
-                    float2 _985 = fast::min(_982.xy, p3);
-                    bbox.x = _985.x;
-                    bbox.y = _985.y;
-                    float4 _990 = bbox;
-                    float2 _993 = fast::max(_990.zw, p3);
-                    bbox.z = _993.x;
-                    bbox.w = _993.y;
+                    float4 _983 = bbox;
+                    float2 _986 = fast::min(_983.xy, p3);
+                    bbox.x = _986.x;
+                    bbox.y = _986.y;
+                    float4 _991 = bbox;
+                    float2 _994 = fast::max(_991.zw, p3);
+                    bbox.z = _994.x;
+                    bbox.w = _994.y;
                }
                else
                {
@ -642,17 +647,17 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6
        Monoid param_24 = local[i_4];
        Monoid m = combine_monoid(param_23, param_24);
        bool do_atomic = false;
-        bool _1263 = i_4 == 3u;
-        bool _1269;
-        if (_1263)
+        bool _1264 = i_4 == 3u;
+        bool _1270;
+        if (_1264)
        {
-            _1269 = gl_LocalInvocationID.x == 255u;
+            _1270 = gl_LocalInvocationID.x == 255u;
        }
        else
        {
-            _1269 = _1263;
+            _1270 = _1264;
        }
-        if (_1269)
+        if (_1270)
        {
            do_atomic = true;
        }
@ -680,26 +685,26 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6
        }
        if (do_atomic)
        {
-            bool _1334 = m.bbox.z > m.bbox.x;
-            bool _1343;
-            if (!_1334)
+            bool _1335 = m.bbox.z > m.bbox.x;
+            bool _1344;
+            if (!_1335)
            {
-                _1343 = m.bbox.w > m.bbox.y;
+                _1344 = m.bbox.w > m.bbox.y;
            }
            else
            {
-                _1343 = _1334;
+                _1344 = _1335;
            }
-            if (_1343)
+            if (_1344)
            {
                float param_29 = m.bbox.x;
-                uint _1352 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
+                uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
                float param_30 = m.bbox.y;
-                uint _1360 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
+                uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
                float param_31 = m.bbox.z;
-                uint _1368 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
+                uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
                float param_32 = m.bbox.w;
-                uint _1376 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
+                uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
            }
            bbox_out_ix += 6u;
        }
--- a/piet-gpu/shader/gen/pathseg.spv
+++ b/piet-gpu/shader/gen/pathseg.spv
--- a/piet-gpu/shader/gen/pathtag_reduce.dxil
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
--- a/piet-gpu/shader/gen/pathtag_reduce.hlsl
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@ -26,8 +26,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -37,9 +42,9 @@ struct Config
 static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u);

 ByteAddressBuffer _139 : register(t1, space0);
-ByteAddressBuffer _150 : register(t2, space0);
-RWByteAddressBuffer _237 : register(u3, space0);
-RWByteAddressBuffer _257 : register(u0, space0);
+ByteAddressBuffer _151 : register(t2, space0);
+RWByteAddressBuffer _238 : register(u3, space0);
+RWByteAddressBuffer _258 : register(u0, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -83,13 +88,13 @@ TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b)
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x * 2u;
-    uint scene_ix = (_139.Load(64) >> uint(2)) + ix;
-    uint tag_word = _150.Load(scene_ix * 4 + 0);
+    uint scene_ix = (_139.Load(84) >> uint(2)) + ix;
+    uint tag_word = _151.Load(scene_ix * 4 + 0);
    uint param = tag_word;
    TagMonoid agg = reduce_tag(param);
    for (uint i = 1u; i < 2u; i++)
    {
-        tag_word = _150.Load((scene_ix + i) * 4 + 0);
+        tag_word = _151.Load((scene_ix + i) * 4 + 0);
        uint param_1 = tag_word;
        TagMonoid param_2 = agg;
        TagMonoid param_3 = reduce_tag(param_1);
@ -111,11 +116,11 @@ void comp_main()
    }
    if (gl_LocalInvocationID.x == 0u)
    {
-        _237.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix);
-        _237.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix);
-        _237.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix);
-        _237.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix);
-        _237.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset);
+        _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix);
+        _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset);
    }
 }

--- a/piet-gpu/shader/gen/pathtag_reduce.msl
+++ b/piet-gpu/shader/gen/pathtag_reduce.msl
@ -33,8 +33,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -103,17 +108,17 @@ TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid&
    return c;
 }

-kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _150 [[buffer(2)]], device OutBuf& _237 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
    threadgroup TagMonoid sh_scratch[128];
    uint ix = gl_GlobalInvocationID.x * 2u;
    uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix;
-    uint tag_word = _150.scene[scene_ix];
+    uint tag_word = _151.scene[scene_ix];
    uint param = tag_word;
    TagMonoid agg = reduce_tag(param);
    for (uint i = 1u; i < 2u; i++)
    {
-        tag_word = _150.scene[scene_ix + i];
+        tag_word = _151.scene[scene_ix + i];
        uint param_1 = tag_word;
        TagMonoid param_2 = agg;
        TagMonoid param_3 = reduce_tag(param_1);
@ -135,11 +140,11 @@ kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device Scene
    }
    if (gl_LocalInvocationID.x == 0u)
    {
-        _237.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix;
-        _237.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix;
-        _237.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix;
-        _237.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
-        _237.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset;
+        _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix;
+        _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix;
+        _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix;
+        _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset;
    }
 }

--- a/piet-gpu/shader/gen/pathtag_reduce.spv
+++ b/piet-gpu/shader/gen/pathtag_reduce.spv
--- a/piet-gpu/shader/gen/tile_alloc.hlsl
+++ b/piet-gpu/shader/gen/tile_alloc.hlsl
@ -60,8 +60,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/tile_alloc.msl
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@ -81,8 +81,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/tile_alloc.spv
+++ b/piet-gpu/shader/gen/tile_alloc.spv
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@ -39,8 +39,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -150,7 +155,7 @@ void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s)
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _285 = { _278.Load(56) + (ix * 24u) };
+    TransformRef _285 = { _278.Load(76) + (ix * 24u) };
    TransformRef ref = _285;
    TransformRef param = ref;
    Transform agg = Transform_read(param);
--- a/piet-gpu/shader/gen/transform_leaf.msl
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@ -102,8 +102,13 @@ struct Config
    Alloc_1 trans_alloc;
    Alloc_1 bbox_alloc;
    Alloc_1 drawmonoid_alloc;
+    Alloc_1 clip_alloc;
+    Alloc_1 clip_bic_alloc;
+    Alloc_1 clip_stack_alloc;
+    Alloc_1 clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/transform_leaf.spv
+++ b/piet-gpu/shader/gen/transform_leaf.spv
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@ -28,8 +28,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
@ -87,7 +92,7 @@ Transform combine_monoid(Transform a, Transform b)
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _168 = { _161.Load(56) + (ix * 24u) };
+    TransformRef _168 = { _161.Load(76) + (ix * 24u) };
    TransformRef ref = _168;
    TransformRef param = ref;
    Transform agg = Transform_read(param);
--- a/piet-gpu/shader/gen/transform_reduce.msl
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@ -40,8 +40,13 @@ struct Config
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
+    Alloc clip_alloc;
+    Alloc clip_bic_alloc;
+    Alloc clip_stack_alloc;
+    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
+    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
--- a/piet-gpu/shader/gen/transform_reduce.spv
+++ b/piet-gpu/shader/gen/transform_reduce.spv
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -91,7 +91,6 @@ void main() {
    vec2 xy = vec2(xy_uint);
    mediump vec4 rgba[CHUNK];
    uint blend_stack[MAX_BLEND_STACK][CHUNK];
-    mediump float blend_alpha_stack[MAX_BLEND_STACK][CHUNK];
    for (uint i = 0; i < CHUNK; i++) {
        rgba[i] = vec4(0.0);
    }
@ -211,7 +210,6 @@ void main() {
                // The following is a sanity check so we don't corrupt memory should there be malformed inputs.
                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
                blend_stack[d][k] = packsRGB(vec4(rgba[k]));
-                blend_alpha_stack[d][k] = clamp(abs(area[k]), 0.0, 1.0);
                rgba[k] = vec4(0.0);
            }
            clip_depth++;
@ -222,7 +220,7 @@ void main() {
            for (uint k = 0; k < CHUNK; k++) {
                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
                mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
-                mediump vec4 fg = rgba[k] * area[k] * blend_alpha_stack[d][k];
+                mediump vec4 fg = rgba[k] * area[k];
                rgba[k] = bg * (1.0 - fg.a) + fg;
            }
            cmd_ref.offset += 4;
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@ -46,11 +46,23 @@ struct Config {
    // Monoid for draw objects
    Alloc drawmonoid_alloc;

+    // BeginClip(path_ix) / EndClip
+    Alloc clip_alloc;
+    // Intermediate bicyclic semigroup
+    Alloc clip_bic_alloc;
+    // Intermediate stack
+    Alloc clip_stack_alloc;
+    // Clip processing results (path_ix + bbox)
+    Alloc clip_bbox_alloc;
+
    // Number of transforms in scene
    // This is probably not needed.
    uint n_trans;
-    // This only counts actual paths, not EndClip.
+    // This *should* count only actual paths, but in the current
+    // implementation is redundant with n_elements.
    uint n_path;
+    // Total number of BeginClip and EndClip draw objects.
+    uint n_clip;
    // Offset (in bytes) of transform stream in scene buffer
    uint trans_offset;
    // Offset (in bytes) of linewidth stream in scene
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@ -20,7 +20,8 @@ use bytemuck::{Pod, Zeroable};
 use piet_gpu_hal::BufWrite;

 use crate::stages::{
-    self, Config, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE,
+    self, Config, PathEncoder, Transform, CLIP_PART_SIZE, DRAW_PART_SIZE, PATHSEG_PART_SIZE,
+    TRANSFORM_PART_SIZE,
 };

 pub struct Encoder {
@ -31,6 +32,7 @@ pub struct Encoder {
    drawobj_stream: Vec<u8>,
    n_path: u32,
    n_pathseg: u32,
+    n_clip: u32,
 }

 /// A scene fragment encoding a glyph.
@ -98,6 +100,7 @@ impl Encoder {
            drawobj_stream: Vec::new(),
            n_path: 0,
            n_pathseg: 0,
+            n_clip: 0,
        }
    }

@ -155,6 +158,7 @@ impl Encoder {
            ..Default::default()
        };
        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
+        self.n_clip += 1;
        saved
    }

@ -170,6 +174,7 @@ impl Encoder {
        // This is a dummy path, and will go away with the new clip impl.
        self.tag_stream.push(0x10);
        self.n_path += 1;
+        self.n_clip += 1;
    }

    /// Return a config for the element processing pipeline.
@ -203,6 +208,20 @@ impl Encoder {
        alloc += n_drawobj_padded * DRAWMONOID_SIZE;
        let anno_alloc = alloc;
        alloc += n_drawobj * ANNOTATED_SIZE;
+        let clip_alloc = alloc;
+        let n_clip = self.n_clip as usize;
+        const CLIP_SIZE: usize = 4;
+        alloc += n_clip * CLIP_SIZE;
+        let clip_bic_alloc = alloc;
+        const CLIP_BIC_SIZE: usize = 8;
+        // This can round down, as we only reduce the prefix
+        alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
+        let clip_stack_alloc = alloc;
+        const CLIP_EL_SIZE: usize = 20;
+        alloc += n_clip * CLIP_EL_SIZE;
+        let clip_bbox_alloc = alloc;
+        const CLIP_BBOX_SIZE: usize = 16;
+        alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;

        let config = Config {
            n_elements: n_drawobj as u32,
@ -212,8 +231,13 @@ impl Encoder {
            trans_alloc: trans_alloc as u32,
            bbox_alloc: bbox_alloc as u32,
            drawmonoid_alloc: drawmonoid_alloc as u32,
+            clip_alloc: clip_alloc as u32,
+            clip_bic_alloc: clip_bic_alloc as u32,
+            clip_stack_alloc: clip_stack_alloc as u32,
+            clip_bbox_alloc: clip_bbox_alloc as u32,
            n_trans: n_trans as u32,
            n_path: self.n_path,
+            n_clip: self.n_clip,
            trans_offset: trans_offset as u32,
            linewidth_offset: linewidth_offset as u32,
            pathtag_offset: pathtag_offset as u32,
@ -261,6 +285,10 @@ impl Encoder {
        self.tag_stream.len()
    }

+    pub(crate) fn n_clip(&self) -> u32 {
+        self.n_clip
+    }
+
    pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
        self.tag_stream.extend(&glyph.tag_stream);
        self.pathseg_stream.extend(&glyph.pathseg_stream);
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -20,9 +20,9 @@ use piet_gpu_hal::{
 };

 use pico_svg::PicoSvg;
-use stages::{ElementBinding, ElementCode};
+use stages::{ClipBinding, ElementBinding, ElementCode};

-use crate::stages::{Config, ElementStage};
+use crate::stages::{ClipCode, Config, ElementStage};

 const TILE_W: usize = 16;
 const TILE_H: usize = 16;
@ -86,6 +86,9 @@ pub struct Renderer {
    element_stage: ElementStage,
    element_bindings: Vec<ElementBinding>,

+    clip_code: ClipCode,
+    clip_binding: ClipBinding,
+
    tile_pipeline: Pipeline,
    tile_ds: DescriptorSet,

@ -110,6 +113,7 @@ pub struct Renderer {
    n_paths: usize,
    n_pathseg: usize,
    n_pathtag: usize,
+    n_clip: u32,

    // Keep a reference to the image so that it is not destroyed.
    _bg_image: Image,
@ -191,18 +195,20 @@ impl Renderer {
        let element_stage = ElementStage::new(session, &element_code);
        let element_bindings = scene_bufs
            .iter()
-            .zip(&config_bufs)
-            .map(|(scene_buf, config_buf)| {
+            .map(|scene_buf| {
                element_stage.bind(
                    session,
                    &element_code,
-                    config_buf,
+                    &config_buf,
                    scene_buf,
                    &memory_buf_dev,
                )
            })
            .collect();

+        let clip_code = ClipCode::new(session);
+        let clip_binding = ClipBinding::new(session, &clip_code, &config_buf, &memory_buf_dev);
+
        let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc");
        let tile_pipeline = session
            .create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::BufReadOnly])?;
@ -286,6 +292,8 @@ impl Renderer {
            element_code,
            element_stage,
            element_bindings,
+            clip_code,
+            clip_binding,
            tile_pipeline,
            tile_ds,
            path_pipeline,
@ -304,6 +312,7 @@ impl Renderer {
            n_paths: 0,
            n_pathseg: 0,
            n_pathtag: 0,
+            n_clip: 0,
            _bg_image: bg_image,
            gradient_bufs,
            gradients,
@ -329,6 +338,7 @@ impl Renderer {
        self.n_drawobj = render_ctx.n_drawobj();
        self.n_pathseg = render_ctx.n_pathseg() as usize;
        self.n_pathtag = render_ctx.n_pathtag();
+        self.n_clip = render_ctx.n_clip();

        // These constants depend on encoding and may need to be updated.
        // Perhaps we can plumb these from piet-gpu-derive?
@ -342,6 +352,7 @@ impl Renderer {
        alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
        let ptcl_base = alloc;
        alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
+
        config.width_in_tiles = width_in_tiles as u32;
        config.height_in_tiles = height_in_tiles as u32;
        config.tile_alloc = tile_base as u32;
@ -401,6 +412,19 @@ impl Renderer {
        cmd_buf.end_debug_label();
        cmd_buf.write_timestamp(&query_pool, 1);
        cmd_buf.memory_barrier();
+        cmd_buf.begin_debug_label("Clip bounding box calculation");
+        self.clip_binding
+            .record(cmd_buf, &self.clip_code, self.n_clip as u32);
+        cmd_buf.end_debug_label();
+        cmd_buf.begin_debug_label("Element binning");
+        cmd_buf.dispatch(
+            &self.bin_pipeline,
+            &self.bin_ds,
+            (((self.n_paths + 255) / 256) as u32, 1, 1),
+            (256, 1, 1),
+        );
+        cmd_buf.end_debug_label();
+        cmd_buf.memory_barrier();
        cmd_buf.begin_debug_label("Tile allocation");
        cmd_buf.dispatch(
            &self.tile_pipeline,
@ -430,18 +454,7 @@ impl Renderer {
        );
        cmd_buf.end_debug_label();
        cmd_buf.write_timestamp(&query_pool, 4);
-        // Note: this barrier is not needed as an actual dependency between
-        // pipeline stages, but I am keeping it in so that timer queries are
-        // easier to interpret.
-        cmd_buf.memory_barrier();
-        cmd_buf.begin_debug_label("Element binning");
-        cmd_buf.dispatch(
-            &self.bin_pipeline,
-            &self.bin_ds,
-            (((self.n_paths + 255) / 256) as u32, 1, 1),
-            (256, 1, 1),
-        );
-        cmd_buf.end_debug_label();
+        // TODO: redo query accounting
        cmd_buf.write_timestamp(&query_pool, 5);
        cmd_buf.memory_barrier();
        cmd_buf.begin_debug_label("Coarse raster");
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@ -123,6 +123,10 @@ impl PietGpuRenderContext {
        self.new_encoder.n_transform()
    }

+    pub fn n_clip(&self) -> u32 {
+        self.new_encoder.n_clip()
+    }
+
    pub fn write_scene(&self, buf: &mut BufWrite) {
        self.new_encoder.write_scene(buf);
    }
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@ -16,12 +16,14 @@

 //! Stages for new element pipeline, exposed for testing.

+mod clip;
 mod draw;
 mod path;
 mod transform;

 use bytemuck::{Pod, Zeroable};

+pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
 pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
 pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
 use piet_gpu_hal::{Buffer, CmdBuf, Session};
@ -47,8 +49,13 @@ pub struct Config {
    pub trans_alloc: u32,
    pub bbox_alloc: u32,
    pub drawmonoid_alloc: u32,
+    pub clip_alloc: u32,
+    pub clip_bic_alloc: u32,
+    pub clip_stack_alloc: u32,
+    pub clip_bbox_alloc: u32,
    pub n_trans: u32,
    pub n_path: u32,
+    pub n_clip: u32,
    pub trans_offset: u32,
    pub linewidth_offset: u32,
    pub pathtag_offset: u32,
--- a/piet-gpu/src/stages/clip.rs
+++ b/piet-gpu/src/stages/clip.rs
@ -0,0 +1,94 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! The clip processing stage (includes substages).
+
+use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
+
+// Note that this isn't the code/stage/binding pattern of most of the other stages
+// in the new element processing pipeline. We want to move those temporary buffers
+// into common memory and converge on this pattern.
+pub struct ClipCode { // Compute pipelines for the two clip shaders.
+    reduce_pipeline: Pipeline, // built from `shader/gen/clip_reduce` in `ClipCode::new`
+    leaf_pipeline: Pipeline, // built from `shader/gen/clip_leaf` in `ClipCode::new`
+}
+
+pub struct ClipBinding { // Descriptor sets, one per pipeline in `ClipCode`.
+    reduce_ds: DescriptorSet, // created against `ClipCode::reduce_pipeline`
+    leaf_ds: DescriptorSet, // created against `ClipCode::leaf_pipeline`
+}
+
+pub const CLIP_PART_SIZE: u32 = 256; // Partition size: `record` divides `n_clip` by this.
+
+impl ClipCode {
+    pub unsafe fn new(session: &Session) -> ClipCode { // Builds both clip compute pipelines.
+        let reduce_code = include_shader!(session, "../../shader/gen/clip_reduce");
+        let reduce_pipeline = session // bind types: one `Buffer`, then one `BufReadOnly`
+            .create_compute_pipeline(reduce_code, &[BindType::Buffer, BindType::BufReadOnly])
+            .unwrap(); // panics if `create_compute_pipeline` does not succeed
+        let leaf_code = include_shader!(session, "../../shader/gen/clip_leaf");
+        let leaf_pipeline = session // same bind types as the reduce pipeline
+            .create_compute_pipeline(leaf_code, &[BindType::Buffer, BindType::BufReadOnly])
+            .unwrap(); // panics if `create_compute_pipeline` does not succeed
+        ClipCode {
+            reduce_pipeline,
+            leaf_pipeline,
+        }
+    }
+}
+
+impl ClipBinding {
+    pub unsafe fn new(
+        session: &Session,
+        code: &ClipCode,
+        config: &Buffer,
+        memory: &Buffer,
+    ) -> ClipBinding { // One descriptor set per pipeline, buffers in `[memory, config]` order.
+        let reduce_ds = session
+            .create_simple_descriptor_set(&code.reduce_pipeline, &[memory, config])
+            .unwrap(); // panics if descriptor set creation does not succeed
+        let leaf_ds = session
+            .create_simple_descriptor_set(&code.leaf_pipeline, &[memory, config])
+            .unwrap(); // panics if descriptor set creation does not succeed
+        ClipBinding { reduce_ds, leaf_ds }
+    }
+
+    /// Record the clip dispatches (reduce pass if needed, then leaf pass).
+    ///
+    /// Assumes memory barrier on entry. Provides one on exit; records nothing if `n_clip` is 0.
+    pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
+        let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE; // 0 while n_clip fits one partition
+        if n_wg_reduce > 0 {
+            cmd_buf.dispatch(
+                &code.reduce_pipeline,
+                &self.reduce_ds,
+                (n_wg_reduce, 1, 1),
+                (CLIP_PART_SIZE, 1, 1),
+            );
+            cmd_buf.memory_barrier(); // separates the reduce dispatch from the leaf dispatch
+        }
+        let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE; // n_clip / CLIP_PART_SIZE, rounded up
+        if n_wg > 0 {
+            cmd_buf.dispatch(
+                &code.leaf_pipeline,
+                &self.leaf_ds,
+                (n_wg, 1, 1),
+                (CLIP_PART_SIZE, 1, 1),
+            );
+            cmd_buf.memory_barrier(); // the exit barrier promised in the doc comment
+        }
+    }
+}
--- a/tests/src/clip.rs
+++ b/tests/src/clip.rs
@ -0,0 +1,237 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Tests for the piet-gpu clip processing stage.
+
+use bytemuck::{Pod, Zeroable};
+use rand::Rng;
+
+use piet_gpu::stages::{self, ClipBinding, ClipCode, DrawMonoid};
+use piet_gpu_hal::{BufWrite, BufferUsage};
+
+use crate::{Config, Runner, TestResult};
+
+struct ClipData { // Generated input for the clip stage test (see `ClipData::new`).
+    clip_stream: Vec<u32>, // push: index into `path_bbox_stream`; pop: `!(i as u32)`
+    // In the atomic-int friendly encoding (see `decode_bbox`, which subtracts 32768)
+    path_bbox_stream: Vec<PathBbox>,
+}
+
+#[derive(Copy, Clone, Debug, Pod, Zeroable, Default)]
+#[repr(C)]
+struct PathBbox { // `get_config` / `memory_size` budget 24 bytes per entry for this record.
+    bbox: [u32; 4], // [x0, y0, x1, y1] as produced by `rand_bbox`
+    linewidth: f32, // left at its default by `ClipData::new`
+    trans_ix: u32, // left at its default by `ClipData::new`
+}
+
+pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
+    let mut result = TestResult::new("clip");
+    let n_clip: u64 = config.size.choose(1 << 8, 1 << 12, 1 << 16); // 256, 4096 or 65536
+    let data = ClipData::new(n_clip);
+    let stage_config = data.get_config();
+    let config_buf = runner
+        .session
+        .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE)
+        .unwrap();
+    // Need to actually get data uploaded, so the staging buffer is filled before submission.
+    let mut memory = runner.buf_down(data.memory_size(), BufferUsage::STORAGE);
+    { // NOTE(review): `buf_down` makes the staging buffer MAP_READ | COPY_DST only, yet it is
+        let mut buf_write = memory.map_write(..); // mapped for write and uploaded from; confirm.
+        data.fill_memory(&mut buf_write);
+    }
+
+    let code = ClipCode::new(&runner.session);
+    let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
+
+    let mut commands = runner.commands();
+    commands.write_timestamp(0); // timestamps 0 and 1 bracket upload, dispatches and download
+    commands.upload(&memory);
+    binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
+    commands.download(&memory);
+    commands.write_timestamp(1);
+    runner.submit(commands);
+    let dst = memory.map_read(..);
+    if let Some(failure) = data.verify(&dst) { // `verify` returns a message on first mismatch
+        result.fail(failure);
+    }
+    result
+}
+
+fn rand_bbox() -> [u32; 4] { // Random [x0, y0, x1, y1] with x0 <= x1 and y0 <= y1.
+    let mut rng = rand::thread_rng(); // NOTE(review): not the seeded LCG used in `ClipData::new`
+    const Y_MIN: u32 = 32768; // decodes to 0.0 in `decode_bbox`; despite the name, also bounds x
+    const Y_MAX: u32 = Y_MIN + 1000; // presumably exclusive in `gen_range` — TODO confirm
+    let mut x0 = rng.gen_range(Y_MIN, Y_MAX);
+    let mut y0 = rng.gen_range(Y_MIN, Y_MAX);
+    let mut x1 = rng.gen_range(Y_MIN, Y_MAX);
+    let mut y1 = rng.gen_range(Y_MIN, Y_MAX);
+    if x0 > x1 { // order the x pair
+        std::mem::swap(&mut x0, &mut x1);
+    }
+    if y0 > y1 { // order the y pair
+        std::mem::swap(&mut y0, &mut y1);
+    }
+    [x0, y0, x1, y1]
+}
+
+/// Convert from atomic-friendly to normal float bbox by subtracting 32768 from each coordinate.
+fn decode_bbox(raw: [u32; 4]) -> [f32; 4] {
+    fn decode(x: u32) -> f32 { // single-coordinate conversion
+        x as f32 - 32768.0
+    }
+    [
+        decode(raw[0]),
+        decode(raw[1]),
+        decode(raw[2]),
+        decode(raw[3]),
+    ]
+}
+
+fn intersect_bbox(b0: [f32; 4], b1: [f32; 4]) -> [f32; 4] { // Intersection of two bboxes.
+    [
+        b0[0].max(b1[0]), // larger of the two [0] edges
+        b0[1].max(b1[1]), // larger of the two [1] edges
+        b0[2].min(b1[2]), // smaller of the two [2] edges
+        b0[3].min(b1[3]), // smaller of the two [3] edges; result is not clamped if disjoint
+    ]
+}
+
+const INFTY_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9]; // expected by `verify` when no clip is open
+
+impl ClipData {
+    /// Generate a random sequence of `n` clip pushes and pops; trailing pushes may stay open.
+    fn new(n: u64) -> ClipData {
+        // Simple LCG random generator, for deterministic push/pop choices (bboxes use `rand_bbox`)
+        let mut z = 20170705u64;
+        let mut depth = 0; // number of currently open clips
+        let mut path_bbox_stream = Vec::new();
+        let clip_stream = (0..n)
+            .map(|i| {
+                let is_push = if depth == 0 { // nothing open, so a pop is not possible
+                    true
+                } else if depth >= 255 { // cap the nesting depth
+                    false
+                } else {
+                    z = z.wrapping_mul(742938285) % ((1 << 31) - 1); // LCG step
+                    (z % 2) != 0
+                };
+                if is_push {
+                    depth += 1;
+                    let path_ix = path_bbox_stream.len() as u32; // index of the bbox pushed below
+                    let bbox = rand_bbox();
+                    let path_bbox = PathBbox {
+                        bbox,
+                        ..Default::default()
+                    };
+                    path_bbox_stream.push(path_bbox);
+                    path_ix
+                } else {
+                    depth -= 1;
+                    !(i as u32) // pop marker: bitwise NOT of the element index
+                }
+            })
+            .collect();
+        ClipData {
+            clip_stream,
+            path_bbox_stream,
+        }
+    }
+
+    fn get_config(&self) -> stages::Config { // Byte offsets here exclude the 8-byte header.
+        let n_clip = self.clip_stream.len();
+        let n_path = self.path_bbox_stream.len();
+        let clip_alloc = 0;
+        let path_bbox_alloc = clip_alloc + 4 * n_clip; // clip stream: 4 bytes per element
+        let drawmonoid_alloc = path_bbox_alloc + 24 * n_path; // path bboxes: 24 bytes each
+        let clip_bic_alloc = drawmonoid_alloc + 8 * n_clip; // draw monoids: 8 bytes per element
+        // TODO: this is over-allocated, we only need one bic per wg
+        let clip_stack_alloc = clip_bic_alloc + 8 * n_clip;
+        let clip_bbox_alloc = clip_stack_alloc + 20 * n_clip; // stack region: 20 bytes per element
+        stages::Config {
+            clip_alloc: clip_alloc as u32,
+            // TODO: this wants to be renamed to path_bbox_alloc
+            bbox_alloc: path_bbox_alloc as u32,
+            drawmonoid_alloc: drawmonoid_alloc as u32,
+            clip_bic_alloc: clip_bic_alloc as u32,
+            clip_stack_alloc: clip_stack_alloc as u32,
+            clip_bbox_alloc: clip_bbox_alloc as u32,
+            n_clip: n_clip as u32,
+            ..Default::default()
+        }
+    }
+
+    fn memory_size(&self) -> u64 { // 8-byte header plus every region laid out in `get_config`.
+        (8 + self.clip_stream.len() * (4 + 8 + 8 + 20 + 16) + self.path_bbox_stream.len() * 24)
+            as u64
+    }
+
+    fn fill_memory(&self, buf: &mut BufWrite) {
+        // offset / header; no dynamic allocation
+        buf.fill_zero(8);
+        buf.extend_slice(&self.clip_stream);
+        buf.extend_slice(&self.path_bbox_stream);
+        // drawmonoid is left uninitialized, as are the regions after it
+    }
+
+    fn verify(&self, buf: &[u8]) -> Option<String> { // `None` on success, else a mismatch message.
+        let n_clip = self.clip_stream.len();
+        let n_path = self.path_bbox_stream.len();
+        let clip_bbox_start = 8 + n_clip * (4 + 8 + 8 + 20) + n_path * 24; // header + clip_bbox_alloc
+        let clip_range = clip_bbox_start..(clip_bbox_start + n_clip * 16);
+        let clip_result = bytemuck::cast_slice::<u8, [f32; 4]>(&buf[clip_range]);
+        let draw_start = 8 + n_clip * 4 + n_path * 24; // header + drawmonoid_alloc
+        let draw_range = draw_start..(draw_start + n_clip * 8);
+        let draw_result = bytemuck::cast_slice::<u8, DrawMonoid>(&buf[draw_range]);
+        let mut bbox_stack = Vec::new(); // running intersection per open clip
+        let mut parent_stack = Vec::new(); // stream index of each open push
+        for (i, path_ix) in self.clip_stream.iter().enumerate() {
+            let mut expected_path = None;
+            if *path_ix >= 0x8000_0000 { // high bit set: pop marker written by `new`
+                let parent = parent_stack.pop().unwrap(); // `new` never pops at depth 0
+                expected_path = Some(self.clip_stream[parent as usize]); // path of the matching push
+                bbox_stack.pop().unwrap();
+            } else {
+                parent_stack.push(i);
+                let path_bbox_stream = self.path_bbox_stream[*path_ix as usize];
+                let bbox = decode_bbox(path_bbox_stream.bbox);
+                let new = match bbox_stack.last() {
+                    None => bbox,
+                    Some(old) => intersect_bbox(*old, bbox), // nest inside the enclosing clip
+                };
+                bbox_stack.push(new);
+            };
+            let expected = bbox_stack.last().copied().unwrap_or(INFTY_BBOX);
+            let clip_bbox = clip_result[i];
+            if clip_bbox != expected { // exact float comparison
+                return Some(format!(
+                    "{}: path_ix={}, expected bbox={:?}, clip_bbox={:?}",
+                    i, path_ix, expected, clip_bbox
+                ));
+            }
+            if let Some(expected_path) = expected_path { // only pops check the draw monoid
+                let actual_path = draw_result[i].path_ix;
+                if expected_path != actual_path {
+                    return Some(format!(
+                        "{}: expected path {}, actual {}",
+                        i, expected_path, actual_path
+                    ));
+                }
+            }
+        }
+        None
+    }
+}
--- a/tests/src/draw.rs
+++ b/tests/src/draw.rs
@ -102,17 +102,21 @@ impl DrawTestData {
        // Layout of memory
        let drawmonoid_alloc = 0;
        let anno_alloc = drawmonoid_alloc + 8 * n_tags;
+        let clip_alloc = anno_alloc + ANNOTATED_SIZE * n_tags;
        let stage_config = stages::Config {
            n_elements: n_tags as u32,
            anno_alloc: anno_alloc as u32,
            drawmonoid_alloc: drawmonoid_alloc as u32,
+            clip_alloc: clip_alloc as u32,
            ..Default::default()
        };
        stage_config
    }

    fn memory_size(&self) -> u64 {
-        (8 + self.tags.len() * (8 + ANNOTATED_SIZE)) as u64
+        // Note: this overallocates the clip buf a bit - only needed for the
+        // total number of begin_clip and end_clip tags.
+        (8 + self.tags.len() * (8 + 4 + ANNOTATED_SIZE)) as u64
    }

    fn fill_scene(&self, buf: &mut BufWrite) {
@ -128,14 +132,13 @@ impl DrawTestData {
        let actual = bytemuck::cast_slice::<u8, DrawMonoid>(&buf[8..8 + size]);
        let mut expected = DrawMonoid::default();
        for (i, (tag, actual)) in self.tags.iter().zip(actual).enumerate() {
-            // We compute an inclusive prefix sum, but for this application
-            // exclusive would be slightly better. We can adapt though.
+            // Verify exclusive prefix sum.
            let (path_ix, clip_ix) = Self::reduce_tag(*tag);
-            expected.path_ix += path_ix;
-            expected.clip_ix += clip_ix;
            if *actual != expected {
                return Some(format!("draw mismatch at {}", i));
            }
+            expected.path_ix += path_ix;
+            expected.clip_ix += clip_ix;
        }
        None
    }
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@ -17,6 +17,7 @@
 //! Tests for piet-gpu shaders and GPU capabilities.

 mod clear;
+mod clip;
 mod config;
 mod draw;
 mod linkedlist;
@ -139,6 +140,7 @@ fn main() {
            report(&transform::transform_test(&mut runner, &config));
            report(&path::path_test(&mut runner, &config));
            report(&draw::draw_test(&mut runner, &config));
+            report(&clip::clip_test(&mut runner, &config));
        }
    }
 }
--- a/tests/src/runner.rs
+++ b/tests/src/runner.rs
@ -20,8 +20,8 @@ use std::ops::RangeBounds;

 use bytemuck::Pod;
 use piet_gpu_hal::{
-    BackendType, BufReadGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, QueryPool,
-    Session,
+    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
+    QueryPool, Session,
 };

 pub struct Runner {
@ -37,15 +37,8 @@ pub struct Commands {
    query_pool: QueryPool,
 }

-/// Buffer for uploading data to GPU.
-#[allow(unused)]
-pub struct BufUp {
-    pub stage_buf: Buffer,
-    pub dev_buf: Buffer,
-}
-
-/// Buffer for downloading data from GPU.
-pub struct BufDown {
+/// Staging/device buffer pair used for both uploading and downloading
+pub struct BufStage {
    pub stage_buf: Buffer,
    pub dev_buf: Buffer,
 }
@ -92,7 +85,7 @@ impl Runner {
    }

    #[allow(unused)]
-    pub fn buf_up(&self, size: u64) -> BufUp {
+    pub fn buf_up(&self, size: u64) -> BufStage {
        let stage_buf = self
            .session
            .create_buffer(size, BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC)
@ -101,13 +94,13 @@ impl Runner {
            .session
            .create_buffer(size, BufferUsage::COPY_DST | BufferUsage::STORAGE)
            .unwrap();
-        BufUp { stage_buf, dev_buf }
+        BufStage { stage_buf, dev_buf }
    }

    /// Create a buffer for download (readback).
    ///
    /// The `usage` parameter need not include COPY_SRC and STORAGE.
-    pub fn buf_down(&self, size: u64, usage: BufferUsage) -> BufDown {
+    pub fn buf_down(&self, size: u64, usage: BufferUsage) -> BufStage {
        let stage_buf = self
            .session
            .create_buffer(size, BufferUsage::MAP_READ | BufferUsage::COPY_DST)
@ -116,7 +109,7 @@ impl Runner {
            .session
            .create_buffer(size, usage | BufferUsage::COPY_SRC | BufferUsage::STORAGE)
            .unwrap();
-        BufDown { stage_buf, dev_buf }
+        BufStage { stage_buf, dev_buf }
    }

    pub fn backend_type(&self) -> BackendType {
@ -129,17 +122,16 @@ impl Commands {
        self.cmd_buf.write_timestamp(&self.query_pool, query);
    }

-    #[allow(unused)]
-    pub unsafe fn upload(&mut self, buf: &BufUp) {
+    pub unsafe fn upload(&mut self, buf: &BufStage) { // records `copy_buffer(stage_buf, dev_buf)`
        self.cmd_buf.copy_buffer(&buf.stage_buf, &buf.dev_buf);
    }

-    pub unsafe fn download(&mut self, buf: &BufDown) {
+    pub unsafe fn download(&mut self, buf: &BufStage) { // records `copy_buffer(dev_buf, stage_buf)`
        self.cmd_buf.copy_buffer(&buf.dev_buf, &buf.stage_buf);
    }
 }

-impl BufDown {
+impl BufStage {
    pub unsafe fn read(&self, dst: &mut Vec<impl Pod>) {
        self.stage_buf.read(dst).unwrap()
    }
@ -147,4 +139,8 @@ impl BufDown {
    pub unsafe fn map_read<'a>(&'a self, range: impl RangeBounds<usize>) -> BufReadGuard<'a> {
        self.stage_buf.map_read(range).unwrap()
    }
+
+    pub unsafe fn map_write<'a>(&'a mut self, range: impl RangeBounds<usize>) -> BufWriteGuard {
+        self.stage_buf.map_write(range).unwrap() // maps `stage_buf`; panics if mapping fails
+    }
 }