diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index f12a1cf..3ca0742 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -57,16 +57,27 @@ fn main() -> Result<(), Error> {
         let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
 
         let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
+        let mut mode = 0usize;
 
         event_loop.run(move |event, _, control_flow| {
             *control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event
 
             match event {
                 Event::WindowEvent { event, window_id } if window_id == window.id() => {
+                    use winit::event::{ElementState, VirtualKeyCode};
                     match event {
                         WindowEvent::CloseRequested => {
                             *control_flow = ControlFlow::Exit;
                         }
+                        WindowEvent::KeyboardInput { input, .. } => {
+                            if input.state == ElementState::Pressed {
+                                match input.virtual_keycode {
+                                    Some(VirtualKeyCode::Left) => mode = mode.wrapping_sub(1),
+                                    Some(VirtualKeyCode::Right) => mode = mode.wrapping_add(1),
+                                    _ => {}
+                                }
+                            }
+                        }
                         _ => (),
                     }
                 }
@@ -105,7 +116,41 @@ fn main() -> Result<(), Error> {
                         }
                         test_scenes::render_svg(&mut ctx, input, scale);
                     } else {
-                        test_scenes::render_anim_frame(&mut ctx, current_frame);
+                        use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
+                        let blends = [
+                            Blend::new(Normal, SrcOver),
+                            Blend::new(Multiply, SrcOver),
+                            Blend::new(Screen, SrcOver),
+                            Blend::new(Overlay, SrcOver),
+                            Blend::new(Darken, SrcOver),
+                            Blend::new(Lighten, SrcOver),
+                            Blend::new(ColorDodge, SrcOver),
+                            Blend::new(ColorBurn, SrcOver),
+                            Blend::new(HardLight, SrcOver),
+                            Blend::new(SoftLight, SrcOver),
+                            Blend::new(Difference, SrcOver),
+                            Blend::new(Exclusion, SrcOver),
+                            Blend::new(Hue, SrcOver),
+                            Blend::new(Saturation, SrcOver),
+                            Blend::new(Color, SrcOver),
+                            Blend::new(Luminosity, SrcOver),
+                            Blend::new(Normal, Clear),
+                            Blend::new(Normal, Copy),
+                            Blend::new(Normal, Dest),
+                            Blend::new(Normal, SrcOver),
+                            Blend::new(Normal, DestOver),
+                            Blend::new(Normal, SrcIn),
+                            Blend::new(Normal, DestIn),
+                            Blend::new(Normal, SrcOut),
+                            Blend::new(Normal, DestOut),
+                            Blend::new(Normal, SrcAtop),
+                            Blend::new(Normal, DestAtop),
+                            Blend::new(Normal, Xor),
+                            Blend::new(Normal, Plus),
+                        ];
+                        let blend = blends[mode % blends.len()];
+                        test_scenes::render_blend_test(&mut ctx, current_frame, blend);
+                        info_string = format!("{:?}", blend);
                     }
                     render_info_string(&mut ctx, &info_string);
                     if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h
index b833574..5a35088 100644
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@@ -69,9 +69,10 @@ AnnoLinGradientRef AnnoLinGradient_index(AnnoLinGradientRef ref, uint index) {
 struct AnnoBeginClip {
     vec4 bbox;
     float linewidth;
+    uint blend; // blend word from the scene Clip; draw_leaf.comp compares it with BlendComp_default to derive the tag flags
 };
 
-#define AnnoBeginClip_size 20
+#define AnnoBeginClip_size 24
 
 AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {
     return AnnoBeginClipRef(ref.offset + index * AnnoBeginClip_size);
@@ -79,9 +80,10 @@ AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {
 
 struct AnnoEndClip {
     vec4 bbox;
+    uint blend; // blend word from the scene Clip; coarse.comp copies it into CmdEndClip
 };
 
-#define AnnoEndClip_size 16
+#define AnnoEndClip_size 20
 
 AnnoEndClipRef AnnoEndClip_index(AnnoEndClipRef ref, uint index) {
     return AnnoEndClipRef(ref.offset + index * AnnoEndClip_size);
@@ -198,9 +200,11 @@ AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) {
     uint raw2 = read_mem(a, ix + 2);
     uint raw3 = read_mem(a, ix + 3);
     uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
     AnnoBeginClip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.linewidth = uintBitsToFloat(raw4);
+    s.blend = raw5;
     return s;
 }
 
@@ -211,6 +215,7 @@ void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s) {
     write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
     write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
     write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
+    write_mem(a, ix + 5, s.blend);
 }
 
 AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
@@ -219,8 +224,10 @@ AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
     uint raw1 = read_mem(a, ix + 1);
     uint raw2 = read_mem(a, ix + 2);
     uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
     AnnoEndClip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.blend = raw4;
     return s;
 }
 
@@ -230,6 +237,7 @@ void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s) {
     write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
     write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
     write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
+    write_mem(a, ix + 4, s.blend);
 }
 
 AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) {
@@ -281,8 +289,8 @@ void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginC
     AnnoBeginClip_write(a, AnnoBeginClipRef(ref.offset + 4), s);
 }
 
-void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s) {
-    write_mem(a, ref.offset >> 2, Annotated_EndClip);
+void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoEndClip s) {
+    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_EndClip); // tag word: flags in the high 16 bits, tag in the low 16
     AnnoEndClip_write(a, AnnoEndClipRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/blend.h b/piet-gpu/shader/blend.h
new file mode 100644
index 0000000..1ac4bd6
--- /dev/null
+++ b/piet-gpu/shader/blend.h
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Mode definitions and functions for blending and composition.
+
+#define Blend_Normal 0
+#define Blend_Multiply 1
+#define Blend_Screen 2
+#define Blend_Overlay 3
+#define Blend_Darken 4
+#define Blend_Lighten 5
+#define Blend_ColorDodge 6
+#define Blend_ColorBurn 7
+#define Blend_HardLight 8
+#define Blend_SoftLight 9
+#define Blend_Difference 10
+#define Blend_Exclusion 11
+#define Blend_Hue 12
+#define Blend_Saturation 13
+#define Blend_Color 14
+#define Blend_Luminosity 15
+
+vec3 screen(vec3 cb, vec3 cs) { // Screen blend, per channel: cb + cs - cb * cs.
+    return (cs + cb) - cs * cb;
+}
+
+// Per-channel color dodge: a zero backdrop stays zero, a full source saturates,
+// otherwise the backdrop is divided by the inverted source and clamped to 1.
+float color_dodge(float cb, float cs) {
+    if (cb == 0.0) {
+        return 0.0;
+    }
+    return (cs == 1.0) ? 1.0 : min(1.0, cb / (1.0 - cs));
+}
+
+// Per-channel color burn: a full backdrop stays full, a zero source gives zero,
+// otherwise the inverted backdrop is divided by the source, clamped and inverted.
+float color_burn(float cb, float cs) {
+    if (cb == 1.0) {
+        return 1.0;
+    }
+    return (cs == 0.0) ? 0.0 : 1.0 - min(1.0, (1.0 - cb) / cs);
+}
+
+// Hard light: cb * 2 * cs where cs <= 0.5, screen(cb, 2 * cs - 1) elsewhere.
+vec3 hard_light(vec3 cb, vec3 cs) {
+    vec3 use_multiply = vec3(lessThanEqual(cs, vec3(0.5)));
+    vec3 screened = screen(cb, 2.0 * cs - 1.0);
+    vec3 multiplied = cb * 2.0 * cs;
+    return mix(screened, multiplied, use_multiply);
+}
+
+// Soft light of backdrop cb and source cs, per channel.
+// d is a cubic in cb where cb <= 0.25 and sqrt(cb) elsewhere.
+vec3 soft_light(vec3 cb, vec3 cs) {
+    vec3 one = vec3(1.0);
+    vec3 cubic = ((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb;
+    vec3 d = mix(sqrt(cb), cubic, vec3(lessThanEqual(cb, vec3(0.25))));
+    // "above" is used where cs > 0.5, "below" where cs <= 0.5.
+    vec3 above = cb + (2.0 * cs - one) * (d - cb);
+    vec3 below = cb - (one - 2.0 * cs) * cb * (one - cb);
+    vec3 pick_below = vec3(lessThanEqual(cs, vec3(0.5)));
+    return mix(above, below, pick_below);
+}
+
+float sat(vec3 c) { // Saturation of c: largest channel minus smallest channel.
+    return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z));
+}
+
+float lum(vec3 c) { // Weighted sum of the three channels of c.
+    const vec3 weights = vec3(0.3, 0.59, 0.11);
+    return dot(c, weights);
+}
+
+// Rescales c around lum(c) when a channel is below 0 or above 1.
+vec3 clip_color(vec3 c) {
+    float l = lum(c);
+    float lo = min(c.r, min(c.g, c.b));
+    float hi = max(c.r, max(c.g, c.b));
+    // Both bounds come from the input c; hi is not recomputed after the first rescale.
+    if (lo < 0.0) c = l + (((c - l) * l) / (l - lo));
+    if (hi > 1.0) c = l + (((c - l) * (1.0 - l)) / (hi - l));
+    return c;
+}
+
+vec3 set_lum(vec3 c, float l) { // Offsets c by (l - lum(c)), then applies clip_color.
+    return clip_color((l - lum(c)) + c);
+}
+
+// Rescales an ordered (cmin, cmid, cmax) triple in place: cmax becomes s, cmid
+// keeps its relative position between the two ends, and cmin becomes 0.
+// When cmax is not greater than cmin, all three become 0.
+void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) {
+    bool has_spread = cmax > cmin;
+    float spread = cmax - cmin;
+    cmid = has_spread ? ((cmid - cmin) * s) / spread : 0.0;
+    cmax = has_spread ? s : 0.0;
+    cmin = 0.0;
+}
+
+vec3 set_sat(vec3 c, float s) { // Rescales c via set_sat_inner using target saturation s.
+    if (c.r <= c.g) {
+        if (c.g <= c.b) {
+            // r <= g <= b
+            set_sat_inner(c.r, c.g, c.b, s);
+        } else if (c.r <= c.b) {
+            // r <= b < g
+            set_sat_inner(c.r, c.b, c.g, s);
+        } else {
+            // b < r <= g
+            set_sat_inner(c.b, c.r, c.g, s);
+        }
+    } else if (c.r <= c.b) {
+        // g < r <= b
+        set_sat_inner(c.g, c.r, c.b, s);
+    } else if (c.g <= c.b) {
+        // g <= b < r
+        set_sat_inner(c.g, c.b, c.r, s);
+    } else {
+        // b < g < r
+        set_sat_inner(c.b, c.g, c.r, s);
+    }
+    return c;
+}
+
+vec3 mix_blend(vec3 cb, vec3 cs, uint mode) { // Blends backdrop cb with source cs.
+    vec3 b = vec3(0.0);
+    switch (mode) { // mode: one of the Blend_* constants
+    case Blend_Multiply:
+        b = cb * cs;
+        break;
+    case Blend_Screen:
+        b = screen(cb, cs);
+        break;
+    case Blend_Overlay:
+        b = hard_light(cs, cb); // overlay is hard light with the operands swapped
+        break;
+    case Blend_Darken:
+        b = min(cb, cs);
+        break;
+    case Blend_Lighten:
+        b = max(cb, cs);
+        break;
+    case Blend_ColorDodge:
+        b = vec3(color_dodge(cb.x, cs.x), color_dodge(cb.y, cs.y), color_dodge(cb.z, cs.z));
+        break;
+    case Blend_ColorBurn:
+        b = vec3(color_burn(cb.x, cs.x), color_burn(cb.y, cs.y), color_burn(cb.z, cs.z));
+        break;
+    case Blend_HardLight:
+        b = hard_light(cb, cs);
+        break;
+    case Blend_SoftLight:
+        b = soft_light(cb, cs);
+        break;
+    case Blend_Difference:
+        b = abs(cb - cs);
+        break;
+    case Blend_Exclusion:
+        b = cb + cs - 2.0 * cb * cs; // float literal: no reliance on implicit int-to-float conversion
+        break;
+    case Blend_Hue:
+        b = set_lum(set_sat(cs, sat(cb)), lum(cb));
+        break;
+    case Blend_Saturation:
+        b = set_lum(set_sat(cb, sat(cs)), lum(cb));
+        break;
+    case Blend_Color:
+        b = set_lum(cs, lum(cb));
+        break;
+    case Blend_Luminosity:
+        b = set_lum(cb, lum(cs));
+        break;
+    default: // Blend_Normal and any unlisted mode: the source colour is returned unchanged
+        b = cs;
+        break;
+    }
+    return b;
+}
+
+#define Comp_Clear 0
+#define Comp_Copy 1
+#define Comp_Dest 2
+#define Comp_SrcOver 3
+#define Comp_DestOver 4
+#define Comp_SrcIn 5
+#define Comp_DestIn 6
+#define Comp_SrcOut 7
+#define Comp_DestOut 8
+#define Comp_SrcAtop 9
+#define Comp_DestAtop 10
+#define Comp_Xor 11
+#define Comp_Plus 12
+#define Comp_PlusDarker 13
+#define Comp_PlusLighter 14
+
+// Composes source (cs, as) with backdrop (cb, ab) according to a Comp_* mode.
+// Colours are multiplied by their alphas here, so the returned rgba is premultiplied.
+vec4 mix_compose(vec3 cb, vec3 cs, float ab, float as, uint mode) {
+    float fa = 0.0, fb = 0.0; // source / backdrop factors; both stay 0 for Comp_Clear and unlisted modes
+    switch (mode) {
+    case Comp_Copy:
+        fa = 1.0;
+        fb = 0.0;
+        break;
+    case Comp_Dest:
+        fa = 0.0;
+        fb = 1.0;
+        break;
+    case Comp_SrcOver:
+        fa = 1.0;
+        fb = 1.0 - as;
+        break;
+    case Comp_DestOver:
+        fa = 1.0 - ab;
+        fb = 1.0;
+        break;
+    case Comp_SrcIn:
+        fa = ab;
+        fb = 0.0;
+        break;
+    case Comp_DestIn:
+        fa = 0.0;
+        fb = as;
+        break;
+    case Comp_SrcOut:
+        fa = 1.0 - ab;
+        fb = 0.0;
+        break;
+    case Comp_DestOut:
+        fa = 0.0;
+        fb = 1.0 - as;
+        break;
+    case Comp_SrcAtop:
+        fa = ab;
+        fb = 1.0 - as;
+        break;
+    case Comp_DestAtop:
+        fa = 1.0 - ab;
+        fb = as;
+        break;
+    case Comp_Xor:
+        fa = 1.0 - ab;
+        fb = 1.0 - as;
+        break;
+    case Comp_Plus:
+        fa = 1.0;
+        fb = 1.0;
+        break;
+    case Comp_PlusDarker: // NOTE(review): evaluates to 2 - as*cs - ab*cb before clamping at 0; confirm intended formula
+        return vec4(max(vec4(0.0), 1.0 - as * vec4(cs, as) + 1.0 - ab * vec4(cb, ab)).xyz,
+                    max(0.0, 1.0 - as + 1.0 - ab));
+    case Comp_PlusLighter:
+        return vec4(min(vec4(1.0), as * vec4(cs, as) + ab * vec4(cb, ab)).xyz,
+                    min(1.0, as + ab));
+    }
+    // Alpha is as*fa + ab*fb; scaling vec4(cs, as) by as*fa would square the alphas.
+    return vec4(as * fa * cs + ab * fb * cb, as * fa + ab * fb);
+}
+
+#define BlendComp_default (Blend_Normal << 8 | Comp_SrcOver)
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 8b9998f..ac4f3d7 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -53,7 +53,7 @@ build gen/coarse.hlsl: hlsl gen/coarse.spv
 build gen/coarse.dxil: dxil gen/coarse.hlsl
 build gen/coarse.msl: msl gen/coarse.spv
 
-build gen/kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
+build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h
 build gen/kernel4.hlsl: hlsl gen/kernel4.spv
 build gen/kernel4.dxil: dxil gen/kernel4.hlsl
 build gen/kernel4.msl: msl gen/kernel4.spv
@@ -114,7 +114,7 @@ build gen/draw_root.hlsl: hlsl gen/draw_root.spv
 build gen/draw_root.dxil: dxil gen/draw_root.hlsl
 build gen/draw_root.msl: msl gen/draw_root.spv
 
-build gen/draw_leaf.spv: glsl draw_leaf.comp | scene.h drawtag.h annotated.h setup.h mem.h
+build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h drawtag.h annotated.h setup.h mem.h
 build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 98ab270..df306e0 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -273,7 +273,8 @@ void main() {
                 }
             }
             AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
-            uint tag = Annotated_tag(conf.anno_alloc, ref).tag;
+            AnnotatedTag anno_tag = Annotated_tag(conf.anno_alloc, ref);
+            uint tag = anno_tag.tag;
             uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + seq_ix % width;
@@ -287,7 +288,10 @@ void main() {
                 // For draws, include the tile if it is solid.
                 // For clips, include the tile if it is empty - this way, logic
                 // below will suppress the drawing of inner elements.
-                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip;
+                // For blends, include the tile if
+                // (blend_mode, composition_mode) != (Normal, SrcOver)
+                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
+                    || (is_clip && (anno_tag.flags & 0x2) != 0);
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
@@ -387,13 +391,14 @@ void main() {
                     tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
                                      TileRef(sh_tile_base[element_ref_ix] +
                                              (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    AnnoEndClip end_clip = Annotated_EndClip_read(conf.anno_alloc, ref);
                     clip_depth--;
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
                     write_fill(cmd_alloc, cmd_ref, MODE_NONZERO, tile, 0.0);
-                    Cmd_EndClip_write(cmd_alloc, cmd_ref);
-                    cmd_ref.offset += 4;
+                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(end_clip.blend));
+                    cmd_ref.offset += 4 + CmdEndClip_size;
                     break;
                 }
             } else {
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index f236b7f..74fc2f8 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -28,6 +28,7 @@ layout(binding = 2) readonly buffer SceneBuf {
 #include "tile.h"
 #include "drawtag.h"
 #include "annotated.h"
+#include "blend.h"
 
 #define Monoid DrawMonoid
 
@@ -149,17 +150,23 @@ void main() {
                 Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
                 break;
             case Element_BeginClip:
+                Clip begin_clip = Element_BeginClip_read(this_ref);
                 AnnoBeginClip anno_begin_clip;
                 anno_begin_clip.bbox = bbox;
                 anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
-                Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip);
+                anno_begin_clip.blend = begin_clip.blend;
+                uint flags = uint(begin_clip.blend != BlendComp_default) << 1;
+                Annotated_BeginClip_write(conf.anno_alloc, out_ref, flags, anno_begin_clip);
                 break;
             }
         } else if (tag_word == Element_EndClip) {
+            Clip end_clip = Element_BeginClip_read(this_ref);
             AnnoEndClip anno_end_clip;
             // The actual bbox will be reconstructed from clip stream output.
             anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9);
-            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
+            anno_end_clip.blend = end_clip.blend;
+            uint flags = uint(end_clip.blend != BlendComp_default) << 1;
+            Annotated_EndClip_write(conf.anno_alloc, out_ref, flags, anno_end_clip);
         }
         // Generate clip stream.
         if (tag_word == Element_BeginClip || tag_word == Element_EndClip) {
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index 0599eb8..c7e1682 100644
Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index b5f3949..f00eabe 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -49,6 +49,17 @@ struct AnnoLinGradient
     float line_c;
 };
 
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+    uint blend;
+};
+
 struct AnnotatedRef
 {
     uint offset;
@@ -153,6 +164,16 @@ struct CmdImage
     int2 offset;
 };
 
+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -197,8 +218,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _283 : register(u0, space0);
-ByteAddressBuffer _1169 : register(t1, space0);
+RWByteAddressBuffer _308 : register(u0, space0);
+ByteAddressBuffer _1283 : register(t1, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -221,8 +242,8 @@ groupshared uint sh_tile_count[256];
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _360 = { a.offset + offset };
-    return _360;
+    Alloc _385 = { a.offset + offset };
+    return _385;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -238,7 +259,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _283.Load(offset * 4 + 8);
+    uint v = _308.Load(offset * 4 + 8);
     return v;
 }
 
@@ -251,8 +272,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok)
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _674 = { ref.offset + (index * 4u) };
-    return _674;
+    BinInstanceRef _765 = { ref.offset + (index * 4u) };
+    return _765;
 }
 
 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@@ -271,8 +292,8 @@ AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _636 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _636;
+    AnnotatedTag _717 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _717;
 }
 
 Path Path_read(Alloc a, PathRef ref)
@@ -289,8 +310,8 @@ Path Path_read(Alloc a, PathRef ref)
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _734 = { raw2 };
-    s.tiles = _734;
+    TileRef _825 = { raw2 };
+    s.tiles = _825;
     return s;
 }
 
@@ -300,11 +321,11 @@ void write_tile_alloc(uint el_ix, Alloc a)
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _1055;
-    _283.GetDimensions(_1055);
-    _1055 = (_1055 - 8) / 4;
+    uint _1169;
+    _308.GetDimensions(_1169);
+    _1169 = (_1169 - 8) / 4;
     uint param = 0u;
-    uint param_1 = uint(int(_1055) * 4);
+    uint param_1 = uint(int(_1169) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
@@ -318,9 +339,9 @@ Tile Tile_read(Alloc a, TileRef ref)
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
     uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _759 = { raw0 };
+    TileSegRef _850 = { raw0 };
     Tile s;
-    s.tile = _759;
+    s.tile = _850;
     s.backdrop = int(raw1);
     return s;
 }
@@ -355,30 +376,30 @@ AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref)
 
 AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref)
 {
-    AnnoColorRef _642 = { ref.offset + 4u };
+    AnnoColorRef _723 = { ref.offset + 4u };
     Alloc param = a;
-    AnnoColorRef param_1 = _642;
+    AnnoColorRef param_1 = _723;
     return AnnoColor_read(param, param_1);
 }
 
 MallocResult malloc(uint size)
 {
-    uint _289;
-    _283.InterlockedAdd(0, size, _289);
-    uint offset = _289;
-    uint _296;
-    _283.GetDimensions(_296);
-    _296 = (_296 - 8) / 4;
+    uint _314;
+    _308.InterlockedAdd(0, size, _314);
+    uint offset = _314;
+    uint _321;
+    _308.GetDimensions(_321);
+    _321 = (_321 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_296) * 4);
+    r.failed = (offset + size) > uint(int(_321) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _318;
-        _283.InterlockedMax(4, 1u, _318);
+        uint _343;
+        _308.InterlockedMax(4, 1u, _343);
         return r;
     }
     return r;
@@ -392,7 +413,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _283.Store(offset * 4 + 8, val);
+    _308.Store(offset * 4 + 8, val);
 }
 
 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@@ -410,9 +431,9 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
     write_mem(param, param_1, param_2);
-    CmdJumpRef _1048 = { ref.offset + 4u };
+    CmdJumpRef _1162 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdJumpRef param_4 = _1048;
+    CmdJumpRef param_4 = _1162;
     CmdJump param_5 = s;
     CmdJump_write(param_3, param_4, param_5);
 }
@@ -424,21 +445,21 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
         return true;
     }
     uint param = 1024u;
-    MallocResult _1076 = malloc(param);
-    MallocResult new_cmd = _1076;
+    MallocResult _1190 = malloc(param);
+    MallocResult new_cmd = _1190;
     if (new_cmd.failed)
     {
         return false;
     }
-    CmdJump _1086 = { new_cmd.alloc.offset };
-    CmdJump jump = _1086;
+    CmdJump _1200 = { new_cmd.alloc.offset };
+    CmdJump jump = _1200;
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
     Cmd_Jump_write(param_1, param_2, param_3);
     cmd_alloc = new_cmd.alloc;
-    CmdRef _1098 = { cmd_alloc.offset };
-    cmd_ref = _1098;
+    CmdRef _1212 = { cmd_alloc.offset };
+    cmd_ref = _1212;
     cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
     return true;
 }
@@ -467,9 +488,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
     write_mem(param, param_1, param_2);
-    CmdFillRef _932 = { ref.offset + 4u };
+    CmdFillRef _1036 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdFillRef param_4 = _932;
+    CmdFillRef param_4 = _1036;
     CmdFill param_5 = s;
     CmdFill_write(param_3, param_4, param_5);
 }
@@ -501,9 +522,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
     write_mem(param, param_1, param_2);
-    CmdStrokeRef _950 = { ref.offset + 4u };
+    CmdStrokeRef _1054 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdStrokeRef param_4 = _950;
+    CmdStrokeRef param_4 = _1054;
     CmdStroke param_5 = s;
     CmdStroke_write(param_3, param_4, param_5);
 }
@@ -515,8 +536,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float
     {
         if (tile.tile.offset != 0u)
         {
-            CmdFill _1122 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _1122;
+            CmdFill _1236 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _1236;
             Alloc param_1 = alloc;
             CmdRef param_2 = cmd_ref;
             CmdFill param_3 = cmd_fill;
@@ -533,8 +554,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float
     }
     else
     {
-        CmdStroke _1152 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _1152;
+        CmdStroke _1266 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _1266;
         Alloc param_6 = alloc;
         CmdRef param_7 = cmd_ref;
         CmdStroke param_8 = cmd_stroke;
@@ -558,9 +579,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
     write_mem(param, param_1, param_2);
-    CmdColorRef _976 = { ref.offset + 4u };
+    CmdColorRef _1080 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdColorRef param_4 = _976;
+    CmdColorRef param_4 = _1080;
     CmdColor param_5 = s;
     CmdColor_write(param_3, param_4, param_5);
 }
@@ -607,9 +628,9 @@ AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref)
 
 AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref)
 {
-    AnnoLinGradientRef _652 = { ref.offset + 4u };
+    AnnoLinGradientRef _733 = { ref.offset + 4u };
     Alloc param = a;
-    AnnoLinGradientRef param_1 = _652;
+    AnnoLinGradientRef param_1 = _733;
     return AnnoLinGradient_read(param, param_1);
 }
 
@@ -640,9 +661,9 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
     write_mem(param, param_1, param_2);
-    CmdLinGradRef _994 = { ref.offset + 4u };
+    CmdLinGradRef _1098 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdLinGradRef param_4 = _994;
+    CmdLinGradRef param_4 = _1098;
     CmdLinGrad param_5 = s;
     CmdLinGrad_write(param_3, param_4, param_5);
 }
@@ -681,9 +702,9 @@ AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref)
 
 AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref)
 {
-    AnnoImageRef _662 = { ref.offset + 4u };
+    AnnoImageRef _743 = { ref.offset + 4u };
     Alloc param = a;
-    AnnoImageRef param_1 = _662;
+    AnnoImageRef param_1 = _743;
     return AnnoImage_read(param, param_1);
 }
 
@@ -706,9 +727,9 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
     write_mem(param, param_1, param_2);
-    CmdImageRef _1012 = { ref.offset + 4u };
+    CmdImageRef _1116 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdImageRef param_4 = _1012;
+    CmdImageRef param_4 = _1116;
     CmdImage param_5 = s;
     CmdImage_write(param_3, param_4, param_5);
 }
@@ -721,12 +742,58 @@ void Cmd_BeginClip_write(Alloc a, CmdRef ref)
     write_mem(param, param_1, param_2);
 }
 
-void Cmd_EndClip_write(Alloc a, CmdRef ref)
+AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    AnnoEndClip s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.blend = raw4;
+    return s;
+}
+
+AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
+{
+    AnnoEndClipRef _753 = { ref.offset + 4u };
+    Alloc param = a;
+    AnnoEndClipRef param_1 = _753;
+    return AnnoEndClip_read(param, param_1);
+}
+
+void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.blend;
+    write_mem(param, param_1, param_2);
+}
+
+void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
     write_mem(param, param_1, param_2);
+    CmdEndClipRef _1143 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdEndClipRef param_4 = _1143;
+    CmdEndClip param_5 = s;
+    CmdEndClip_write(param_3, param_4, param_5);
 }
 
 void Cmd_End_write(Alloc a, CmdRef ref)
@@ -739,25 +806,25 @@ void Cmd_End_write(Alloc a, CmdRef ref)
 
 void comp_main()
 {
-    uint width_in_bins = ((_1169.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_1283.Load(8) + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1169.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1283.Load(0) + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1169.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _1234;
-    _1234.offset = _1169.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1283.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _1348;
+    _1348.offset = _1283.Load(24);
     Alloc param;
-    param.offset = _1234.offset;
+    param.offset = _1348.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1243 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1243;
+    CmdRef _1357 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1357;
     uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
@@ -765,17 +832,17 @@ void comp_main()
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    bool mem_ok = _283.Load(4) == 0u;
+    bool mem_ok = _308.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1448;
+    uint _1562;
     uint element_ix;
     AnnotatedRef ref;
     Alloc param_14;
     Alloc param_16;
     uint tile_count;
     Alloc param_23;
-    uint _1770;
+    uint _1887;
     Alloc param_29;
     Tile tile_1;
     AnnoColor fill;
@@ -783,40 +850,41 @@ void comp_main()
     Alloc param_52;
     CmdLinGrad cmd_lin;
     Alloc param_69;
+    Alloc param_95;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1500;
+        bool _1614;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1298 = th_ix < 256u;
-                bool _1306;
-                if (_1298)
+                bool _1412 = th_ix < 256u;
+                bool _1420;
+                if (_1412)
                 {
-                    _1306 = (partition_ix + th_ix) < n_partitions;
+                    _1420 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1306 = _1298;
+                    _1420 = _1412;
                 }
-                if (_1306)
+                if (_1420)
                 {
-                    uint in_ix = (_1169.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1323;
-                    _1323.offset = _1169.Load(20);
-                    param_3.offset = _1323.offset;
+                    uint in_ix = (_1283.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1437;
+                    _1437.offset = _1283.Load(20);
+                    param_3.offset = _1437.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1334;
-                    _1334.offset = _1169.Load(20);
-                    param_5.offset = _1334.offset;
+                    Alloc _1448;
+                    _1448.offset = _1283.Load(20);
+                    param_5.offset = _1448.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -862,16 +930,16 @@ void comp_main()
                 }
                 if (part_ix > 0u)
                 {
-                    _1448 = sh_part_count[part_ix - 1u];
+                    _1562 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1448 = part_start_ix;
+                    _1562 = part_start_ix;
                 }
-                ix -= _1448;
+                ix -= _1562;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1467 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1467;
+                BinInstanceRef _1581 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1581;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -881,16 +949,16 @@ void comp_main()
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1490 = (wr_ix - rd_ix) < 256u;
-            if (_1490)
+            bool _1604 = (wr_ix - rd_ix) < 256u;
+            if (_1604)
             {
-                _1500 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1614 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1500 = _1490;
+                _1614 = _1604;
             }
-            if (_1500)
+            if (_1614)
             {
                 continue;
             }
@@ -903,11 +971,11 @@ void comp_main()
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            AnnotatedRef _1521 = { _1169.Load(32) + (element_ix * 40u) };
-            ref = _1521;
-            Alloc _1524;
-            _1524.offset = _1169.Load(32);
-            param_14.offset = _1524.offset;
+            AnnotatedRef _1635 = { _1283.Load(32) + (element_ix * 40u) };
+            ref = _1635;
+            Alloc _1638;
+            _1638.offset = _1283.Load(32);
+            param_14.offset = _1638.offset;
             AnnotatedRef param_15 = ref;
             tag = Annotated_tag(param_14, param_15).tag;
         }
@@ -919,13 +987,13 @@ void comp_main()
             case 4u:
             case 5u:
             {
-                uint drawmonoid_base = (_1169.Load(44) >> uint(2)) + (2u * element_ix);
-                uint path_ix = _283.Load(drawmonoid_base * 4 + 8);
-                PathRef _1553 = { _1169.Load(16) + (path_ix * 12u) };
-                Alloc _1556;
-                _1556.offset = _1169.Load(16);
-                param_16.offset = _1556.offset;
-                PathRef param_17 = _1553;
+                uint drawmonoid_base = (_1283.Load(44) >> uint(2)) + (2u * element_ix);
+                uint path_ix = _308.Load(drawmonoid_base * 4 + 8);
+                PathRef _1667 = { _1283.Load(16) + (path_ix * 12u) };
+                Alloc _1670;
+                _1670.offset = _1283.Load(16);
+                param_16.offset = _1670.offset;
+                PathRef param_17 = _1667;
                 Path path = Path_read(param_16, param_17);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
@@ -980,22 +1048,23 @@ void comp_main()
                     el_ix = probe_1;
                 }
             }
-            AnnotatedRef _1755 = { _1169.Load(32) + (sh_elements[el_ix] * 40u) };
-            AnnotatedRef ref_1 = _1755;
-            Alloc _1759;
-            _1759.offset = _1169.Load(32);
-            param_23.offset = _1759.offset;
+            AnnotatedRef _1869 = { _1283.Load(32) + (sh_elements[el_ix] * 40u) };
+            AnnotatedRef ref_1 = _1869;
+            Alloc _1874;
+            _1874.offset = _1283.Load(32);
+            param_23.offset = _1874.offset;
             AnnotatedRef param_24 = ref_1;
-            uint tag_1 = Annotated_tag(param_23, param_24).tag;
+            AnnotatedTag anno_tag = Annotated_tag(param_23, param_24);
+            uint tag_1 = anno_tag.tag;
             if (el_ix > 0u)
             {
-                _1770 = sh_tile_count[el_ix - 1u];
+                _1887 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1770 = 0u;
+                _1887 = 0u;
             }
-            uint seq_ix = ix_1 - _1770;
+            uint seq_ix = ix_1 - _1887;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -1004,29 +1073,47 @@ void comp_main()
             {
                 uint param_25 = el_ix;
                 bool param_26 = mem_ok;
-                TileRef _1822 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1939 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1822;
+                TileRef param_28 = _1939;
                 Tile tile = Tile_read(param_27, param_28);
                 bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
-                bool _1834 = tile.tile.offset != 0u;
-                bool _1843;
-                if (!_1834)
+                bool _1951 = tile.tile.offset != 0u;
+                bool _1960;
+                if (!_1951)
                 {
-                    _1843 = (tile.backdrop == 0) == is_clip;
+                    _1960 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1843 = _1834;
+                    _1960 = _1951;
                 }
-                include_tile = _1843;
+                bool _1972;
+                if (!_1960)
+                {
+                    bool _1971;
+                    if (is_clip)
+                    {
+                        _1971 = (anno_tag.flags & 2u) != 0u;
+                    }
+                    else
+                    {
+                        _1971 = is_clip;
+                    }
+                    _1972 = _1971;
+                }
+                else
+                {
+                    _1972 = _1960;
+                }
+                include_tile = _1972;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1863;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1863);
+                uint _1992;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1992);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -1050,11 +1137,11 @@ void comp_main()
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_1 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            AnnotatedRef _1917 = { _1169.Load(32) + (element_ix_1 * 40u) };
-            ref = _1917;
-            Alloc _1922;
-            _1922.offset = _1169.Load(32);
-            param_29.offset = _1922.offset;
+            AnnotatedRef _2046 = { _1283.Load(32) + (element_ix_1 * 40u) };
+            ref = _2046;
+            Alloc _2050;
+            _2050.offset = _1283.Load(32);
+            param_29.offset = _2050.offset;
             AnnotatedRef param_30 = ref;
             AnnotatedTag tag_2 = Annotated_tag(param_29, param_30);
             if (clip_zero_depth == 0u)
@@ -1065,23 +1152,23 @@ void comp_main()
                     {
                         uint param_31 = element_ref_ix;
                         bool param_32 = mem_ok;
-                        TileRef _1958 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2086 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                         Alloc param_33 = read_tile_alloc(param_31, param_32);
-                        TileRef param_34 = _1958;
+                        TileRef param_34 = _2086;
                         tile_1 = Tile_read(param_33, param_34);
-                        Alloc _1965;
-                        _1965.offset = _1169.Load(32);
-                        param_35.offset = _1965.offset;
+                        Alloc _2093;
+                        _2093.offset = _1283.Load(32);
+                        param_35.offset = _2093.offset;
                         AnnotatedRef param_36 = ref;
                         fill = Annotated_Color_read(param_35, param_36);
                         Alloc param_37 = cmd_alloc;
                         CmdRef param_38 = cmd_ref;
                         uint param_39 = cmd_limit;
-                        bool _1977 = alloc_cmd(param_37, param_38, param_39);
+                        bool _2105 = alloc_cmd(param_37, param_38, param_39);
                         cmd_alloc = param_37;
                         cmd_ref = param_38;
                         cmd_limit = param_39;
-                        if (!_1977)
+                        if (!_2105)
                         {
                             break;
                         }
@@ -1092,10 +1179,10 @@ void comp_main()
                         float param_44 = fill.linewidth;
                         write_fill(param_40, param_41, param_42, param_43, param_44);
                         cmd_ref = param_41;
-                        CmdColor _2001 = { fill.rgba_color };
+                        CmdColor _2129 = { fill.rgba_color };
                         Alloc param_45 = cmd_alloc;
                         CmdRef param_46 = cmd_ref;
-                        CmdColor param_47 = _2001;
+                        CmdColor param_47 = _2129;
                         Cmd_Color_write(param_45, param_46, param_47);
                         cmd_ref.offset += 8u;
                         break;
@@ -1104,23 +1191,23 @@ void comp_main()
                     {
                         uint param_48 = element_ref_ix;
                         bool param_49 = mem_ok;
-                        TileRef _2030 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2158 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                         Alloc param_50 = read_tile_alloc(param_48, param_49);
-                        TileRef param_51 = _2030;
+                        TileRef param_51 = _2158;
                         tile_1 = Tile_read(param_50, param_51);
-                        Alloc _2037;
-                        _2037.offset = _1169.Load(32);
-                        param_52.offset = _2037.offset;
+                        Alloc _2165;
+                        _2165.offset = _1283.Load(32);
+                        param_52.offset = _2165.offset;
                         AnnotatedRef param_53 = ref;
                         AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53);
                         Alloc param_54 = cmd_alloc;
                         CmdRef param_55 = cmd_ref;
                         uint param_56 = cmd_limit;
-                        bool _2049 = alloc_cmd(param_54, param_55, param_56);
+                        bool _2177 = alloc_cmd(param_54, param_55, param_56);
                         cmd_alloc = param_54;
                         cmd_ref = param_55;
                         cmd_limit = param_56;
-                        if (!_2049)
+                        if (!_2177)
                         {
                             break;
                         }
@@ -1146,23 +1233,23 @@ void comp_main()
                     {
                         uint param_65 = element_ref_ix;
                         bool param_66 = mem_ok;
-                        TileRef _2114 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2242 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                         Alloc param_67 = read_tile_alloc(param_65, param_66);
-                        TileRef param_68 = _2114;
+                        TileRef param_68 = _2242;
                         tile_1 = Tile_read(param_67, param_68);
-                        Alloc _2121;
-                        _2121.offset = _1169.Load(32);
-                        param_69.offset = _2121.offset;
+                        Alloc _2249;
+                        _2249.offset = _1283.Load(32);
+                        param_69.offset = _2249.offset;
                         AnnotatedRef param_70 = ref;
                         AnnoImage fill_img = Annotated_Image_read(param_69, param_70);
                         Alloc param_71 = cmd_alloc;
                         CmdRef param_72 = cmd_ref;
                         uint param_73 = cmd_limit;
-                        bool _2133 = alloc_cmd(param_71, param_72, param_73);
+                        bool _2261 = alloc_cmd(param_71, param_72, param_73);
                         cmd_alloc = param_71;
                         cmd_ref = param_72;
                         cmd_limit = param_73;
-                        if (!_2133)
+                        if (!_2261)
                         {
                             break;
                         }
@@ -1173,10 +1260,10 @@ void comp_main()
                         float param_78 = fill_img.linewidth;
                         write_fill(param_74, param_75, param_76, param_77, param_78);
                         cmd_ref = param_75;
-                        CmdImage _2159 = { fill_img.index, fill_img.offset };
+                        CmdImage _2287 = { fill_img.index, fill_img.offset };
                         Alloc param_79 = cmd_alloc;
                         CmdRef param_80 = cmd_ref;
-                        CmdImage param_81 = _2159;
+                        CmdImage param_81 = _2287;
                         Cmd_Image_write(param_79, param_80, param_81);
                         cmd_ref.offset += 12u;
                         break;
@@ -1185,21 +1272,21 @@ void comp_main()
                     {
                         uint param_82 = element_ref_ix;
                         bool param_83 = mem_ok;
-                        TileRef _2188 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2316 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                         Alloc param_84 = read_tile_alloc(param_82, param_83);
-                        TileRef param_85 = _2188;
+                        TileRef param_85 = _2316;
                         tile_1 = Tile_read(param_84, param_85);
-                        bool _2194 = tile_1.tile.offset == 0u;
-                        bool _2200;
-                        if (_2194)
+                        bool _2322 = tile_1.tile.offset == 0u;
+                        bool _2328;
+                        if (_2322)
                         {
-                            _2200 = tile_1.backdrop == 0;
+                            _2328 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2200 = _2194;
+                            _2328 = _2322;
                         }
-                        if (_2200)
+                        if (_2328)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1208,11 +1295,11 @@ void comp_main()
                             Alloc param_86 = cmd_alloc;
                             CmdRef param_87 = cmd_ref;
                             uint param_88 = cmd_limit;
-                            bool _2212 = alloc_cmd(param_86, param_87, param_88);
+                            bool _2340 = alloc_cmd(param_86, param_87, param_88);
                             cmd_alloc = param_86;
                             cmd_ref = param_87;
                             cmd_limit = param_88;
-                            if (!_2212)
+                            if (!_2340)
                             {
                                 break;
                             }
@@ -1228,33 +1315,40 @@ void comp_main()
                     {
                         uint param_91 = element_ref_ix;
                         bool param_92 = mem_ok;
-                        TileRef _2249 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                        TileRef _2377 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                         Alloc param_93 = read_tile_alloc(param_91, param_92);
-                        TileRef param_94 = _2249;
+                        TileRef param_94 = _2377;
                         tile_1 = Tile_read(param_93, param_94);
+                        Alloc _2384;
+                        _2384.offset = _1283.Load(32);
+                        param_95.offset = _2384.offset;
+                        AnnotatedRef param_96 = ref;
+                        AnnoEndClip end_clip = Annotated_EndClip_read(param_95, param_96);
                         clip_depth--;
-                        Alloc param_95 = cmd_alloc;
-                        CmdRef param_96 = cmd_ref;
-                        uint param_97 = cmd_limit;
-                        bool _2261 = alloc_cmd(param_95, param_96, param_97);
-                        cmd_alloc = param_95;
-                        cmd_ref = param_96;
-                        cmd_limit = param_97;
-                        if (!_2261)
+                        Alloc param_97 = cmd_alloc;
+                        CmdRef param_98 = cmd_ref;
+                        uint param_99 = cmd_limit;
+                        bool _2398 = alloc_cmd(param_97, param_98, param_99);
+                        cmd_alloc = param_97;
+                        cmd_ref = param_98;
+                        cmd_limit = param_99;
+                        if (!_2398)
                         {
                             break;
                         }
-                        Alloc param_98 = cmd_alloc;
-                        CmdRef param_99 = cmd_ref;
-                        uint param_100 = 0u;
-                        Tile param_101 = tile_1;
-                        float param_102 = 0.0f;
-                        write_fill(param_98, param_99, param_100, param_101, param_102);
-                        cmd_ref = param_99;
-                        Alloc param_103 = cmd_alloc;
-                        CmdRef param_104 = cmd_ref;
-                        Cmd_EndClip_write(param_103, param_104);
-                        cmd_ref.offset += 4u;
+                        Alloc param_100 = cmd_alloc;
+                        CmdRef param_101 = cmd_ref;
+                        uint param_102 = 0u;
+                        Tile param_103 = tile_1;
+                        float param_104 = 0.0f;
+                        write_fill(param_100, param_101, param_102, param_103, param_104);
+                        cmd_ref = param_101;
+                        CmdEndClip _2419 = { end_clip.blend };
+                        Alloc param_105 = cmd_alloc;
+                        CmdRef param_106 = cmd_ref;
+                        CmdEndClip param_107 = _2419;
+                        Cmd_EndClip_write(param_105, param_106, param_107);
+                        cmd_ref.offset += 8u;
                         break;
                     }
                 }
@@ -1287,21 +1381,21 @@ void comp_main()
             break;
         }
     }
-    bool _2326 = (bin_tile_x + tile_x) < _1169.Load(8);
-    bool _2335;
-    if (_2326)
+    bool _2467 = (bin_tile_x + tile_x) < _1283.Load(8);
+    bool _2476;
+    if (_2467)
     {
-        _2335 = (bin_tile_y + tile_y) < _1169.Load(12);
+        _2476 = (bin_tile_y + tile_y) < _1283.Load(12);
     }
     else
     {
-        _2335 = _2326;
+        _2476 = _2467;
     }
-    if (_2335)
+    if (_2476)
     {
-        Alloc param_105 = cmd_alloc;
-        CmdRef param_106 = cmd_ref;
-        Cmd_End_write(param_105, param_106);
+        Alloc param_108 = cmd_alloc;
+        CmdRef param_109 = cmd_ref;
+        Cmd_End_write(param_108, param_109);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index e5a0f0d..1422ff1 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -65,6 +65,17 @@ struct AnnoLinGradient
     float line_c;
 };
 
+struct AnnoEndClipRef
+{
+    uint offset;
+};
+
+struct AnnoEndClip
+{
+    float4 bbox;
+    uint blend;
+};
+
 struct AnnotatedRef
 {
     uint offset;
@@ -169,6 +180,16 @@ struct CmdImage
     int2 offset;
 };
 
+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -243,7 +264,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_283, constant uint& v_283BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -251,7 +272,7 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_283.memory[offset];
+    uint v = v_308.memory[offset];
     return v;
 }
 
@@ -270,39 +291,39 @@ BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint tag_and_flags = read_mem(param, param_1, v_308, v_308BufferSize);
     return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -315,24 +336,24 @@ void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a)
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_283, constant uint& v_283BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_283BufferSize - 8) / 4) * 4);
+    uint param_1 = uint(int((v_308BufferSize - 8) / 4) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -340,27 +361,27 @@ Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory&
 }
 
 static inline __attribute__((always_inline))
-AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
+    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
     AnnoColor s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
     s.linewidth = as_type<float>(raw4);
@@ -369,34 +390,34 @@ AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref,
 }
 
 static inline __attribute__((always_inline))
-AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u };
-    return AnnoColor_read(param, param_1, v_283, v_283BufferSize);
+    return AnnoColor_read(param, param_1, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_283, constant uint& v_283BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_308, constant uint& v_308BufferSize)
 {
-    uint _289 = atomic_fetch_add_explicit((device atomic_uint*)&v_283.mem_offset, size, memory_order_relaxed);
-    uint offset = _289;
+    uint _314 = atomic_fetch_add_explicit((device atomic_uint*)&v_308.mem_offset, size, memory_order_relaxed);
+    uint offset = _314;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_283BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_308BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _318 = atomic_fetch_max_explicit((device atomic_uint*)&v_283.mem_error, 1u, memory_order_relaxed);
+        uint _343 = atomic_fetch_max_explicit((device atomic_uint*)&v_308.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_283, constant uint& v_283BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -404,42 +425,42 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_283.memory[offset] = val;
+    v_308.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_283, constant uint& v_283BufferSize)
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_308, constant uint& v_308BufferSize)
 {
     if (cmd_ref.offset < cmd_limit)
     {
         return true;
     }
     uint param = 1024u;
-    MallocResult _1076 = malloc(param, v_283, v_283BufferSize);
-    MallocResult new_cmd = _1076;
+    MallocResult _1190 = malloc(param, v_308, v_308BufferSize);
+    MallocResult new_cmd = _1190;
     if (new_cmd.failed)
     {
         return false;
@@ -448,7 +469,7 @@ bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_283, v_283BufferSize);
+    Cmd_Jump_write(param_1, param_2, param_3, v_308, v_308BufferSize);
     cmd_alloc = new_cmd.alloc;
     cmd_ref = CmdRef{ cmd_alloc.offset };
     cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
@@ -462,70 +483,70 @@ uint fill_mode_from_flags(thread const uint& flags)
 }
 
 static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_283, constant uint& v_283BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint param = flags;
     if (fill_mode_from_flags(param) == 0u)
@@ -536,14 +557,14 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const
             Alloc param_1 = alloc;
             CmdRef param_2 = cmd_ref;
             CmdFill param_3 = cmd_fill;
-            Cmd_Fill_write(param_1, param_2, param_3, v_283, v_283BufferSize);
+            Cmd_Fill_write(param_1, param_2, param_3, v_308, v_308BufferSize);
             cmd_ref.offset += 12u;
         }
         else
         {
             Alloc param_4 = alloc;
             CmdRef param_5 = cmd_ref;
-            Cmd_Solid_write(param_4, param_5, v_283, v_283BufferSize);
+            Cmd_Solid_write(param_4, param_5, v_308, v_308BufferSize);
             cmd_ref.offset += 4u;
         }
     }
@@ -553,65 +574,65 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const
         Alloc param_6 = alloc;
         CmdRef param_7 = cmd_ref;
         CmdStroke param_8 = cmd_stroke;
-        Cmd_Stroke_write(param_6, param_7, param_8, v_283, v_283BufferSize);
+        Cmd_Stroke_write(param_6, param_7, param_8, v_308, v_308BufferSize);
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
+    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize);
+    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
     Alloc param_14 = a;
     uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_283, v_283BufferSize);
+    uint raw7 = read_mem(param_14, param_15, v_308, v_308BufferSize);
     Alloc param_16 = a;
     uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_283, v_283BufferSize);
+    uint raw8 = read_mem(param_16, param_17, v_308, v_308BufferSize);
     AnnoLinGradient s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
     s.linewidth = as_type<float>(raw4);
@@ -623,73 +644,73 @@ AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLin
 }
 
 static inline __attribute__((always_inline))
-AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u };
-    return AnnoLinGradient_read(param, param_1, v_283, v_283BufferSize);
+    return AnnoLinGradient_read(param, param_1, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_283, v_283BufferSize);
+    write_mem(param_6, param_7, param_8, v_308, v_308BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_283, v_283BufferSize);
+    write_mem(param_9, param_10, param_11, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
+    uint raw5 = read_mem(param_10, param_11, v_308, v_308BufferSize);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize);
+    uint raw6 = read_mem(param_12, param_13, v_308, v_308BufferSize);
     AnnoImage s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
     s.linewidth = as_type<float>(raw4);
@@ -699,68 +720,115 @@ AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref,
 }
 
 static inline __attribute__((always_inline))
-AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u };
-    return AnnoImage_read(param, param_1, v_283, v_283BufferSize);
+    return AnnoImage_read(param, param_1, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
+    write_mem(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
     Alloc param_3 = a;
     CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
     CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_283, v_283BufferSize);
+    CmdImage_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_308, v_308BufferSize);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_308, v_308BufferSize);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_308, v_308BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_308, v_308BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_308, v_308BufferSize);
+    AnnoEndClip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.blend = raw4;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
+{
+    Alloc param = a;
+    AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
+    return AnnoEndClip_read(param, param_1, v_308, v_308BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.blend;
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
+    Alloc param_3 = a;
+    CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
+    CmdEndClip param_5 = s;
+    CmdEndClip_write(param_3, param_4, param_5, v_308, v_308BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_308, constant uint& v_308BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_283, v_283BufferSize);
+    write_mem(param, param_1, param_2, v_308, v_308BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_283 [[buffer(0)]], const device ConfigBuf& _1169 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_308 [[buffer(0)]], const device ConfigBuf& _1283 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -772,19 +840,19 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_283BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_1169.conf.width_in_tiles + 16u) - 1u) / 16u;
+    constant uint& v_308BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_1283.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1169.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1283.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1169.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1283.conf.width_in_tiles) + bin_tile_x) + tile_x;
     Alloc param;
-    param.offset = _1169.conf.ptcl_alloc.offset;
+    param.offset = _1283.conf.ptcl_alloc.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -796,17 +864,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    bool mem_ok = v_283.mem_error == 0u;
+    bool mem_ok = v_308.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1448;
+    uint _1562;
     uint element_ix;
     AnnotatedRef ref;
     Alloc param_14;
     Alloc param_16;
     uint tile_count;
     Alloc param_23;
-    uint _1770;
+    uint _1887;
     Alloc param_29;
     Tile tile_1;
     AnnoColor fill;
@@ -814,38 +882,39 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     Alloc param_52;
     CmdLinGrad cmd_lin;
     Alloc param_69;
+    Alloc param_95;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1500;
+        bool _1614;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1298 = th_ix < 256u;
-                bool _1306;
-                if (_1298)
+                bool _1412 = th_ix < 256u;
+                bool _1420;
+                if (_1412)
                 {
-                    _1306 = (partition_ix + th_ix) < n_partitions;
+                    _1420 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1306 = _1298;
+                    _1420 = _1412;
                 }
-                if (_1306)
+                if (_1420)
                 {
-                    uint in_ix = (_1169.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_3.offset = _1169.conf.bin_alloc.offset;
+                    uint in_ix = (_1283.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_3.offset = _1283.conf.bin_alloc.offset;
                     uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4, v_283, v_283BufferSize);
-                    param_5.offset = _1169.conf.bin_alloc.offset;
+                    count = read_mem(param_3, param_4, v_308, v_308BufferSize);
+                    param_5.offset = _1283.conf.bin_alloc.offset;
                     uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6, v_283, v_283BufferSize);
+                    uint offset = read_mem(param_5, param_6, v_308, v_308BufferSize);
                     uint param_7 = offset;
                     uint param_8 = count * 4u;
                     bool param_9 = mem_ok;
@@ -889,34 +958,34 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
                 if (part_ix > 0u)
                 {
-                    _1448 = sh_part_count[part_ix - 1u];
+                    _1562 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1448 = part_start_ix;
+                    _1562 = part_start_ix;
                 }
-                ix -= _1448;
+                ix -= _1562;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
                 BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13, v_283, v_283BufferSize);
+                BinInstance inst = BinInstance_read(param_12, param_13, v_308, v_308BufferSize);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1490 = (wr_ix - rd_ix) < 256u;
-            if (_1490)
+            bool _1604 = (wr_ix - rd_ix) < 256u;
+            if (_1604)
             {
-                _1500 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1614 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1500 = _1490;
+                _1614 = _1604;
             }
-            if (_1500)
+            if (_1614)
             {
                 continue;
             }
@@ -929,10 +998,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix * 40u) };
-            param_14.offset = _1169.conf.anno_alloc.offset;
+            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix * 40u) };
+            param_14.offset = _1283.conf.anno_alloc.offset;
             AnnotatedRef param_15 = ref;
-            tag = Annotated_tag(param_14, param_15, v_283, v_283BufferSize).tag;
+            tag = Annotated_tag(param_14, param_15, v_308, v_308BufferSize).tag;
         }
         switch (tag)
         {
@@ -942,11 +1011,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             case 4u:
             case 5u:
             {
-                uint drawmonoid_base = (_1169.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
-                uint path_ix = v_283.memory[drawmonoid_base];
-                param_16.offset = _1169.conf.tile_alloc.offset;
-                PathRef param_17 = PathRef{ _1169.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_16, param_17, v_283, v_283BufferSize);
+                uint drawmonoid_base = (_1283.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
+                uint path_ix = v_308.memory[drawmonoid_base];
+                param_16.offset = _1283.conf.tile_alloc.offset;
+                PathRef param_17 = PathRef{ _1283.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_16, param_17, v_308, v_308BufferSize);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -1000,19 +1069,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     el_ix = probe_1;
                 }
             }
-            AnnotatedRef ref_1 = AnnotatedRef{ _1169.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
-            param_23.offset = _1169.conf.anno_alloc.offset;
+            AnnotatedRef ref_1 = AnnotatedRef{ _1283.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
+            param_23.offset = _1283.conf.anno_alloc.offset;
             AnnotatedRef param_24 = ref_1;
-            uint tag_1 = Annotated_tag(param_23, param_24, v_283, v_283BufferSize).tag;
+            AnnotatedTag anno_tag = Annotated_tag(param_23, param_24, v_308, v_308BufferSize);
+            uint tag_1 = anno_tag.tag;
             if (el_ix > 0u)
             {
-                _1770 = sh_tile_count[el_ix - 1u];
+                _1887 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1770 = 0u;
+                _1887 = 0u;
             }
-            uint seq_ix = ix_1 - _1770;
+            uint seq_ix = ix_1 - _1887;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -1021,27 +1091,45 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             {
                 uint param_25 = el_ix;
                 bool param_26 = mem_ok;
-                Alloc param_27 = read_tile_alloc(param_25, param_26, v_283, v_283BufferSize);
+                Alloc param_27 = read_tile_alloc(param_25, param_26, v_308, v_308BufferSize);
                 TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_27, param_28, v_283, v_283BufferSize);
+                Tile tile = Tile_read(param_27, param_28, v_308, v_308BufferSize);
                 bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
-                bool _1834 = tile.tile.offset != 0u;
-                bool _1843;
-                if (!_1834)
+                bool _1951 = tile.tile.offset != 0u;
+                bool _1960;
+                if (!_1951)
                 {
-                    _1843 = (tile.backdrop == 0) == is_clip;
+                    _1960 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1843 = _1834;
+                    _1960 = _1951;
                 }
-                include_tile = _1843;
+                bool _1972;
+                if (!_1960)
+                {
+                    bool _1971;
+                    if (is_clip)
+                    {
+                        _1971 = (anno_tag.flags & 2u) != 0u;
+                    }
+                    else
+                    {
+                        _1971 = is_clip;
+                    }
+                    _1972 = _1971;
+                }
+                else
+                {
+                    _1972 = _1960;
+                }
+                include_tile = _1972;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1863 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1992 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -1065,10 +1153,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_1 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix_1 * 40u) };
-            param_29.offset = _1169.conf.anno_alloc.offset;
+            ref = AnnotatedRef{ _1283.conf.anno_alloc.offset + (element_ix_1 * 40u) };
+            param_29.offset = _1283.conf.anno_alloc.offset;
             AnnotatedRef param_30 = ref;
-            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_283, v_283BufferSize);
+            AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_308, v_308BufferSize);
             if (clip_zero_depth == 0u)
             {
                 switch (tag_2.tag)
@@ -1077,20 +1165,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     {
                         uint param_31 = element_ref_ix;
                         bool param_32 = mem_ok;
-                        Alloc param_33 = read_tile_alloc(param_31, param_32, v_283, v_283BufferSize);
+                        Alloc param_33 = read_tile_alloc(param_31, param_32, v_308, v_308BufferSize);
                         TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_33, param_34, v_283, v_283BufferSize);
-                        param_35.offset = _1169.conf.anno_alloc.offset;
+                        tile_1 = Tile_read(param_33, param_34, v_308, v_308BufferSize);
+                        param_35.offset = _1283.conf.anno_alloc.offset;
                         AnnotatedRef param_36 = ref;
-                        fill = Annotated_Color_read(param_35, param_36, v_283, v_283BufferSize);
+                        fill = Annotated_Color_read(param_35, param_36, v_308, v_308BufferSize);
                         Alloc param_37 = cmd_alloc;
                         CmdRef param_38 = cmd_ref;
                         uint param_39 = cmd_limit;
-                        bool _1977 = alloc_cmd(param_37, param_38, param_39, v_283, v_283BufferSize);
+                        bool _2105 = alloc_cmd(param_37, param_38, param_39, v_308, v_308BufferSize);
                         cmd_alloc = param_37;
                         cmd_ref = param_38;
                         cmd_limit = param_39;
-                        if (!_1977)
+                        if (!_2105)
                         {
                             break;
                         }
@@ -1099,12 +1187,12 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         uint param_42 = tag_2.flags;
                         Tile param_43 = tile_1;
                         float param_44 = fill.linewidth;
-                        write_fill(param_40, param_41, param_42, param_43, param_44, v_283, v_283BufferSize);
+                        write_fill(param_40, param_41, param_42, param_43, param_44, v_308, v_308BufferSize);
                         cmd_ref = param_41;
                         Alloc param_45 = cmd_alloc;
                         CmdRef param_46 = cmd_ref;
                         CmdColor param_47 = CmdColor{ fill.rgba_color };
-                        Cmd_Color_write(param_45, param_46, param_47, v_283, v_283BufferSize);
+                        Cmd_Color_write(param_45, param_46, param_47, v_308, v_308BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1112,20 +1200,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     {
                         uint param_48 = element_ref_ix;
                         bool param_49 = mem_ok;
-                        Alloc param_50 = read_tile_alloc(param_48, param_49, v_283, v_283BufferSize);
+                        Alloc param_50 = read_tile_alloc(param_48, param_49, v_308, v_308BufferSize);
                         TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_50, param_51, v_283, v_283BufferSize);
-                        param_52.offset = _1169.conf.anno_alloc.offset;
+                        tile_1 = Tile_read(param_50, param_51, v_308, v_308BufferSize);
+                        param_52.offset = _1283.conf.anno_alloc.offset;
                         AnnotatedRef param_53 = ref;
-                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_283, v_283BufferSize);
+                        AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_308, v_308BufferSize);
                         Alloc param_54 = cmd_alloc;
                         CmdRef param_55 = cmd_ref;
                         uint param_56 = cmd_limit;
-                        bool _2049 = alloc_cmd(param_54, param_55, param_56, v_283, v_283BufferSize);
+                        bool _2177 = alloc_cmd(param_54, param_55, param_56, v_308, v_308BufferSize);
                         cmd_alloc = param_54;
                         cmd_ref = param_55;
                         cmd_limit = param_56;
-                        if (!_2049)
+                        if (!_2177)
                         {
                             break;
                         }
@@ -1134,7 +1222,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         uint param_59 = tag_2.flags;
                         Tile param_60 = tile_1;
                         float param_61 = fill.linewidth;
-                        write_fill(param_57, param_58, param_59, param_60, param_61, v_283, v_283BufferSize);
+                        write_fill(param_57, param_58, param_59, param_60, param_61, v_308, v_308BufferSize);
                         cmd_ref = param_58;
                         cmd_lin.index = lin.index;
                         cmd_lin.line_x = lin.line_x;
@@ -1143,7 +1231,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         Alloc param_62 = cmd_alloc;
                         CmdRef param_63 = cmd_ref;
                         CmdLinGrad param_64 = cmd_lin;
-                        Cmd_LinGrad_write(param_62, param_63, param_64, v_283, v_283BufferSize);
+                        Cmd_LinGrad_write(param_62, param_63, param_64, v_308, v_308BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
@@ -1151,20 +1239,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     {
                         uint param_65 = element_ref_ix;
                         bool param_66 = mem_ok;
-                        Alloc param_67 = read_tile_alloc(param_65, param_66, v_283, v_283BufferSize);
+                        Alloc param_67 = read_tile_alloc(param_65, param_66, v_308, v_308BufferSize);
                         TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_67, param_68, v_283, v_283BufferSize);
-                        param_69.offset = _1169.conf.anno_alloc.offset;
+                        tile_1 = Tile_read(param_67, param_68, v_308, v_308BufferSize);
+                        param_69.offset = _1283.conf.anno_alloc.offset;
                         AnnotatedRef param_70 = ref;
-                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_283, v_283BufferSize);
+                        AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_308, v_308BufferSize);
                         Alloc param_71 = cmd_alloc;
                         CmdRef param_72 = cmd_ref;
                         uint param_73 = cmd_limit;
-                        bool _2133 = alloc_cmd(param_71, param_72, param_73, v_283, v_283BufferSize);
+                        bool _2261 = alloc_cmd(param_71, param_72, param_73, v_308, v_308BufferSize);
                         cmd_alloc = param_71;
                         cmd_ref = param_72;
                         cmd_limit = param_73;
-                        if (!_2133)
+                        if (!_2261)
                         {
                             break;
                         }
@@ -1173,12 +1261,12 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         uint param_76 = tag_2.flags;
                         Tile param_77 = tile_1;
                         float param_78 = fill_img.linewidth;
-                        write_fill(param_74, param_75, param_76, param_77, param_78, v_283, v_283BufferSize);
+                        write_fill(param_74, param_75, param_76, param_77, param_78, v_308, v_308BufferSize);
                         cmd_ref = param_75;
                         Alloc param_79 = cmd_alloc;
                         CmdRef param_80 = cmd_ref;
                         CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset };
-                        Cmd_Image_write(param_79, param_80, param_81, v_283, v_283BufferSize);
+                        Cmd_Image_write(param_79, param_80, param_81, v_308, v_308BufferSize);
                         cmd_ref.offset += 12u;
                         break;
                     }
@@ -1186,20 +1274,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     {
                         uint param_82 = element_ref_ix;
                         bool param_83 = mem_ok;
-                        Alloc param_84 = read_tile_alloc(param_82, param_83, v_283, v_283BufferSize);
+                        Alloc param_84 = read_tile_alloc(param_82, param_83, v_308, v_308BufferSize);
                         TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_84, param_85, v_283, v_283BufferSize);
-                        bool _2194 = tile_1.tile.offset == 0u;
-                        bool _2200;
-                        if (_2194)
+                        tile_1 = Tile_read(param_84, param_85, v_308, v_308BufferSize);
+                        bool _2322 = tile_1.tile.offset == 0u;
+                        bool _2328;
+                        if (_2322)
                         {
-                            _2200 = tile_1.backdrop == 0;
+                            _2328 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2200 = _2194;
+                            _2328 = _2322;
                         }
-                        if (_2200)
+                        if (_2328)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1208,17 +1296,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                             Alloc param_86 = cmd_alloc;
                             CmdRef param_87 = cmd_ref;
                             uint param_88 = cmd_limit;
-                            bool _2212 = alloc_cmd(param_86, param_87, param_88, v_283, v_283BufferSize);
+                            bool _2340 = alloc_cmd(param_86, param_87, param_88, v_308, v_308BufferSize);
                             cmd_alloc = param_86;
                             cmd_ref = param_87;
                             cmd_limit = param_88;
-                            if (!_2212)
+                            if (!_2340)
                             {
                                 break;
                             }
                             Alloc param_89 = cmd_alloc;
                             CmdRef param_90 = cmd_ref;
-                            Cmd_BeginClip_write(param_89, param_90, v_283, v_283BufferSize);
+                            Cmd_BeginClip_write(param_89, param_90, v_308, v_308BufferSize);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
@@ -1228,32 +1316,36 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     {
                         uint param_91 = element_ref_ix;
                         bool param_92 = mem_ok;
-                        Alloc param_93 = read_tile_alloc(param_91, param_92, v_283, v_283BufferSize);
+                        Alloc param_93 = read_tile_alloc(param_91, param_92, v_308, v_308BufferSize);
                         TileRef param_94 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                        tile_1 = Tile_read(param_93, param_94, v_283, v_283BufferSize);
+                        tile_1 = Tile_read(param_93, param_94, v_308, v_308BufferSize);
+                        param_95.offset = _1283.conf.anno_alloc.offset;
+                        AnnotatedRef param_96 = ref;
+                        AnnoEndClip end_clip = Annotated_EndClip_read(param_95, param_96, v_308, v_308BufferSize);
                         clip_depth--;
-                        Alloc param_95 = cmd_alloc;
-                        CmdRef param_96 = cmd_ref;
-                        uint param_97 = cmd_limit;
-                        bool _2261 = alloc_cmd(param_95, param_96, param_97, v_283, v_283BufferSize);
-                        cmd_alloc = param_95;
-                        cmd_ref = param_96;
-                        cmd_limit = param_97;
-                        if (!_2261)
+                        Alloc param_97 = cmd_alloc;
+                        CmdRef param_98 = cmd_ref;
+                        uint param_99 = cmd_limit;
+                        bool _2398 = alloc_cmd(param_97, param_98, param_99, v_308, v_308BufferSize);
+                        cmd_alloc = param_97;
+                        cmd_ref = param_98;
+                        cmd_limit = param_99;
+                        if (!_2398)
                         {
                             break;
                         }
-                        Alloc param_98 = cmd_alloc;
-                        CmdRef param_99 = cmd_ref;
-                        uint param_100 = 0u;
-                        Tile param_101 = tile_1;
-                        float param_102 = 0.0;
-                        write_fill(param_98, param_99, param_100, param_101, param_102, v_283, v_283BufferSize);
-                        cmd_ref = param_99;
-                        Alloc param_103 = cmd_alloc;
-                        CmdRef param_104 = cmd_ref;
-                        Cmd_EndClip_write(param_103, param_104, v_283, v_283BufferSize);
-                        cmd_ref.offset += 4u;
+                        Alloc param_100 = cmd_alloc;
+                        CmdRef param_101 = cmd_ref;
+                        uint param_102 = 0u;
+                        Tile param_103 = tile_1;
+                        float param_104 = 0.0;
+                        write_fill(param_100, param_101, param_102, param_103, param_104, v_308, v_308BufferSize);
+                        cmd_ref = param_101;
+                        Alloc param_105 = cmd_alloc;
+                        CmdRef param_106 = cmd_ref;
+                        CmdEndClip param_107 = CmdEndClip{ end_clip.blend };
+                        Cmd_EndClip_write(param_105, param_106, param_107, v_308, v_308BufferSize);
+                        cmd_ref.offset += 8u;
                         break;
                     }
                 }
@@ -1286,21 +1378,21 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             break;
         }
     }
-    bool _2326 = (bin_tile_x + tile_x) < _1169.conf.width_in_tiles;
-    bool _2335;
-    if (_2326)
+    bool _2467 = (bin_tile_x + tile_x) < _1283.conf.width_in_tiles;
+    bool _2476;
+    if (_2467)
     {
-        _2335 = (bin_tile_y + tile_y) < _1169.conf.height_in_tiles;
+        _2476 = (bin_tile_y + tile_y) < _1283.conf.height_in_tiles;
     }
     else
     {
-        _2335 = _2326;
+        _2476 = _2467;
     }
-    if (_2335)
+    if (_2476)
     {
-        Alloc param_105 = cmd_alloc;
-        CmdRef param_106 = cmd_ref;
-        Cmd_End_write(param_105, param_106, v_283, v_283BufferSize);
+        Alloc param_108 = cmd_alloc;
+        CmdRef param_109 = cmd_ref;
+        Cmd_End_write(param_108, param_109, v_308, v_308BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index b30e2d8..1fef2d7 100644
Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index d901a80..d1567c9 100644
Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index 0ca5843..1f2f78b 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -41,6 +41,17 @@ struct FillImage
     int2 offset;
 };
 
+struct ClipRef
+{
+    uint offset;
+};
+
+struct Clip
+{
+    float4 bbox;
+    uint blend;
+};
+
 struct ElementTag
 {
     uint tag;
@@ -102,6 +113,7 @@ struct AnnoBeginClip
 {
     float4 bbox;
     float linewidth;
+    uint blend;
 };
 
 struct AnnoEndClipRef
@@ -112,6 +124,7 @@ struct AnnoEndClipRef
 struct AnnoEndClip
 {
     float4 bbox;
+    uint blend;
 };
 
 struct AnnotatedRef
@@ -148,14 +161,14 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const DrawMonoid _348 = { 0u, 0u };
-static const DrawMonoid _372 = { 1u, 0u };
-static const DrawMonoid _374 = { 1u, 1u };
+static const DrawMonoid _413 = { 0u, 0u };
+static const DrawMonoid _437 = { 1u, 0u };
+static const DrawMonoid _439 = { 1u, 1u };
 
-RWByteAddressBuffer _187 : register(u0, space0);
-ByteAddressBuffer _211 : register(t2, space0);
-ByteAddressBuffer _934 : register(t3, space0);
-ByteAddressBuffer _968 : register(t1, space0);
+RWByteAddressBuffer _199 : register(u0, space0);
+ByteAddressBuffer _223 : register(t2, space0);
+ByteAddressBuffer _1020 : register(t3, space0);
+ByteAddressBuffer _1054 : register(t1, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -171,9 +184,9 @@ groupshared DrawMonoid sh_scratch[256];
 
 ElementTag Element_tag(ElementRef ref)
 {
-    uint tag_and_flags = _211.Load((ref.offset >> uint(2)) * 4 + 0);
-    ElementTag _321 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _321;
+    uint tag_and_flags = _223.Load((ref.offset >> uint(2)) * 4 + 0);
+    ElementTag _378 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _378;
 }
 
 DrawMonoid map_tag(uint tag_word)
@@ -184,24 +197,24 @@ DrawMonoid map_tag(uint tag_word)
         case 5u:
         case 6u:
         {
-            return _372;
+            return _437;
         }
         case 9u:
         case 10u:
         {
-            return _374;
+            return _439;
         }
         default:
         {
-            return _348;
+            return _413;
         }
     }
 }
 
 ElementRef Element_index(ElementRef ref, uint index)
 {
-    ElementRef _200 = { ref.offset + (index * 36u) };
-    return _200;
+    ElementRef _212 = { ref.offset + (index * 36u) };
+    return _212;
 }
 
 DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
@@ -214,13 +227,13 @@ DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
 
 DrawMonoid tag_monoid_identity()
 {
-    return _348;
+    return _413;
 }
 
 FillColor FillColor_read(FillColorRef ref)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = _211.Load((ix + 0u) * 4 + 0);
+    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
     FillColor s;
     s.rgba_color = raw0;
     return s;
@@ -228,8 +241,8 @@ FillColor FillColor_read(FillColorRef ref)
 
 FillColor Element_FillColor_read(ElementRef ref)
 {
-    FillColorRef _327 = { ref.offset + 4u };
-    FillColorRef param = _327;
+    FillColorRef _384 = { ref.offset + 4u };
+    FillColorRef param = _384;
     return FillColor_read(param);
 }
 
@@ -246,7 +259,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _187.Store(offset * 4 + 8, val);
+    _199.Store(offset * 4 + 8, val);
 }
 
 void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s)
@@ -284,9 +297,9 @@ void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 1u;
     write_mem(param, param_1, param_2);
-    AnnoColorRef _735 = { ref.offset + 4u };
+    AnnoColorRef _818 = { ref.offset + 4u };
     Alloc param_3 = a;
-    AnnoColorRef param_4 = _735;
+    AnnoColorRef param_4 = _818;
     AnnoColor param_5 = s;
     AnnoColor_write(param_3, param_4, param_5);
 }
@@ -294,11 +307,11 @@ void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s)
 FillLinGradient FillLinGradient_read(FillLinGradientRef ref)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = _211.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _211.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _211.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _211.Load((ix + 3u) * 4 + 0);
-    uint raw4 = _211.Load((ix + 4u) * 4 + 0);
+    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _223.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _223.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _223.Load((ix + 4u) * 4 + 0);
     FillLinGradient s;
     s.index = raw0;
     s.p0 = float2(asfloat(raw1), asfloat(raw2));
@@ -308,8 +321,8 @@ FillLinGradient FillLinGradient_read(FillLinGradientRef ref)
 
 FillLinGradient Element_FillLinGradient_read(ElementRef ref)
 {
-    FillLinGradientRef _335 = { ref.offset + 4u };
-    FillLinGradientRef param = _335;
+    FillLinGradientRef _392 = { ref.offset + 4u };
+    FillLinGradientRef param = _392;
     return FillLinGradient_read(param);
 }
 
@@ -360,9 +373,9 @@ void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinG
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 2u;
     write_mem(param, param_1, param_2);
-    AnnoLinGradientRef _756 = { ref.offset + 4u };
+    AnnoLinGradientRef _839 = { ref.offset + 4u };
     Alloc param_3 = a;
-    AnnoLinGradientRef param_4 = _756;
+    AnnoLinGradientRef param_4 = _839;
     AnnoLinGradient param_5 = s;
     AnnoLinGradient_write(param_3, param_4, param_5);
 }
@@ -370,8 +383,8 @@ void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinG
 FillImage FillImage_read(FillImageRef ref)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = _211.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _211.Load((ix + 1u) * 4 + 0);
+    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
     FillImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -380,8 +393,8 @@ FillImage FillImage_read(FillImageRef ref)
 
 FillImage Element_FillImage_read(ElementRef ref)
 {
-    FillImageRef _343 = { ref.offset + 4u };
-    FillImageRef param = _343;
+    FillImageRef _400 = { ref.offset + 4u };
+    FillImageRef param = _400;
     return FillImage_read(param);
 }
 
@@ -424,13 +437,33 @@ void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 3u;
     write_mem(param, param_1, param_2);
-    AnnoImageRef _777 = { ref.offset + 4u };
+    AnnoImageRef _860 = { ref.offset + 4u };
     Alloc param_3 = a;
-    AnnoImageRef param_4 = _777;
+    AnnoImageRef param_4 = _860;
     AnnoImage param_5 = s;
     AnnoImage_write(param_3, param_4, param_5);
 }
 
+Clip Clip_read(ClipRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _223.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _223.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _223.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _223.Load((ix + 3u) * 4 + 0);
+    Clip s;
+    s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.blend = _223.Load((ix + 4u) * 4 + 0);
+    return s;
+}
+
+Clip Element_BeginClip_read(ElementRef ref)
+{
+    ClipRef _408 = { ref.offset + 4u };
+    ClipRef param = _408;
+    return Clip_read(param);
+}
+
 void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s)
 {
     uint ix = ref.offset >> uint(2);
@@ -454,6 +487,10 @@ void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s)
     uint param_13 = ix + 4u;
     uint param_14 = asuint(s.linewidth);
     write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.blend;
+    write_mem(param_15, param_16, param_17);
 }
 
 void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginClip s)
@@ -462,9 +499,9 @@ void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginC
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 4u;
     write_mem(param, param_1, param_2);
-    AnnoBeginClipRef _798 = { ref.offset + 4u };
+    AnnoBeginClipRef _881 = { ref.offset + 4u };
     Alloc param_3 = a;
-    AnnoBeginClipRef param_4 = _798;
+    AnnoBeginClipRef param_4 = _881;
     AnnoBeginClip param_5 = s;
     AnnoBeginClip_write(param_3, param_4, param_5);
 }
@@ -488,17 +525,21 @@ void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s)
     uint param_10 = ix + 3u;
     uint param_11 = asuint(s.bbox.w);
     write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = s.blend;
+    write_mem(param_12, param_13, param_14);
 }
 
-void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s)
+void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoEndClip s)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 5u;
+    uint param_2 = (flags << uint(16)) | 5u;
     write_mem(param, param_1, param_2);
-    AnnoEndClipRef _816 = { ref.offset + 4u };
+    AnnoEndClipRef _902 = { ref.offset + 4u };
     Alloc param_3 = a;
-    AnnoEndClipRef param_4 = _816;
+    AnnoEndClipRef param_4 = _902;
     AnnoEndClip param_5 = s;
     AnnoEndClip_write(param_3, param_4, param_5);
 }
@@ -506,8 +547,8 @@ void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s)
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    ElementRef _834 = { ix * 36u };
-    ElementRef ref = _834;
+    ElementRef _920 = { ix * 36u };
+    ElementRef ref = _920;
     ElementRef param = ref;
     uint tag_word = Element_tag(param).tag;
     uint param_1 = tag_word;
@@ -544,11 +585,11 @@ void comp_main()
     DrawMonoid row = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        DrawMonoid _940;
-        _940.path_ix = _934.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
-        _940.clip_ix = _934.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
-        row.path_ix = _940.path_ix;
-        row.clip_ix = _940.clip_ix;
+        DrawMonoid _1026;
+        _1026.path_ix = _1020.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
+        _1026.clip_ix = _1020.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
+        row.path_ix = _1026.path_ix;
+        row.clip_ix = _1026.clip_ix;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -557,10 +598,10 @@ void comp_main()
         row = combine_tag_monoid(param_10, param_11);
     }
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_968.Load(44) >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _968.Load(48) >> uint(2);
-    AnnotatedRef _989 = { _968.Load(32) + (out_ix * 40u) };
-    AnnotatedRef out_ref = _989;
+    uint out_base = (_1054.Load(44) >> uint(2)) + (out_ix * 2u);
+    uint clip_out_base = _1054.Load(48) >> uint(2);
+    AnnotatedRef _1075 = { _1054.Load(32) + (out_ix * 40u) };
+    AnnotatedRef out_ref = _1075;
     float4 mat;
     float2 translate;
     AnnoColor anno_fill;
@@ -570,9 +611,9 @@ void comp_main()
     AnnoImage anno_img;
     Alloc param_28;
     AnnoBeginClip anno_begin_clip;
-    Alloc param_32;
+    Alloc param_33;
     AnnoEndClip anno_end_clip;
-    Alloc param_36;
+    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
@@ -582,8 +623,8 @@ void comp_main()
             DrawMonoid param_13 = local[i_2 - 1u];
             m = combine_tag_monoid(param_12, param_13);
         }
-        _187.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
-        _187.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
+        _199.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
+        _199.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
         ElementRef param_14 = ref;
         uint param_15 = i_2;
         ElementRef this_ref = Element_index(param_14, param_15);
@@ -591,22 +632,22 @@ void comp_main()
         tag_word = Element_tag(param_16).tag;
         if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
         {
-            uint bbox_offset = (_968.Load(40) >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_187.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_187.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_187.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_187.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_1054.Load(40) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_199.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_199.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_199.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_199.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_187.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_199.Load((bbox_offset + 4u) * 4 + 8));
             uint fill_mode = uint(linewidth >= 0.0f);
             if ((linewidth >= 0.0f) || (tag_word == 5u))
             {
-                uint trans_ix = _187.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_968.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_187.Load(t * 4 + 8), _187.Load((t + 1u) * 4 + 8), _187.Load((t + 2u) * 4 + 8), _187.Load((t + 3u) * 4 + 8)));
+                uint trans_ix = _199.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_1054.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_199.Load(t * 4 + 8), _199.Load((t + 1u) * 4 + 8), _199.Load((t + 2u) * 4 + 8), _199.Load((t + 3u) * 4 + 8)));
                 if (tag_word == 5u)
                 {
-                    translate = asfloat(uint2(_187.Load((t + 4u) * 4 + 8), _187.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_199.Load((t + 4u) * 4 + 8), _199.Load((t + 5u) * 4 + 8)));
                 }
             }
             if (linewidth >= 0.0f)
@@ -623,9 +664,9 @@ void comp_main()
                     anno_fill.bbox = bbox;
                     anno_fill.linewidth = linewidth;
                     anno_fill.rgba_color = fill.rgba_color;
-                    Alloc _1203;
-                    _1203.offset = _968.Load(32);
-                    param_18.offset = _1203.offset;
+                    Alloc _1288;
+                    _1288.offset = _1054.Load(32);
+                    param_18.offset = _1288.offset;
                     AnnotatedRef param_19 = out_ref;
                     uint param_20 = fill_mode;
                     AnnoColor param_21 = anno_fill;
@@ -648,9 +689,9 @@ void comp_main()
                     anno_lin.line_x = line_x;
                     anno_lin.line_y = line_y;
                     anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    Alloc _1299;
-                    _1299.offset = _968.Load(32);
-                    param_23.offset = _1299.offset;
+                    Alloc _1384;
+                    _1384.offset = _1054.Load(32);
+                    param_23.offset = _1384.offset;
                     AnnotatedRef param_24 = out_ref;
                     uint param_25 = fill_mode;
                     AnnoLinGradient param_26 = anno_lin;
@@ -665,9 +706,9 @@ void comp_main()
                     anno_img.linewidth = linewidth;
                     anno_img.index = fill_img.index;
                     anno_img.offset = fill_img.offset;
-                    Alloc _1327;
-                    _1327.offset = _968.Load(32);
-                    param_28.offset = _1327.offset;
+                    Alloc _1412;
+                    _1412.offset = _1054.Load(32);
+                    param_28.offset = _1412.offset;
                     AnnotatedRef param_29 = out_ref;
                     uint param_30 = fill_mode;
                     AnnoImage param_31 = anno_img;
@@ -676,15 +717,19 @@ void comp_main()
                 }
                 case 9u:
                 {
+                    ElementRef param_32 = this_ref;
+                    Clip begin_clip = Element_BeginClip_read(param_32);
                     anno_begin_clip.bbox = bbox;
                     anno_begin_clip.linewidth = 0.0f;
-                    Alloc _1344;
-                    _1344.offset = _968.Load(32);
-                    param_32.offset = _1344.offset;
-                    AnnotatedRef param_33 = out_ref;
-                    uint param_34 = 0u;
-                    AnnoBeginClip param_35 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_32, param_33, param_34, param_35);
+                    anno_begin_clip.blend = begin_clip.blend;
+                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
+                    Alloc _1442;
+                    _1442.offset = _1054.Load(32);
+                    param_33.offset = _1442.offset;
+                    AnnotatedRef param_34 = out_ref;
+                    uint param_35 = flags;
+                    AnnoBeginClip param_36 = anno_begin_clip;
+                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36);
                     break;
                 }
             }
@@ -693,13 +738,18 @@ void comp_main()
         {
             if (tag_word == 10u)
             {
+                ElementRef param_37 = this_ref;
+                Clip end_clip = Element_BeginClip_read(param_37);
                 anno_end_clip.bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
-                Alloc _1368;
-                _1368.offset = _968.Load(32);
-                param_36.offset = _1368.offset;
-                AnnotatedRef param_37 = out_ref;
-                AnnoEndClip param_38 = anno_end_clip;
-                Annotated_EndClip_write(param_36, param_37, param_38);
+                anno_end_clip.blend = end_clip.blend;
+                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
+                Alloc _1480;
+                _1480.offset = _1054.Load(32);
+                param_38.offset = _1480.offset;
+                AnnotatedRef param_39 = out_ref;
+                uint param_40 = flags_1;
+                AnnoEndClip param_41 = anno_end_clip;
+                Annotated_EndClip_write(param_38, param_39, param_40, param_41);
             }
         }
         if ((tag_word == 9u) || (tag_word == 10u))
@@ -709,7 +759,7 @@ void comp_main()
             {
                 path_ix = m.path_ix;
             }
-            _187.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
+            _199.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
         }
         out_ref.offset += 40u;
     }
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index 8de5379..5b9ecc6 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -87,6 +87,17 @@ struct FillImage
     int2 offset;
 };
 
+struct ClipRef
+{
+    uint offset;
+};
+
+struct Clip
+{
+    float4 bbox;
+    uint blend;
+};
+
 struct ElementTag
 {
     uint tag;
@@ -148,6 +159,7 @@ struct AnnoBeginClip
 {
     float4 bbox;
     float linewidth;
+    uint blend;
 };
 
 struct AnnoEndClipRef
@@ -158,6 +170,7 @@ struct AnnoEndClipRef
 struct AnnoEndClip
 {
     float4 bbox;
+    uint blend;
 };
 
 struct AnnotatedRef
@@ -228,9 +241,9 @@ struct ConfigBuf
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_211)
+ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
-    uint tag_and_flags = v_211.scene[ref.offset >> uint(2)];
+    uint tag_and_flags = v_223.scene[ref.offset >> uint(2)];
     return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
@@ -279,20 +292,20 @@ DrawMonoid tag_monoid_identity()
 }
 
 static inline __attribute__((always_inline))
-FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_211)
+FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_223)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
+    uint raw0 = v_223.scene[ix + 0u];
     FillColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
     FillColorRef param = FillColorRef{ ref.offset + 4u };
-    return FillColor_read(param, v_211);
+    return FillColor_read(param, v_223);
 }
 
 static inline __attribute__((always_inline))
@@ -302,7 +315,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_187)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_199)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -310,61 +323,61 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_187.memory[offset] = val;
+    v_199.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_187)
+void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_187)
+void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u };
     AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5, v_187);
+    AnnoColor_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_211)
+FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_223)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
-    uint raw1 = v_211.scene[ix + 1u];
-    uint raw2 = v_211.scene[ix + 2u];
-    uint raw3 = v_211.scene[ix + 3u];
-    uint raw4 = v_211.scene[ix + 4u];
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
+    uint raw2 = v_223.scene[ix + 2u];
+    uint raw3 = v_223.scene[ix + 3u];
+    uint raw4 = v_223.scene[ix + 4u];
     FillLinGradient s;
     s.index = raw0;
     s.p0 = float2(as_type<float>(raw1), as_type<float>(raw2));
@@ -373,73 +386,73 @@ FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const
 }
 
 static inline __attribute__((always_inline))
-FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
     FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u };
-    return FillLinGradient_read(param, v_211);
+    return FillLinGradient_read(param, v_223);
 }
 
 static inline __attribute__((always_inline))
-void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_187)
+void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
     Alloc param_18 = a;
     uint param_19 = ix + 6u;
     uint param_20 = as_type<uint>(s.line_x);
-    write_mem(param_18, param_19, param_20, v_187);
+    write_mem(param_18, param_19, param_20, v_199);
     Alloc param_21 = a;
     uint param_22 = ix + 7u;
     uint param_23 = as_type<uint>(s.line_y);
-    write_mem(param_21, param_22, param_23, v_187);
+    write_mem(param_21, param_22, param_23, v_199);
     Alloc param_24 = a;
     uint param_25 = ix + 8u;
     uint param_26 = as_type<uint>(s.line_c);
-    write_mem(param_24, param_25, param_26, v_187);
+    write_mem(param_24, param_25, param_26, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_187)
+void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u };
     AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5, v_187);
+    AnnoLinGradient_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_211)
+FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_223)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
-    uint raw1 = v_211.scene[ix + 1u];
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
     FillImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -447,140 +460,169 @@ FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf&
 }
 
 static inline __attribute__((always_inline))
-FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
     FillImageRef param = FillImageRef{ ref.offset + 4u };
-    return FillImage_read(param, v_211);
+    return FillImage_read(param, v_223);
 }
 
 static inline __attribute__((always_inline))
-void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_187)
+void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
     Alloc param_18 = a;
     uint param_19 = ix + 6u;
     uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_18, param_19, param_20, v_187);
+    write_mem(param_18, param_19, param_20, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_187)
+void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u };
     AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5, v_187);
+    AnnoImage_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_187)
+Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_223)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
+    uint raw2 = v_223.scene[ix + 2u];
+    uint raw3 = v_223.scene[ix + 3u];
+    Clip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.blend = v_223.scene[ix + 4u];
+    return s;
+}
+
+static inline __attribute__((always_inline))
+Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_223)
+{
+    ClipRef param = ClipRef{ ref.offset + 4u };
+    return Clip_read(param, v_223);
+}
+
+static inline __attribute__((always_inline))
+void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = s.blend;
+    write_mem(param_15, param_16, param_17, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_187)
+void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u };
     AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5, v_187);
+    AnnoBeginClip_write(param_3, param_4, param_5, v_199);
 }
 
 static inline __attribute__((always_inline))
-void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
+void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_199)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = s.blend;
+    write_mem(param_12, param_13, param_14, v_199);
 }
 
 static inline __attribute__((always_inline))
-void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
+void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoEndClip& s, device Memory& v_199)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_187);
+    uint param_2 = (flags << uint(16)) | 5u;
+    write_mem(param, param_1, param_2, v_199);
     Alloc param_3 = a;
     AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u };
     AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5, v_187);
+    AnnoEndClip_write(param_3, param_4, param_5, v_199);
 }
 
-kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _968 [[buffer(1)]], const device SceneBuf& v_211 [[buffer(2)]], const device ParentBuf& _934 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& v_199 [[buffer(0)]], const device ConfigBuf& _1054 [[buffer(1)]], const device SceneBuf& v_223 [[buffer(2)]], const device ParentBuf& _1020 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
     ElementRef ref = ElementRef{ ix * 36u };
     ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_211).tag;
+    uint tag_word = Element_tag(param, v_223).tag;
     uint param_1 = tag_word;
     DrawMonoid agg = map_tag(param_1);
     spvUnsafeArray<DrawMonoid, 8> local;
@@ -590,7 +632,7 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
         ElementRef param_2 = ref;
         uint param_3 = i;
         ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_211).tag;
+        tag_word = Element_tag(param_4, v_223).tag;
         uint param_5 = tag_word;
         DrawMonoid param_6 = agg;
         DrawMonoid param_7 = map_tag(param_5);
@@ -615,9 +657,9 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
     DrawMonoid row = tag_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _937 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _934.parent[_937].path_ix;
-        row.clip_ix = _934.parent[_937].clip_ix;
+        uint _1023 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _1020.parent[_1023].path_ix;
+        row.clip_ix = _1020.parent[_1023].clip_ix;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -626,9 +668,9 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
         row = combine_tag_monoid(param_10, param_11);
     }
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_968.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _968.conf.clip_alloc.offset >> uint(2);
-    AnnotatedRef out_ref = AnnotatedRef{ _968.conf.anno_alloc.offset + (out_ix * 40u) };
+    uint out_base = (_1054.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
+    uint clip_out_base = _1054.conf.clip_alloc.offset >> uint(2);
+    AnnotatedRef out_ref = AnnotatedRef{ _1054.conf.anno_alloc.offset + (out_ix * 40u) };
     float4 mat;
     float2 translate;
     AnnoColor anno_fill;
@@ -638,9 +680,9 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
     AnnoImage anno_img;
     Alloc param_28;
     AnnoBeginClip anno_begin_clip;
-    Alloc param_32;
+    Alloc param_33;
     AnnoEndClip anno_end_clip;
-    Alloc param_36;
+    Alloc param_38;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
@@ -650,31 +692,31 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
             DrawMonoid param_13 = local[i_2 - 1u];
             m = combine_tag_monoid(param_12, param_13);
         }
-        v_187.memory[out_base + (i_2 * 2u)] = m.path_ix;
-        v_187.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
+        v_199.memory[out_base + (i_2 * 2u)] = m.path_ix;
+        v_199.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
         ElementRef param_14 = ref;
         uint param_15 = i_2;
         ElementRef this_ref = Element_index(param_14, param_15);
         ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16, v_211).tag;
+        tag_word = Element_tag(param_16, v_223).tag;
         if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
         {
-            uint bbox_offset = (_968.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(v_187.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(v_187.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(v_187.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(v_187.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_1054.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(v_199.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(v_199.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(v_199.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(v_199.memory[bbox_offset + 3u]) - 32768.0;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(v_187.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(v_199.memory[bbox_offset + 4u]);
             uint fill_mode = uint(linewidth >= 0.0);
             if ((linewidth >= 0.0) || (tag_word == 5u))
             {
-                uint trans_ix = v_187.memory[bbox_offset + 5u];
-                uint t = (_968.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(v_187.memory[t], v_187.memory[t + 1u], v_187.memory[t + 2u], v_187.memory[t + 3u]));
+                uint trans_ix = v_199.memory[bbox_offset + 5u];
+                uint t = (_1054.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(v_199.memory[t], v_199.memory[t + 1u], v_199.memory[t + 2u], v_199.memory[t + 3u]));
                 if (tag_word == 5u)
                 {
-                    translate = as_type<float2>(uint2(v_187.memory[t + 4u], v_187.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(v_199.memory[t + 4u], v_199.memory[t + 5u]));
                 }
             }
             if (linewidth >= 0.0)
@@ -687,21 +729,21 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
                 case 4u:
                 {
                     ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17, v_211);
+                    FillColor fill = Element_FillColor_read(param_17, v_223);
                     anno_fill.bbox = bbox;
                     anno_fill.linewidth = linewidth;
                     anno_fill.rgba_color = fill.rgba_color;
-                    param_18.offset = _968.conf.anno_alloc.offset;
+                    param_18.offset = _1054.conf.anno_alloc.offset;
                     AnnotatedRef param_19 = out_ref;
                     uint param_20 = fill_mode;
                     AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_187);
+                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_199);
                     break;
                 }
                 case 5u:
                 {
                     ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_211);
+                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_223);
                     anno_lin.bbox = bbox;
                     anno_lin.linewidth = linewidth;
                     anno_lin.index = lin.index;
@@ -714,37 +756,41 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
                     anno_lin.line_x = line_x;
                     anno_lin.line_y = line_y;
                     anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    param_23.offset = _968.conf.anno_alloc.offset;
+                    param_23.offset = _1054.conf.anno_alloc.offset;
                     AnnotatedRef param_24 = out_ref;
                     uint param_25 = fill_mode;
                     AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_187);
+                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_199);
                     break;
                 }
                 case 6u:
                 {
                     ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27, v_211);
+                    FillImage fill_img = Element_FillImage_read(param_27, v_223);
                     anno_img.bbox = bbox;
                     anno_img.linewidth = linewidth;
                     anno_img.index = fill_img.index;
                     anno_img.offset = fill_img.offset;
-                    param_28.offset = _968.conf.anno_alloc.offset;
+                    param_28.offset = _1054.conf.anno_alloc.offset;
                     AnnotatedRef param_29 = out_ref;
                     uint param_30 = fill_mode;
                     AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_187);
+                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_199);
                     break;
                 }
                 case 9u:
                 {
+                    ElementRef param_32 = this_ref;
+                    Clip begin_clip = Element_BeginClip_read(param_32, v_223);
                     anno_begin_clip.bbox = bbox;
                     anno_begin_clip.linewidth = 0.0;
-                    param_32.offset = _968.conf.anno_alloc.offset;
-                    AnnotatedRef param_33 = out_ref;
-                    uint param_34 = 0u;
-                    AnnoBeginClip param_35 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_32, param_33, param_34, param_35, v_187);
+                    anno_begin_clip.blend = begin_clip.blend;
+                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
+                    param_33.offset = _1054.conf.anno_alloc.offset;
+                    AnnotatedRef param_34 = out_ref;
+                    uint param_35 = flags;
+                    AnnoBeginClip param_36 = anno_begin_clip;
+                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_199);
                     break;
                 }
             }
@@ -753,11 +799,16 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
         {
             if (tag_word == 10u)
             {
+                ElementRef param_37 = this_ref;
+                Clip end_clip = Element_BeginClip_read(param_37, v_223);
                 anno_end_clip.bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
-                param_36.offset = _968.conf.anno_alloc.offset;
-                AnnotatedRef param_37 = out_ref;
-                AnnoEndClip param_38 = anno_end_clip;
-                Annotated_EndClip_write(param_36, param_37, param_38, v_187);
+                anno_end_clip.blend = end_clip.blend;
+                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
+                param_38.offset = _1054.conf.anno_alloc.offset;
+                AnnotatedRef param_39 = out_ref;
+                uint param_40 = flags_1;
+                AnnoEndClip param_41 = anno_end_clip;
+                Annotated_EndClip_write(param_38, param_39, param_40, param_41, v_199);
             }
         }
         if ((tag_word == 9u) || (tag_word == 10u))
@@ -767,7 +818,7 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
             {
                 path_ix = m.path_ix;
             }
-            v_187.memory[clip_out_base + m.clip_ix] = path_ix;
+            v_199.memory[clip_out_base + m.clip_ix] = path_ix;
         }
         out_ref.offset += 40u;
     }
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index d5e9136..bdbdb0c 100644
Binary files a/piet-gpu/shader/gen/draw_leaf.spv and b/piet-gpu/shader/gen/draw_leaf.spv differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index 0a14cfa..c0c27c9 100644
Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 9457d14..21bd083 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -69,6 +69,16 @@ struct CmdAlpha
     float alpha;
 };
 
+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -132,8 +142,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _202 : register(u0, space0);
-ByteAddressBuffer _723 : register(t1, space0);
+RWByteAddressBuffer _278 : register(u0, space0);
+ByteAddressBuffer _1521 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -160,8 +170,8 @@ float4 spvUnpackUnorm4x8(uint value)
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _215 = { a.offset + offset };
-    return _215;
+    Alloc _291 = { a.offset + offset };
+    return _291;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -177,7 +187,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _202.Load(offset * 4 + 8);
+    uint v = _278.Load(offset * 4 + 8);
     return v;
 }
 
@@ -186,8 +196,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _432 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _432;
+    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _525;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -207,9 +217,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _449 = { ref.offset + 4u };
+    CmdStrokeRef _542 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _449;
+    CmdStrokeRef param_1 = _542;
     return CmdStroke_read(param, param_1);
 }
 
@@ -245,8 +255,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _572 = { raw5 };
-    s.next = _572;
+    TileSegRef _675 = { raw5 };
+    s.next = _675;
     return s;
 }
 
@@ -272,9 +282,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _439 = { ref.offset + 4u };
+    CmdFillRef _532 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _439;
+    CmdFillRef param_1 = _532;
     return CmdFill_read(param, param_1);
 }
 
@@ -291,9 +301,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _459 = { ref.offset + 4u };
+    CmdAlphaRef _552 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _459;
+    CmdAlphaRef param_1 = _552;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -310,9 +320,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _469 = { ref.offset + 4u };
+    CmdColorRef _562 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _469;
+    CmdColorRef param_1 = _562;
     return CmdColor_read(param, param_1);
 }
 
@@ -356,9 +366,9 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _479 = { ref.offset + 4u };
+    CmdLinGradRef _572 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _479;
+    CmdLinGradRef param_1 = _572;
     return CmdLinGrad_read(param, param_1);
 }
 
@@ -379,9 +389,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _489 = { ref.offset + 4u };
+    CmdImageRef _582 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _489;
+    CmdImageRef param_1 = _582;
     return CmdImage_read(param, param_1);
 }
 
@@ -394,10 +404,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1493 = fromsRGB(param_1);
+        fg_rgba.x = _1493.x;
+        fg_rgba.y = _1493.y;
+        fg_rgba.z = _1493.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -418,6 +428,438 @@ uint packsRGB(inout float4 rgba)
     return spvPackUnorm4x8(rgba.wzyx);
 }
 
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    CmdEndClip s;
+    s.blend = raw0;
+    return s;
+}
+
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
+{
+    CmdEndClipRef _592 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdEndClipRef param_1 = _592;
+    return CmdEndClip_read(param, param_1);
+}
+
+float3 screen(float3 cb, float3 cs)
+{
+    return (cb + cs) - (cb * cs);
+}
+
+float3 hard_light(float3 cb, float3 cs)
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0f) - 1.0f.xxx;
+    return lerp(screen(param, param_1), (cb * 2.0f) * cs, float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+}
+
+float color_dodge(float cb, float cs)
+{
+    if (cb == 0.0f)
+    {
+        return 0.0f;
+    }
+    else
+    {
+        if (cs == 1.0f)
+        {
+            return 1.0f;
+        }
+        else
+        {
+            return min(1.0f, cb / (1.0f - cs));
+        }
+    }
+}
+
+float color_burn(float cb, float cs)
+{
+    if (cb == 1.0f)
+    {
+        return 1.0f;
+    }
+    else
+    {
+        if (cs == 0.0f)
+        {
+            return 0.0f;
+        }
+        else
+        {
+            return 1.0f - min(1.0f, (1.0f - cb) / cs);
+        }
+    }
+}
+
+float3 soft_light(float3 cb, float3 cs)
+{
+    float3 d = lerp(sqrt(cb), ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb, float3(bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z)));
+    return lerp(cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)), cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)), float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z)));
+}
+
+float sat(float3 c)
+{
+    return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z));
+}
+
+void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s)
+{
+    if (cmax > cmin)
+    {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin);
+        cmax = s;
+    }
+    else
+    {
+        cmid = 0.0f;
+        cmax = 0.0f;
+    }
+    cmin = 0.0f;
+}
+
+float3 set_sat(inout float3 c, float s)
+{
+    if (c.x <= c.y)
+    {
+        if (c.y <= c.z)
+        {
+            float param = c.x;
+            float param_1 = c.y;
+            float param_2 = c.z;
+            float param_3 = s;
+            set_sat_inner(param, param_1, param_2, param_3);
+            c.x = param;
+            c.y = param_1;
+            c.z = param_2;
+        }
+        else
+        {
+            if (c.x <= c.z)
+            {
+                float param_4 = c.x;
+                float param_5 = c.z;
+                float param_6 = c.y;
+                float param_7 = s;
+                set_sat_inner(param_4, param_5, param_6, param_7);
+                c.x = param_4;
+                c.z = param_5;
+                c.y = param_6;
+            }
+            else
+            {
+                float param_8 = c.z;
+                float param_9 = c.x;
+                float param_10 = c.y;
+                float param_11 = s;
+                set_sat_inner(param_8, param_9, param_10, param_11);
+                c.z = param_8;
+                c.x = param_9;
+                c.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (c.x <= c.z)
+        {
+            float param_12 = c.y;
+            float param_13 = c.x;
+            float param_14 = c.z;
+            float param_15 = s;
+            set_sat_inner(param_12, param_13, param_14, param_15);
+            c.y = param_12;
+            c.x = param_13;
+            c.z = param_14;
+        }
+        else
+        {
+            if (c.y <= c.z)
+            {
+                float param_16 = c.y;
+                float param_17 = c.z;
+                float param_18 = c.x;
+                float param_19 = s;
+                set_sat_inner(param_16, param_17, param_18, param_19);
+                c.y = param_16;
+                c.z = param_17;
+                c.x = param_18;
+            }
+            else
+            {
+                float param_20 = c.z;
+                float param_21 = c.y;
+                float param_22 = c.x;
+                float param_23 = s;
+                set_sat_inner(param_20, param_21, param_22, param_23);
+                c.z = param_20;
+                c.y = param_21;
+                c.x = param_22;
+            }
+        }
+    }
+    return c;
+}
+
+float lum(float3 c)
+{
+    float3 f = float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f);
+    return dot(c, f);
+}
+
+float3 clip_color(inout float3 c)
+{
+    float3 param = c;
+    float L = lum(param);
+    float n = min(c.x, min(c.y, c.z));
+    float x = max(c.x, max(c.y, c.z));
+    if (n < 0.0f)
+    {
+        c = L.xxx + (((c - L.xxx) * L) / (L - n).xxx);
+    }
+    if (x > 1.0f)
+    {
+        c = L.xxx + (((c - L.xxx) * (1.0f - L)) / (x - L).xxx);
+    }
+    return c;
+}
+
+float3 set_lum(float3 c, float l)
+{
+    float3 param = c;
+    float3 param_1 = c + (l - lum(param)).xxx;
+    float3 _901 = clip_color(param_1);
+    return _901;
+}
+
+float3 mix_blend(float3 cb, float3 cs, uint mode)
+{
+    float3 b = 0.0f.xxx;
+    switch (mode)
+    {
+        case 1u:
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u:
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u:
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u:
+        {
+            b = min(cb, cs);
+            break;
+        }
+        case 5u:
+        {
+            b = max(cb, cs);
+            break;
+        }
+        case 6u:
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u:
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u:
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u:
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u:
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u:
+        {
+            b = (cb + cs) - ((cb * 2.0f) * cs);
+            break;
+        }
+        case 12u:
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs;
+            float param_22 = sat(param_20);
+            float3 _1192 = set_sat(param_21, param_22);
+            float3 param_23 = cb;
+            float3 param_24 = _1192;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u:
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb;
+            float param_28 = sat(param_26);
+            float3 _1206 = set_sat(param_27, param_28);
+            float3 param_29 = cb;
+            float3 param_30 = _1206;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u:
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u:
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default:
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
+{
+    float fa = 0.0f;
+    float fb = 0.0f;
+    switch (mode)
+    {
+        case 1u:
+        {
+            fa = 1.0f;
+            fb = 0.0f;
+            break;
+        }
+        case 2u:
+        {
+            fa = 0.0f;
+            fb = 1.0f;
+            break;
+        }
+        case 3u:
+        {
+            fa = 1.0f;
+            fb = 1.0f - as;
+            break;
+        }
+        case 4u:
+        {
+            fa = 1.0f - ab;
+            fb = 1.0f;
+            break;
+        }
+        case 5u:
+        {
+            fa = ab;
+            fb = 0.0f;
+            break;
+        }
+        case 6u:
+        {
+            fa = 0.0f;
+            fb = as;
+            break;
+        }
+        case 7u:
+        {
+            fa = 1.0f - ab;
+            fb = 0.0f;
+            break;
+        }
+        case 8u:
+        {
+            fa = 0.0f;
+            fb = 1.0f - as;
+            break;
+        }
+        case 9u:
+        {
+            fa = ab;
+            fb = 1.0f - as;
+            break;
+        }
+        case 10u:
+        {
+            fa = 1.0f - ab;
+            fb = as;
+            break;
+        }
+        case 11u:
+        {
+            fa = 1.0f - ab;
+            fb = 1.0f - as;
+            break;
+        }
+        case 12u:
+        {
+            fa = 1.0f;
+            fb = 1.0f;
+            break;
+        }
+        case 13u:
+        {
+            return float4(max(0.0f.xxxx, ((1.0f.xxxx - (float4(cs, as) * as)) + 1.0f.xxxx) - (float4(cb, ab) * ab)).xyz, max(0.0f, ((1.0f - as) + 1.0f) - ab));
+        }
+        case 14u:
+        {
+            return float4(min(1.0f.xxxx, (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, min(1.0f, as + ab));
+        }
+        default:
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+}
+
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -431,24 +873,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _499 = { ref.offset + 4u };
+    CmdJumpRef _602 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _499;
+    CmdJumpRef param_1 = _602;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.Load(8)) + gl_WorkGroupID.x;
-    Alloc _738;
-    _738.offset = _723.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1536;
+    _1536.offset = _1521.Load(24);
     Alloc param;
-    param.offset = _738.offset;
+    param.offset = _1536.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _747 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _747;
+    CmdRef _1545 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1545;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -457,7 +899,7 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _202.Load(4) == 0u;
+    bool mem_ok = _278.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -482,8 +924,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _842 = { stroke.tile_ref };
-                tile_seg_ref = _842;
+                TileSegRef _1638 = { stroke.tile_ref };
+                tile_seg_ref = _1638;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -519,8 +961,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _964 = { fill.tile_ref };
-                tile_seg_ref = _964;
+                TileSegRef _1758 = { fill.tile_ref };
+                tile_seg_ref = _1758;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -609,10 +1051,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2092 = fromsRGB(param_29);
+                    fg_rgba.x = _2092.x;
+                    fg_rgba.y = _2092.y;
+                    fg_rgba.z = _2092.z;
                     rgba[k_9] = fg_rgba;
                 }
                 cmd_ref.offset += 20u;
@@ -625,9 +1067,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_30, param_31);
                 uint2 param_32 = xy_uint;
                 CmdImage param_33 = fill_img;
-                float4 _1327[8];
-                fillImage(_1327, param_32, param_33);
-                float4 img[8] = _1327;
+                float4 _2121[8];
+                fillImage(_2121, param_32, param_33);
+                float4 img[8] = _2121;
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
                     float4 fg_k_1 = img[k_10] * area[k_10];
@@ -642,8 +1084,8 @@ void comp_main()
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2184 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2184;
                     rgba[k_11] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -652,24 +1094,44 @@ void comp_main()
             }
             case 9u:
             {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
                 for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                     float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2251 = fg_1;
+                    float _2255 = fg_1.w;
+                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2262.x;
+                    fg_1.y = _2262.y;
+                    fg_1.z = _2262.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                 }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                 break;
             }
             case 10u:
             {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                CmdRef _1453 = { Cmd_Jump_read(param_36, param_37).new_ref };
-                cmd_ref = _1453;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2299;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -677,9 +1139,9 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_38 = i_1;
-        float3 param_39 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_38))] = float4(tosRGB(param_39), rgba[i_1].w);
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 3dc7517..9a8fbd0 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -115,6 +115,16 @@ struct CmdAlpha
     float alpha;
 };
 
+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -208,7 +218,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -216,29 +226,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_202.memory[offset];
+    uint v = v_278.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_202);
+    uint tag_and_flags = read_mem(param, param_1, v_278);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -246,11 +256,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_202);
+    return CmdStroke_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -262,27 +272,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_202);
+    uint raw4 = read_mem(param_8, param_9, v_278);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_202);
+    uint raw5 = read_mem(param_10, param_11, v_278);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -298,15 +308,15 @@ uint2 chunk_offset(thread const uint& i)
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -314,51 +324,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdFill payload that starts at ref.offset + 4, just past the header word Cmd_tag reads.
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_202);
+    return CmdFill_read(param, param_1, v_278); // v_278 is the device Memory buffer forwarded to read_mem
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278) // Loads one word at word index ref.offset >> 2 and reinterprets its bits as the float alpha.
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278); // raw bits; converted with as_type<float> below, not a numeric cast
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdAlpha payload that starts at ref.offset + 4, just past the header word Cmd_tag reads.
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_202);
+    return CmdAlpha_read(param, param_1, v_278); // v_278 is the device Memory buffer forwarded to read_mem
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278) // Loads one word at word index ref.offset >> 2 and stores it unmodified in rgba_color.
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278); // kept as a packed uint; main0 later passes rgba_color to unpacksRGB
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdColor payload that starts at ref.offset + 4, just past the header word Cmd_tag reads.
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_202);
+    return CmdColor_read(param, param_1, v_278); // v_278 is the device Memory buffer forwarded to read_mem
 }
 
 static inline __attribute__((always_inline))
@@ -379,21 +389,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -403,23 +413,23 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdLinGrad payload that starts at ref.offset + 4, just past the header word Cmd_tag reads.
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_202);
+    return CmdLinGrad_read(param, param_1, v_278); // v_278 is the device Memory buffer forwarded to read_mem
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -427,11 +437,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdImage payload that starts at ref.offset + 4, just past the header word Cmd_tag reads.
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_202);
+    return CmdImage_read(param, param_1, v_278); // v_278 is the device Memory buffer forwarded to read_mem
 }
 
 static inline __attribute__((always_inline))
@@ -444,10 +454,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1493 = fromsRGB(param_1);
+        fg_rgba.x = _1493.x;
+        fg_rgba.y = _1493.y;
+        fg_rgba.z = _1493.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -471,30 +481,476 @@ uint packsRGB(thread float4& rgba)
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278) // Loads one word at word index ref.offset >> 2 and stores it unmodified in blend.
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278); // read_mem returns 0 when its bounds check fails
+    CmdEndClip s;
+    s.blend = raw0; // packed value: main0 uses blend >> 8 as the mix_blend mode and blend & 255 as the mix_compose mode
+    return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdEndClip payload that follows a command's header word.
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+{
+    Alloc param = a;
+    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u }; // payload starts at ref.offset + 4, just past the word Cmd_tag reads
+    return CmdEndClip_read(param, param_1, v_278);
+}
+
+static inline __attribute__((always_inline)) // Component-wise "screen" of two colours.
+float3 screen(thread const float3& cb, thread const float3& cs)
+{
+    return (cb + cs) - (cb * cs); // algebraically equal to 1 - (1 - cb) * (1 - cs)
+}
+
+static inline __attribute__((always_inline)) // Per channel: 2 * cb * cs where cs <= 0.5, otherwise screen(cb, 2 * cs - 1).
+float3 hard_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0) - float3(1.0); // second screen() operand, used only on channels where cs > 0.5
+    return mix(screen(param, param_1), (cb * 2.0) * cs, float3(cs <= float3(0.5))); // the 0/1 mask makes mix() pick its second argument where cs <= 0.5
+}
+
+static inline __attribute__((always_inline)) // Single-channel dodge: min(1, cb / (1 - cs)), with the two degenerate inputs handled first.
+float color_dodge(thread const float& cb, thread const float& cs)
+{
+    if (cb == 0.0) // tested before cs, so cb == 0 gives 0 even when cs == 1
+    {
+        return 0.0;
+    }
+    else
+    {
+        if (cs == 1.0) // avoids dividing by (1 - cs) == 0 below
+        {
+            return 1.0;
+        }
+        else
+        {
+            return fast::min(1.0, cb / (1.0 - cs)); // result is capped at 1
+        }
+    }
+}
+
+static inline __attribute__((always_inline)) // Single-channel burn: 1 - min(1, (1 - cb) / cs), with the two degenerate inputs handled first.
+float color_burn(thread const float& cb, thread const float& cs)
+{
+    if (cb == 1.0) // tested before cs, so cb == 1 gives 1 even when cs == 0
+    {
+        return 1.0;
+    }
+    else
+    {
+        if (cs == 0.0) // avoids dividing by cs == 0 below
+        {
+            return 0.0;
+        }
+        else
+        {
+            return 1.0 - fast::min(1.0, (1.0 - cb) / cs); // the quotient is capped at 1, so the result is never negative
+        }
+    }
+}
+
+static inline __attribute__((always_inline)) // Per-channel soft-light blend of cb with cs; both mix() calls use 0/1 masks to choose a branch per channel.
+float3 soft_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, float3(cb <= float3(0.25))); // d = ((16*cb - 12)*cb + 4)*cb where cb <= 0.25, else sqrt(cb)
+    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), float3(cs <= float3(0.5))); // cb - (1 - 2*cs)*cb*(1 - cb) where cs <= 0.5, else cb + (2*cs - 1)*(d - cb)
+}
+
+static inline __attribute__((always_inline)) // Returns the spread between the largest and smallest channel of c.
+float sat(thread const float3& c)
+{
+    return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z)); // max(x, y, z) - min(x, y, z)
+}
+
+static inline __attribute__((always_inline)) // Rewrites the three channel values in place so that cmin becomes 0 and, when cmax > cmin, cmax becomes s.
+void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s)
+{
+    if (cmax > cmin)
+    {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin); // keeps cmid's relative position between cmin and cmax; must run before cmax is overwritten
+        cmax = s;
+    }
+    else
+    {
+        cmid = 0.0; // cmax is not greater than cmin, so the (cmax - cmin) divisor is skipped and all three end up 0
+        cmax = 0.0;
+    }
+    cmin = 0.0;
+}
+
+static inline __attribute__((always_inline)) // Orders c's channels, hands them to set_sat_inner as (min, mid, max), and writes the results back; c is modified in place and also returned.
+float3 set_sat(thread float3& c, thread const float& s)
+{
+    if (c.x <= c.y)
+    {
+        if (c.y <= c.z)
+        {
+            float param = c.x;
+            float param_1 = c.y;
+            float param_2 = c.z;
+            float param_3 = s;
+            set_sat_inner(param, param_1, param_2, param_3); // ordering here: x <= y <= z
+            c.x = param;
+            c.y = param_1;
+            c.z = param_2;
+        }
+        else
+        {
+            if (c.x <= c.z)
+            {
+                float param_4 = c.x;
+                float param_5 = c.z;
+                float param_6 = c.y;
+                float param_7 = s;
+                set_sat_inner(param_4, param_5, param_6, param_7); // ordering here: x <= z < y
+                c.x = param_4;
+                c.z = param_5;
+                c.y = param_6;
+            }
+            else
+            {
+                float param_8 = c.z;
+                float param_9 = c.x;
+                float param_10 = c.y;
+                float param_11 = s;
+                set_sat_inner(param_8, param_9, param_10, param_11); // ordering here: z < x <= y
+                c.z = param_8;
+                c.x = param_9;
+                c.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (c.x <= c.z)
+        {
+            float param_12 = c.y;
+            float param_13 = c.x;
+            float param_14 = c.z;
+            float param_15 = s;
+            set_sat_inner(param_12, param_13, param_14, param_15); // ordering here: y < x <= z
+            c.y = param_12;
+            c.x = param_13;
+            c.z = param_14;
+        }
+        else
+        {
+            if (c.y <= c.z)
+            {
+                float param_16 = c.y;
+                float param_17 = c.z;
+                float param_18 = c.x;
+                float param_19 = s;
+                set_sat_inner(param_16, param_17, param_18, param_19); // ordering here: y <= z < x
+                c.y = param_16;
+                c.z = param_17;
+                c.x = param_18;
+            }
+            else
+            {
+                float param_20 = c.z;
+                float param_21 = c.y;
+                float param_22 = c.x;
+                float param_23 = s;
+                set_sat_inner(param_20, param_21, param_22, param_23); // ordering here: z < y < x
+                c.z = param_20;
+                c.y = param_21;
+                c.x = param_22;
+            }
+        }
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline)) // Returns a fixed weighted sum of c's three channels.
+float lum(thread const float3& c)
+{
+    float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375); // weights are the float values nearest 0.3, 0.59 and 0.11
+    return dot(c, f);
+}
+
+static inline __attribute__((always_inline)) // Rescales c around its lum() value when a channel is below 0 or above 1; c is modified in place and also returned.
+float3 clip_color(thread float3& c)
+{
+    float3 param = c;
+    float L = lum(param); // L, n and x are all computed once from the incoming c and are not refreshed between the two adjustments
+    float n = fast::min(c.x, fast::min(c.y, c.z));
+    float x = fast::max(c.x, fast::max(c.y, c.z));
+    if (n < 0.0)
+    {
+        c = float3(L) + (((c - float3(L)) * L) / float3(L - n)); // maps the smallest channel (value n) to exactly 0
+    }
+    if (x > 1.0)
+    {
+        c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L)); // maps a channel equal to x to exactly 1
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline)) // Shifts c so that its lum() equals l, then passes the result through clip_color.
+float3 set_lum(thread const float3& c, thread const float& l)
+{
+    float3 param = c;
+    float3 param_1 = c + float3(l - lum(param)); // the same offset (l - lum(c)) is added to every channel
+    float3 _901 = clip_color(param_1); // clip_color mutates its argument; only the local copy param_1 is affected
+    return _901;
+}
+
+static inline __attribute__((always_inline)) // Applies the blend function selected by mode to the colour pair (cb, cs) and returns the blended colour.
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode) // main0 passes bg.xyz as cb and fg_1.xyz as cs
+{
+    float3 b = float3(0.0);
+    switch (mode)
+    {
+        case 1u: // component-wise product (the W3C "multiply" formula)
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u: // screen(cb, cs)
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u: // hard_light with the operands swapped, i.e. hard_light(cs, cb) (matches the W3C "overlay" formula)
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u: // component-wise minimum (the W3C "darken" formula)
+        {
+            b = fast::min(cb, cs);
+            break;
+        }
+        case 5u: // component-wise maximum (the W3C "lighten" formula)
+        {
+            b = fast::max(cb, cs);
+            break;
+        }
+        case 6u: // color_dodge applied to each channel separately
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u: // color_burn applied to each channel separately
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u: // hard_light(cb, cs)
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u: // soft_light(cb, cs)
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u: // absolute difference per channel (the W3C "difference" formula)
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u: // cb + cs - 2*cb*cs (the W3C "exclusion" formula)
+        {
+            b = (cb + cs) - ((cb * 2.0) * cs);
+            break;
+        }
+        case 12u: // set_lum(set_sat(cs, sat(cb)), lum(cb)) (matches the W3C "hue" formula)
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs;
+            float param_22 = sat(param_20);
+            float3 _1192 = set_sat(param_21, param_22); // set_sat mutates its first argument; only the local copy param_21 is affected
+            float3 param_23 = cb;
+            float3 param_24 = _1192;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u: // set_lum(set_sat(cb, sat(cs)), lum(cb)) (matches the W3C "saturation" formula)
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb;
+            float param_28 = sat(param_26);
+            float3 _1206 = set_sat(param_27, param_28); // set_sat mutates its first argument; only the local copy param_27 is affected
+            float3 param_29 = cb;
+            float3 param_30 = _1206;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u: // set_lum(cs, lum(cb)) (matches the W3C "color" formula)
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u: // set_lum(cb, lum(cs)) (matches the W3C "luminosity" formula)
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default: // mode 0 and every mode above 15: cs is returned unchanged
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+static inline __attribute__((always_inline)) // Combines (cs, as) and (cb, ab) using per-mode weights: fa scales the cs side, fb scales the cb side.
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode) // main0 passes bg.xyz/bg.w as cb/ab and fg_1.xyz/fg_1.w as cs/as
+{
+    float fa = 0.0; // both weights stay 0 for mode 0 and for any mode above 14, so the final return is all zeros
+    float fb = 0.0;
+    switch (mode)
+    {
+        case 1u: // cs side only (Porter-Duff "source" coefficients)
+        {
+            fa = 1.0;
+            fb = 0.0;
+            break;
+        }
+        case 2u: // cb side only (Porter-Duff "destination" coefficients)
+        {
+            fa = 0.0;
+            fb = 1.0;
+            break;
+        }
+        case 3u: // Porter-Duff "source over" coefficients
+        {
+            fa = 1.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 4u: // Porter-Duff "destination over" coefficients
+        {
+            fa = 1.0 - ab;
+            fb = 1.0;
+            break;
+        }
+        case 5u: // Porter-Duff "source in" coefficients
+        {
+            fa = ab;
+            fb = 0.0;
+            break;
+        }
+        case 6u: // Porter-Duff "destination in" coefficients
+        {
+            fa = 0.0;
+            fb = as;
+            break;
+        }
+        case 7u: // Porter-Duff "source out" coefficients
+        {
+            fa = 1.0 - ab;
+            fb = 0.0;
+            break;
+        }
+        case 8u: // Porter-Duff "destination out" coefficients
+        {
+            fa = 0.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 9u: // Porter-Duff "source atop" coefficients
+        {
+            fa = ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 10u: // Porter-Duff "destination atop" coefficients
+        {
+            fa = 1.0 - ab;
+            fb = as;
+            break;
+        }
+        case 11u: // Porter-Duff "xor" coefficients
+        {
+            fa = 1.0 - ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 12u: // both sides at full weight (plain sum, not clamped)
+        {
+            fa = 1.0;
+            fb = 1.0;
+            break;
+        }
+        case 13u: // early return, bypasses fa/fb: rgb = max(0, 2 - cs*as - cb*ab), alpha = max(0, 2 - as - ab)
+        {
+            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+        }
+        case 14u: // early return, bypasses fa/fb: rgb = min(1, cs*as + cb*ab), alpha = min(1, as + ab)
+        {
+            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+        }
+        default:
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb)); // rgb = cs*as*fa + cb*ab*fb; NOTE(review): w evaluates to as*as*fa + ab*ab*fb (alpha squared) - confirm this is intended
+}
+
+static inline __attribute__((always_inline)) // Loads one word at word index ref.offset >> 2 and stores it unmodified in new_ref.
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_278); // read_mem returns 0 when its bounds check fails
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdJump payload that starts at ref.offset + 4, just past the header word Cmd_tag reads.
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_202);
+    return CmdJump_read(param, param_1, v_278); // v_278 is the device Memory buffer forwarded to read_mem
 }
 
-kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _723.conf.ptcl_alloc.offset;
+    param.offset = _1521.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -507,7 +963,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_202.mem_error == 0u;
+    bool mem_ok = v_278.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -516,7 +972,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_202).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
         if (tag == 0u)
         {
             break;
@@ -527,7 +983,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -540,7 +996,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_202);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -563,7 +1019,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -576,7 +1032,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -620,7 +1076,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -632,7 +1088,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -647,7 +1103,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -657,10 +1113,10 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2092 = fromsRGB(param_29);
+                    fg_rgba.x = _2092.x;
+                    fg_rgba.y = _2092.y;
+                    fg_rgba.z = _2092.z;
                     rgba[k_9] = fg_rgba;
                 }
                 cmd_ref.offset += 20u;
@@ -670,7 +1126,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
                 uint2 param_32 = xy_uint;
                 CmdImage param_33 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -689,8 +1145,8 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2184 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2184;
                     rgba[k_11] = float4(0.0);
                 }
                 clip_depth++;
@@ -699,23 +1155,43 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             }
             case 9u:
             {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
                 for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                     float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2251 = fg_1;
+                    float _2255 = fg_1.w;
+                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
+                    fg_1.x = _2262.x;
+                    fg_1.y = _2262.y;
+                    fg_1.z = _2262.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                 }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                 break;
             }
             case 10u:
             {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -723,9 +1199,9 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_38 = i_1;
-        float3 param_39 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_39), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 31f11c9..4d205ce 100644
Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index f3bd028..18c4b7e 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index 5ff97fb..7dc2e01 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -69,6 +69,16 @@ struct CmdAlpha
     float alpha;
 };
 
+struct CmdEndClipRef // Reference wrapper holding the position of a CmdEndClip record.
+{
+    uint offset; // presumably a byte offset, as in the other *Ref structs whose readers shift it right by 2 - TODO confirm
+};
+
+struct CmdEndClip // Payload of the end-clip command: a single packed word.
+{
+    uint blend; // NOTE(review): the MSL build of this kernel splits the same field as blend >> 8 and blend & 255 - confirm the HLSL main does likewise
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -132,8 +142,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _202 : register(u0, space0);
-ByteAddressBuffer _723 : register(t1, space0);
+RWByteAddressBuffer _278 : register(u0, space0); // read/write raw buffer; read_mem loads words from it with _278.Load(offset * 4 + 8)
+ByteAddressBuffer _1521 : register(t1, space0); // read-only raw buffer bound at t1; presumably the config block - TODO confirm against main
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -160,8 +170,8 @@ float4 spvUnpackUnorm4x8(uint value)
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _215 = { a.offset + offset };
-    return _215;
+    Alloc _291 = { a.offset + offset };
+    return _291;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -177,7 +187,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _202.Load(offset * 4 + 8);
+    uint v = _278.Load(offset * 4 + 8);
     return v;
 }
 
@@ -186,8 +196,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _432 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _432;
+    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _525;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -207,9 +217,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _449 = { ref.offset + 4u };
+    CmdStrokeRef _542 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _449;
+    CmdStrokeRef param_1 = _542;
     return CmdStroke_read(param, param_1);
 }
 
@@ -245,8 +255,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _572 = { raw5 };
-    s.next = _572;
+    TileSegRef _675 = { raw5 };
+    s.next = _675;
     return s;
 }
 
@@ -272,9 +282,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _439 = { ref.offset + 4u };
+    CmdFillRef _532 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _439;
+    CmdFillRef param_1 = _532;
     return CmdFill_read(param, param_1);
 }
 
@@ -291,9 +301,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _459 = { ref.offset + 4u };
+    CmdAlphaRef _552 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _459;
+    CmdAlphaRef param_1 = _552;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -310,9 +320,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _469 = { ref.offset + 4u };
+    CmdColorRef _562 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _469;
+    CmdColorRef param_1 = _562;
     return CmdColor_read(param, param_1);
 }
 
@@ -356,9 +366,9 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _479 = { ref.offset + 4u };
+    CmdLinGradRef _572 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _479;
+    CmdLinGradRef param_1 = _572;
     return CmdLinGrad_read(param, param_1);
 }
 
@@ -379,9 +389,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _489 = { ref.offset + 4u };
+    CmdImageRef _582 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _489;
+    CmdImageRef param_1 = _582;
     return CmdImage_read(param, param_1);
 }
 
@@ -394,10 +404,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1493 = fromsRGB(param_1);
+        fg_rgba.x = _1493.x;
+        fg_rgba.y = _1493.y;
+        fg_rgba.z = _1493.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -418,6 +428,438 @@ uint packsRGB(inout float4 rgba)
     return spvPackUnorm4x8(rgba.wzyx);
 }
 
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) // Decode the single-word CmdEndClip record that `ref` points at.
+{
+    uint ix = ref.offset >> uint(2); // byte offset / 4 = word index, the unit read_mem takes
+    Alloc param = a;
+    uint param_1 = ix + 0u; // word 0 is the record's only field
+    uint raw0 = read_mem(param, param_1);
+    CmdEndClip s;
+    s.blend = raw0; // kept packed; comp_main splits it into blend and compose modes
+    return s;
+}
+
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) // Read the CmdEndClip payload of the command located at `ref`.
+{
+    CmdEndClipRef _592 = { ref.offset + 4u }; // payload starts 4 bytes past ref.offset, the word Cmd_tag reads as the tag
+    Alloc param = a;
+    CmdEndClipRef param_1 = _592;
+    return CmdEndClip_read(param, param_1);
+}
+
+float3 screen(float3 cb, float3 cs) // Per-component cb + cs - cb * cs (the "screen" blend formula).
+{
+    return (cb + cs) - (cb * cs);
+}
+
+float3 hard_light(float3 cb, float3 cs) // Per component: 2 * cb * cs where cs <= 0.5, otherwise screen(cb, 2 * cs - 1).
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0f) - 1.0f.xxx; // second screen operand: 2 * cs - 1
+    return lerp(screen(param, param_1), (cb * 2.0f) * cs, float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z))); // weight is 0.0 or 1.0, so lerp acts as a per-component select
+}
+
+float color_dodge(float cb, float cs) // Scalar color-dodge: min(1, cb / (1 - cs)) with the two degenerate inputs special-cased.
+{
+    if (cb == 0.0f) // zero first operand always yields 0 (checked before cs, so cb == 0 && cs == 1 gives 0)
+    {
+        return 0.0f;
+    }
+    else
+    {
+        if (cs == 1.0f) // avoids the division by (1 - cs) == 0 below
+        {
+            return 1.0f;
+        }
+        else
+        {
+            return min(1.0f, cb / (1.0f - cs));
+        }
+    }
+}
+
+float color_burn(float cb, float cs) // Scalar color-burn: 1 - min(1, (1 - cb) / cs) with the two degenerate inputs special-cased.
+{
+    if (cb == 1.0f) // first operand of 1 always yields 1 (checked before cs, so cb == 1 && cs == 0 gives 1)
+    {
+        return 1.0f;
+    }
+    else
+    {
+        if (cs == 0.0f) // avoids the division by cs == 0 below
+        {
+            return 0.0f;
+        }
+        else
+        {
+            return 1.0f - min(1.0f, (1.0f - cb) / cs);
+        }
+    }
+}
+
+float3 soft_light(float3 cb, float3 cs) // Per-component soft-light blend of cb with cs; both lerps use 0.0/1.0 weights, i.e. they select.
+{
+    float3 d = lerp(sqrt(cb), ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb, float3(bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z))); // d = ((16cb - 12)cb + 4)cb where cb <= 0.25, else sqrt(cb)
+    return lerp(cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)), cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)), float3(bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z))); // cs <= 0.5: cb - (1 - 2cs)cb(1 - cb); else cb + (2cs - 1)(d - cb)
+}
+
+float sat(float3 c) // Spread of the three components: largest minus smallest.
+{
+    return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z));
+}
+
+void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) // Rewrites the three values in place so that cmin becomes 0 and cmax becomes s (or all become 0 when cmax <= cmin).
+{
+    if (cmax > cmin)
+    {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin); // keep cmid's relative position between cmin and cmax, rescaled to [0, s]
+        cmax = s;
+    }
+    else
+    {
+        cmid = 0.0f; // no spread to rescale; this guard also avoids dividing by zero
+        cmax = 0.0f;
+    }
+    cmin = 0.0f;
+}
+
+float3 set_sat(inout float3 c, float s) // Orders c's components via comparisons, hands them to set_sat_inner as (min, mid, max), writes them back; c is modified in place and also returned.
+{
+    if (c.x <= c.y)
+    {
+        if (c.y <= c.z) // x <= y <= z
+        {
+            float param = c.x;
+            float param_1 = c.y;
+            float param_2 = c.z;
+            float param_3 = s;
+            set_sat_inner(param, param_1, param_2, param_3);
+            c.x = param;
+            c.y = param_1;
+            c.z = param_2;
+        }
+        else
+        {
+            if (c.x <= c.z) // x <= z < y
+            {
+                float param_4 = c.x;
+                float param_5 = c.z;
+                float param_6 = c.y;
+                float param_7 = s;
+                set_sat_inner(param_4, param_5, param_6, param_7);
+                c.x = param_4;
+                c.z = param_5;
+                c.y = param_6;
+            }
+            else // z < x <= y
+            {
+                float param_8 = c.z;
+                float param_9 = c.x;
+                float param_10 = c.y;
+                float param_11 = s;
+                set_sat_inner(param_8, param_9, param_10, param_11);
+                c.z = param_8;
+                c.x = param_9;
+                c.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (c.x <= c.z) // y < x <= z
+        {
+            float param_12 = c.y;
+            float param_13 = c.x;
+            float param_14 = c.z;
+            float param_15 = s;
+            set_sat_inner(param_12, param_13, param_14, param_15);
+            c.y = param_12;
+            c.x = param_13;
+            c.z = param_14;
+        }
+        else
+        {
+            if (c.y <= c.z) // y <= z < x
+            {
+                float param_16 = c.y;
+                float param_17 = c.z;
+                float param_18 = c.x;
+                float param_19 = s;
+                set_sat_inner(param_16, param_17, param_18, param_19);
+                c.y = param_16;
+                c.z = param_17;
+                c.x = param_18;
+            }
+            else // z < y < x
+            {
+                float param_20 = c.z;
+                float param_21 = c.y;
+                float param_22 = c.x;
+                float param_23 = s;
+                set_sat_inner(param_20, param_21, param_22, param_23);
+                c.z = param_20;
+                c.y = param_21;
+                c.x = param_22;
+            }
+        }
+    }
+    return c;
+}
+
+float lum(float3 c) // Weighted sum of the components: 0.3 * x + 0.59 * y + 0.11 * z.
+{
+    float3 f = float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f); // float32 spellings of 0.3, 0.59, 0.11
+    return dot(c, f);
+}
+
+float3 clip_color(inout float3 c) // Pulls c toward its own lum() value when a component is below 0 or above 1; c is modified in place and returned.
+{
+    float3 param = c;
+    float L = lum(param);
+    float n = min(c.x, min(c.y, c.z)); // n and x are taken from the incoming c and are not recomputed after the first adjustment
+    float x = max(c.x, max(c.y, c.z));
+    if (n < 0.0f)
+    {
+        c = L.xxx + (((c - L.xxx) * L) / (L - n).xxx); // scale offsets from L by L / (L - n)
+    }
+    if (x > 1.0f)
+    {
+        c = L.xxx + (((c - L.xxx) * (1.0f - L)) / (x - L).xxx); // scale offsets from L by (1 - L) / (x - L)
+    }
+    return c;
+}
+
+float3 set_lum(float3 c, float l) // Shifts every component of c by (l - lum(c)), then passes the result through clip_color.
+{
+    float3 param = c;
+    float3 param_1 = c + (l - lum(param)).xxx; // uniform shift so that lum of the shifted color equals l before clipping
+    float3 _901 = clip_color(param_1);
+    return _901;
+}
+
+float3 mix_blend(float3 cb, float3 cs, uint mode) // Blend cb with cs using the formula selected by `mode`; comp_main passes bg.xyz as cb and fg.xyz as cs. NOTE(review): case values presumably mirror the host-side BlendMode enum order — confirm.
+{
+    float3 b = 0.0f.xxx;
+    switch (mode)
+    {
+        case 1u: // component-wise product
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u: // screen(cb, cs)
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u: // hard_light with the operands swapped: hard_light(cs, cb)
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u: // component-wise minimum
+        {
+            b = min(cb, cs);
+            break;
+        }
+        case 5u: // component-wise maximum
+        {
+            b = max(cb, cs);
+            break;
+        }
+        case 6u: // color_dodge applied to each channel separately
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u: // color_burn applied to each channel separately
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u: // hard_light(cb, cs)
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u: // soft_light(cb, cs)
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u: // absolute difference
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u: // cb + cs - 2 * cb * cs
+        {
+            b = (cb + cs) - ((cb * 2.0f) * cs);
+            break;
+        }
+        case 12u: // set_lum(set_sat(cs, sat(cb)), lum(cb))
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs; // local copy: set_sat mutates its first argument
+            float param_22 = sat(param_20);
+            float3 _1192 = set_sat(param_21, param_22);
+            float3 param_23 = cb;
+            float3 param_24 = _1192;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u: // set_lum(set_sat(cb, sat(cs)), lum(cb))
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb; // local copy: set_sat mutates its first argument
+            float param_28 = sat(param_26);
+            float3 _1206 = set_sat(param_27, param_28);
+            float3 param_29 = cb;
+            float3 param_30 = _1206;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u: // set_lum(cs, lum(cb))
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u: // set_lum(cb, lum(cs))
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default: // mode 0 and any unlisted value: result is cs unchanged
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) // Combine (cs, as) with (cb, ab) using per-mode factors fa and fb; comp_main passes bg as cb/ab and fg as cs/as.
+{
+    float fa = 0.0f; // factor applied to the (cs, as) term
+    float fb = 0.0f; // factor applied to the (cb, ab) term
+    switch (mode)
+    {
+        case 1u: // only the cs term
+        {
+            fa = 1.0f;
+            fb = 0.0f;
+            break;
+        }
+        case 2u: // only the cb term
+        {
+            fa = 0.0f;
+            fb = 1.0f;
+            break;
+        }
+        case 3u: // cs term in full, cb term weighted by (1 - as)
+        {
+            fa = 1.0f;
+            fb = 1.0f - as;
+            break;
+        }
+        case 4u: // cs term weighted by (1 - ab), cb term in full
+        {
+            fa = 1.0f - ab;
+            fb = 1.0f;
+            break;
+        }
+        case 5u: // cs term weighted by ab
+        {
+            fa = ab;
+            fb = 0.0f;
+            break;
+        }
+        case 6u: // cb term weighted by as
+        {
+            fa = 0.0f;
+            fb = as;
+            break;
+        }
+        case 7u: // cs term weighted by (1 - ab)
+        {
+            fa = 1.0f - ab;
+            fb = 0.0f;
+            break;
+        }
+        case 8u: // cb term weighted by (1 - as)
+        {
+            fa = 0.0f;
+            fb = 1.0f - as;
+            break;
+        }
+        case 9u: // cs weighted by ab, cb weighted by (1 - as)
+        {
+            fa = ab;
+            fb = 1.0f - as;
+            break;
+        }
+        case 10u: // cs weighted by (1 - ab), cb weighted by as
+        {
+            fa = 1.0f - ab;
+            fb = as;
+            break;
+        }
+        case 11u: // cs weighted by (1 - ab), cb weighted by (1 - as)
+        {
+            fa = 1.0f - ab;
+            fb = 1.0f - as;
+            break;
+        }
+        case 12u: // both terms in full
+        {
+            fa = 1.0f;
+            fb = 1.0f;
+            break;
+        }
+        case 13u: // returns directly, bypassing fa/fb: results clamped below at 0
+        {
+            return float4(max(0.0f.xxxx, ((1.0f.xxxx - (float4(cs, as) * as)) + 1.0f.xxxx) - (float4(cb, ab) * ab)).xyz, max(0.0f, ((1.0f - as) + 1.0f) - ab));
+        }
+        case 14u: // returns directly, bypassing fa/fb: sum of both terms clamped above at 1
+        {
+            return float4(min(1.0f.xxxx, (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, min(1.0f, as + ab));
+        }
+        default: // mode 0 and unlisted values keep fa = fb = 0, so the result below is all zeros
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb)); // NOTE(review): the w lane evaluates to as*as*fa + ab*ab*fb because alpha is inside the scaled vector — confirm this is intended
+}
+
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -431,24 +873,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _499 = { ref.offset + 4u };
+    CmdJumpRef _602 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _499;
+    CmdJumpRef param_1 = _602;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.Load(8)) + gl_WorkGroupID.x;
-    Alloc _738;
-    _738.offset = _723.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1536;
+    _1536.offset = _1521.Load(24);
     Alloc param;
-    param.offset = _738.offset;
+    param.offset = _1536.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _747 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _747;
+    CmdRef _1545 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1545;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -457,7 +899,7 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _202.Load(4) == 0u;
+    bool mem_ok = _278.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -482,8 +924,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _842 = { stroke.tile_ref };
-                tile_seg_ref = _842;
+                TileSegRef _1638 = { stroke.tile_ref };
+                tile_seg_ref = _1638;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -519,8 +961,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _964 = { fill.tile_ref };
-                tile_seg_ref = _964;
+                TileSegRef _1758 = { fill.tile_ref };
+                tile_seg_ref = _1758;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -609,10 +1051,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2092 = fromsRGB(param_29);
+                    fg_rgba.x = _2092.x;
+                    fg_rgba.y = _2092.y;
+                    fg_rgba.z = _2092.z;
                     rgba[k_9] = fg_rgba;
                 }
                 cmd_ref.offset += 20u;
@@ -625,9 +1067,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_30, param_31);
                 uint2 param_32 = xy_uint;
                 CmdImage param_33 = fill_img;
-                float4 _1327[8];
-                fillImage(_1327, param_32, param_33);
-                float4 img[8] = _1327;
+                float4 _2121[8];
+                fillImage(_2121, param_32, param_33);
+                float4 img[8] = _2121;
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
                     float4 fg_k_1 = img[k_10] * area[k_10];
@@ -642,8 +1084,8 @@ void comp_main()
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2184 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2184;
                     rgba[k_11] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -652,24 +1094,44 @@ void comp_main()
             }
             case 9u:
             {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
                 for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                     float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2251 = fg_1;
+                    float _2255 = fg_1.w;
+                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2262.x;
+                    fg_1.y = _2262.y;
+                    fg_1.z = _2262.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                 }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                 break;
             }
             case 10u:
             {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                CmdRef _1453 = { Cmd_Jump_read(param_36, param_37).new_ref };
-                cmd_ref = _1453;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2299;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -677,8 +1139,8 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_38 = i_1;
-        image[int2(xy_uint + chunk_offset(param_38))] = rgba[i_1].w.x;
+        uint param_48 = i_1;
+        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 15351a0..38506dd 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -115,6 +115,16 @@ struct CmdAlpha
     float alpha;
 };
 
+struct CmdEndClipRef // Typed handle locating a CmdEndClip record in the Memory buffer.
+{
+    uint offset; // Byte offset of the record; CmdEndClip_read shifts it right by 2 before indexing memory[] via read_mem.
+};
+
+struct CmdEndClip // Payload returned by CmdEndClip_read / Cmd_EndClip_read.
+{
+    uint blend; // Raw word copied from memory by CmdEndClip_read; presumably packs blend and compose modes as in the HLSL variant — confirm.
+};
+
 struct CmdJumpRef
 {
     uint offset;
@@ -208,7 +218,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -216,29 +226,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_202.memory[offset];
+    uint v = v_278.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_202);
+    uint tag_and_flags = read_mem(param, param_1, v_278);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -246,11 +256,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_202);
+    return CmdStroke_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -262,27 +272,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_202);
+    uint raw4 = read_mem(param_8, param_9, v_278);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_202);
+    uint raw5 = read_mem(param_10, param_11, v_278);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -298,15 +308,15 @@ uint2 chunk_offset(thread const uint& i)
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -314,51 +324,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_202);
+    return CmdFill_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_202);
+    return CmdAlpha_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_202);
+    return CmdColor_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -379,21 +389,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -403,23 +413,23 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_202);
+    return CmdLinGrad_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -427,11 +437,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_202);
+    return CmdImage_read(param, param_1, v_278);
 }
 
 static inline __attribute__((always_inline))
@@ -444,10 +454,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1493 = fromsRGB(param_1);
+        fg_rgba.x = _1493.x;
+        fg_rgba.y = _1493.y;
+        fg_rgba.z = _1493.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -471,30 +481,476 @@ uint packsRGB(thread float4& rgba)
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
+    CmdEndClip s;
+    s.blend = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+{
+    Alloc param = a;
+    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
+    return CmdEndClip_read(param, param_1, v_278);
+}
+
+static inline __attribute__((always_inline))
+float3 screen(thread const float3& cb, thread const float3& cs)
+{
+    return (cb + cs) - (cb * cs);
+}
+
+static inline __attribute__((always_inline))
+float3 hard_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0) - float3(1.0);
+    return mix(screen(param, param_1), (cb * 2.0) * cs, float3(cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float color_dodge(thread const float& cb, thread const float& cs)
+{
+    if (cb == 0.0)
+    {
+        return 0.0;
+    }
+    else
+    {
+        if (cs == 1.0)
+        {
+            return 1.0;
+        }
+        else
+        {
+            return fast::min(1.0, cb / (1.0 - cs));
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float color_burn(thread const float& cb, thread const float& cs)
+{
+    if (cb == 1.0)
+    {
+        return 1.0;
+    }
+    else
+    {
+        if (cs == 0.0)
+        {
+            return 0.0;
+        }
+        else
+        {
+            return 1.0 - fast::min(1.0, (1.0 - cb) / cs);
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float3 soft_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, float3(cb <= float3(0.25)));
+    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), float3(cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float sat(thread const float3& c)
+{
+    return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z));
+}
+
+static inline __attribute__((always_inline))
+void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s)
+{
+    if (cmax > cmin)
+    {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin);
+        cmax = s;
+    }
+    else
+    {
+        cmid = 0.0;
+        cmax = 0.0;
+    }
+    cmin = 0.0;
+}
+
+static inline __attribute__((always_inline))
+float3 set_sat(thread float3& c, thread const float& s)
+{
+    if (c.x <= c.y)
+    {
+        if (c.y <= c.z)
+        {
+            float param = c.x;
+            float param_1 = c.y;
+            float param_2 = c.z;
+            float param_3 = s;
+            set_sat_inner(param, param_1, param_2, param_3);
+            c.x = param;
+            c.y = param_1;
+            c.z = param_2;
+        }
+        else
+        {
+            if (c.x <= c.z)
+            {
+                float param_4 = c.x;
+                float param_5 = c.z;
+                float param_6 = c.y;
+                float param_7 = s;
+                set_sat_inner(param_4, param_5, param_6, param_7);
+                c.x = param_4;
+                c.z = param_5;
+                c.y = param_6;
+            }
+            else
+            {
+                float param_8 = c.z;
+                float param_9 = c.x;
+                float param_10 = c.y;
+                float param_11 = s;
+                set_sat_inner(param_8, param_9, param_10, param_11);
+                c.z = param_8;
+                c.x = param_9;
+                c.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (c.x <= c.z)
+        {
+            float param_12 = c.y;
+            float param_13 = c.x;
+            float param_14 = c.z;
+            float param_15 = s;
+            set_sat_inner(param_12, param_13, param_14, param_15);
+            c.y = param_12;
+            c.x = param_13;
+            c.z = param_14;
+        }
+        else
+        {
+            if (c.y <= c.z)
+            {
+                float param_16 = c.y;
+                float param_17 = c.z;
+                float param_18 = c.x;
+                float param_19 = s;
+                set_sat_inner(param_16, param_17, param_18, param_19);
+                c.y = param_16;
+                c.z = param_17;
+                c.x = param_18;
+            }
+            else
+            {
+                float param_20 = c.z;
+                float param_21 = c.y;
+                float param_22 = c.x;
+                float param_23 = s;
+                set_sat_inner(param_20, param_21, param_22, param_23);
+                c.z = param_20;
+                c.y = param_21;
+                c.x = param_22;
+            }
+        }
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline))
+float lum(thread const float3& c)
+{
+    float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375);
+    return dot(c, f);
+}
+
+static inline __attribute__((always_inline))
+float3 clip_color(thread float3& c)
+{
+    float3 param = c;
+    float L = lum(param);
+    float n = fast::min(c.x, fast::min(c.y, c.z));
+    float x = fast::max(c.x, fast::max(c.y, c.z));
+    if (n < 0.0)
+    {
+        c = float3(L) + (((c - float3(L)) * L) / float3(L - n));
+    }
+    if (x > 1.0)
+    {
+        c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L));
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline))
+float3 set_lum(thread const float3& c, thread const float& l)
+{
+    float3 param = c;
+    float3 param_1 = c + float3(l - lum(param));
+    float3 _901 = clip_color(param_1);
+    return _901;
+}
+
+static inline __attribute__((always_inline))
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode)
+{
+    float3 b = float3(0.0);
+    switch (mode)
+    {
+        case 1u:
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u:
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u:
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u:
+        {
+            b = fast::min(cb, cs);
+            break;
+        }
+        case 5u:
+        {
+            b = fast::max(cb, cs);
+            break;
+        }
+        case 6u:
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u:
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u:
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u:
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u:
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u:
+        {
+            b = (cb + cs) - ((cb * 2.0) * cs);
+            break;
+        }
+        case 12u:
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs;
+            float param_22 = sat(param_20);
+            float3 _1192 = set_sat(param_21, param_22);
+            float3 param_23 = cb;
+            float3 param_24 = _1192;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u:
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb;
+            float param_28 = sat(param_26);
+            float3 _1206 = set_sat(param_27, param_28);
+            float3 param_29 = cb;
+            float3 param_30 = _1206;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u:
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u:
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default:
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+static inline __attribute__((always_inline))
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode)
+{
+    float fa = 0.0;
+    float fb = 0.0;
+    switch (mode)
+    {
+        case 1u:
+        {
+            fa = 1.0;
+            fb = 0.0;
+            break;
+        }
+        case 2u:
+        {
+            fa = 0.0;
+            fb = 1.0;
+            break;
+        }
+        case 3u:
+        {
+            fa = 1.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 4u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0;
+            break;
+        }
+        case 5u:
+        {
+            fa = ab;
+            fb = 0.0;
+            break;
+        }
+        case 6u:
+        {
+            fa = 0.0;
+            fb = as;
+            break;
+        }
+        case 7u:
+        {
+            fa = 1.0 - ab;
+            fb = 0.0;
+            break;
+        }
+        case 8u:
+        {
+            fa = 0.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 9u:
+        {
+            fa = ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 10u:
+        {
+            fa = 1.0 - ab;
+            fb = as;
+            break;
+        }
+        case 11u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 12u:
+        {
+            fa = 1.0;
+            fb = 1.0;
+            break;
+        }
+        case 13u:
+        {
+            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+        }
+        case 14u:
+        {
+            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+        }
+        default:
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_278);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_202);
+    return CmdJump_read(param, param_1, v_278);
 }
 
-kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _723.conf.ptcl_alloc.offset;
+    param.offset = _1521.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -507,7 +963,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_202.mem_error == 0u;
+    bool mem_ok = v_278.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -516,7 +972,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_202).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
         if (tag == 0u)
         {
             break;
@@ -527,7 +983,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -540,7 +996,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_202);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -563,7 +1019,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -576,7 +1032,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -620,7 +1076,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -632,7 +1088,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -647,7 +1103,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -657,10 +1113,10 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2092 = fromsRGB(param_29);
+                    fg_rgba.x = _2092.x;
+                    fg_rgba.y = _2092.y;
+                    fg_rgba.z = _2092.z;
                     rgba[k_9] = fg_rgba;
                 }
                 cmd_ref.offset += 20u;
@@ -670,7 +1126,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
                 uint2 param_32 = xy_uint;
                 CmdImage param_33 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -689,8 +1145,8 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2184 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2184;
                     rgba[k_11] = float4(0.0);
                 }
                 clip_depth++;
@@ -699,23 +1155,43 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
             }
             case 9u:
             {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
                 for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                     float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2251 = fg_1;
+                    float _2255 = fg_1.w;
+                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
+                    fg_1.x = _2262.x;
+                    fg_1.y = _2262.y;
+                    fg_1.z = _2262.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                 }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                 break;
             }
             case 10u:
             {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -723,8 +1199,8 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_38 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 42964c8..305facd 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
index d69db16..fdc60a1 100644
Binary files a/piet-gpu/shader/gen/tile_alloc.dxil and b/piet-gpu/shader/gen/tile_alloc.dxil differ
diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl
index 97e1c23..1c9d04b 100644
--- a/piet-gpu/shader/gen/tile_alloc.hlsl
+++ b/piet-gpu/shader/gen/tile_alloc.hlsl
@@ -17,6 +17,7 @@ struct AnnoEndClipRef
 struct AnnoEndClip
 {
     float4 bbox;
+    uint blend;
 };
 
 struct AnnotatedRef
@@ -76,7 +77,7 @@ struct Config
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
 RWByteAddressBuffer _92 : register(u0, space0);
-ByteAddressBuffer _305 : register(t1, space0);
+ByteAddressBuffer _314 : register(t1, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -111,8 +112,8 @@ AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    AnnotatedTag _236 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _236;
+    AnnotatedTag _246 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _246;
 }
 
 AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
@@ -130,16 +131,20 @@ AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
     uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
     AnnoEndClip s;
     s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.blend = raw4;
     return s;
 }
 
 AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
 {
-    AnnoEndClipRef _243 = { ref.offset + 4u };
+    AnnoEndClipRef _252 = { ref.offset + 4u };
     Alloc param = a;
-    AnnoEndClipRef param_1 = _243;
+    AnnoEndClipRef param_1 = _252;
     return AnnoEndClip_read(param, param_1);
 }
 
@@ -211,17 +216,17 @@ void comp_main()
 {
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef _312 = { _305.Load(16) + (element_ix * 12u) };
-    PathRef path_ref = _312;
-    AnnotatedRef _321 = { _305.Load(32) + (element_ix * 40u) };
-    AnnotatedRef ref = _321;
+    PathRef _321 = { _314.Load(16) + (element_ix * 12u) };
+    PathRef path_ref = _321;
+    AnnotatedRef _330 = { _314.Load(32) + (element_ix * 40u) };
+    AnnotatedRef ref = _330;
     uint tag = 0u;
-    if (element_ix < _305.Load(0))
+    if (element_ix < _314.Load(0))
     {
-        Alloc _332;
-        _332.offset = _305.Load(32);
+        Alloc _341;
+        _341.offset = _314.Load(32);
         Alloc param;
-        param.offset = _332.offset;
+        param.offset = _341.offset;
         AnnotatedRef param_1 = ref;
         tag = Annotated_tag(param, param_1).tag;
     }
@@ -237,10 +242,10 @@ void comp_main()
         case 4u:
         case 5u:
         {
-            Alloc _350;
-            _350.offset = _305.Load(32);
+            Alloc _359;
+            _359.offset = _314.Load(32);
             Alloc param_2;
-            param_2.offset = _350.offset;
+            param_2.offset = _359.offset;
             AnnotatedRef param_3 = ref;
             AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
             x0 = int(floor(clip.bbox.x * 0.0625f));
@@ -250,10 +255,10 @@ void comp_main()
             break;
         }
     }
-    x0 = clamp(x0, 0, int(_305.Load(8)));
-    y0 = clamp(y0, 0, int(_305.Load(12)));
-    x1 = clamp(x1, 0, int(_305.Load(8)));
-    y1 = clamp(y1, 0, int(_305.Load(12)));
+    x0 = clamp(x0, 0, int(_314.Load(8)));
+    y0 = clamp(y0, 0, int(_314.Load(12)));
+    x1 = clamp(x1, 0, int(_314.Load(8)));
+    y1 = clamp(y1, 0, int(_314.Load(12)));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
@@ -276,46 +281,46 @@ void comp_main()
     if (th_ix == 255u)
     {
         uint param_4 = total_tile_count * 8u;
-        MallocResult _476 = malloc(param_4);
-        sh_tile_alloc = _476;
+        MallocResult _485 = malloc(param_4);
+        sh_tile_alloc = _485;
     }
     GroupMemoryBarrierWithGroupSync();
     MallocResult alloc_start = sh_tile_alloc;
-    bool _487;
+    bool _496;
     if (!alloc_start.failed)
     {
-        _487 = _92.Load(4) != 0u;
+        _496 = _92.Load(4) != 0u;
     }
     else
     {
-        _487 = alloc_start.failed;
+        _496 = alloc_start.failed;
     }
-    if (_487)
+    if (_496)
     {
         return;
     }
-    if (element_ix < _305.Load(0))
+    if (element_ix < _314.Load(0))
     {
-        uint _500;
+        uint _509;
         if (th_ix > 0u)
         {
-            _500 = sh_tile_count[th_ix - 1u];
+            _509 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _500 = 0u;
+            _509 = 0u;
         }
-        uint tile_subix = _500;
+        uint tile_subix = _509;
         Alloc param_5 = alloc_start.alloc;
         uint param_6 = 8u * tile_subix;
         uint param_7 = 8u * tile_count;
         Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
-        TileRef _522 = { tiles_alloc.offset };
-        path.tiles = _522;
-        Alloc _527;
-        _527.offset = _305.Load(16);
+        TileRef _531 = { tiles_alloc.offset };
+        path.tiles = _531;
+        Alloc _536;
+        _536.offset = _314.Load(16);
         Alloc param_8;
-        param_8.offset = _527.offset;
+        param_8.offset = _536.offset;
         PathRef param_9 = path_ref;
         Path param_10 = path;
         Path_write(param_8, param_9, param_10);
diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl
index bb10cf0..c03e830 100644
--- a/piet-gpu/shader/gen/tile_alloc.msl
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@@ -26,6 +26,7 @@ struct AnnoEndClipRef
 struct AnnoEndClip
 {
     float4 bbox;
+    uint blend;
 };
 
 struct AnnotatedRef
@@ -145,8 +146,12 @@ AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef&
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
     uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_92, v_92BufferSize);
     AnnoEndClip s;
     s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.blend = raw4;
     return s;
 }
 
@@ -221,20 +226,20 @@ void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const P
     write_mem(param_6, param_7, param_8, v_92, v_92BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _305 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _314 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
     threadgroup uint sh_tile_count[256];
     threadgroup MallocResult sh_tile_alloc;
     constant uint& v_92BufferSize = spvBufferSizeConstants[0];
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef{ _305.conf.tile_alloc.offset + (element_ix * 12u) };
-    AnnotatedRef ref = AnnotatedRef{ _305.conf.anno_alloc.offset + (element_ix * 40u) };
+    PathRef path_ref = PathRef{ _314.conf.tile_alloc.offset + (element_ix * 12u) };
+    AnnotatedRef ref = AnnotatedRef{ _314.conf.anno_alloc.offset + (element_ix * 40u) };
     uint tag = 0u;
-    if (element_ix < _305.conf.n_elements)
+    if (element_ix < _314.conf.n_elements)
     {
         Alloc param;
-        param.offset = _305.conf.anno_alloc.offset;
+        param.offset = _314.conf.anno_alloc.offset;
         AnnotatedRef param_1 = ref;
         tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag;
     }
@@ -251,7 +256,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         case 5u:
         {
             Alloc param_2;
-            param_2.offset = _305.conf.anno_alloc.offset;
+            param_2.offset = _314.conf.anno_alloc.offset;
             AnnotatedRef param_3 = ref;
             AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize);
             x0 = int(floor(clip.bbox.x * 0.0625));
@@ -261,10 +266,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             break;
         }
     }
-    x0 = clamp(x0, 0, int(_305.conf.width_in_tiles));
-    y0 = clamp(y0, 0, int(_305.conf.height_in_tiles));
-    x1 = clamp(x1, 0, int(_305.conf.width_in_tiles));
-    y1 = clamp(y1, 0, int(_305.conf.height_in_tiles));
+    x0 = clamp(x0, 0, int(_314.conf.width_in_tiles));
+    y0 = clamp(y0, 0, int(_314.conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(_314.conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(_314.conf.height_in_tiles));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
@@ -287,43 +292,43 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     if (th_ix == 255u)
     {
         uint param_4 = total_tile_count * 8u;
-        MallocResult _476 = malloc(param_4, v_92, v_92BufferSize);
-        sh_tile_alloc = _476;
+        MallocResult _485 = malloc(param_4, v_92, v_92BufferSize);
+        sh_tile_alloc = _485;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     MallocResult alloc_start = sh_tile_alloc;
-    bool _487;
+    bool _496;
     if (!alloc_start.failed)
     {
-        _487 = v_92.mem_error != 0u;
+        _496 = v_92.mem_error != 0u;
     }
     else
     {
-        _487 = alloc_start.failed;
+        _496 = alloc_start.failed;
     }
-    if (_487)
+    if (_496)
     {
         return;
     }
-    if (element_ix < _305.conf.n_elements)
+    if (element_ix < _314.conf.n_elements)
     {
-        uint _500;
+        uint _509;
         if (th_ix > 0u)
         {
-            _500 = sh_tile_count[th_ix - 1u];
+            _509 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _500 = 0u;
+            _509 = 0u;
         }
-        uint tile_subix = _500;
+        uint tile_subix = _509;
         Alloc param_5 = alloc_start.alloc;
         uint param_6 = 8u * tile_subix;
         uint param_7 = 8u * tile_count;
         Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
         path.tiles = TileRef{ tiles_alloc.offset };
         Alloc param_8;
-        param_8.offset = _305.conf.tile_alloc.offset;
+        param_8.offset = _314.conf.tile_alloc.offset;
         PathRef param_9 = path_ref;
         Path param_10 = path;
         Path_write(param_8, param_9, param_10, v_92, v_92BufferSize);
diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv
index 12277f1..cf2f01c 100644
Binary files a/piet-gpu/shader/gen/tile_alloc.spv and b/piet-gpu/shader/gen/tile_alloc.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index dd4a855..a97715a 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -35,6 +35,7 @@ layout(rgba8, set = 0, binding = 4) uniform restrict readonly image2D gradients;
 
 #include "ptcl.h"
 #include "tile.h"
+#include "blend.h"
 
 #define MAX_BLEND_STACK 128
 mediump vec3 tosRGB(mediump vec3 rgb) {
@@ -216,14 +217,20 @@ void main() {
             cmd_ref.offset += 4;
             break;
         case Cmd_EndClip:
+            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
+            uint blend_mode = uint(end_clip.blend >> 8);
+            uint comp_mode = uint(end_clip.blend & 0xFF);
             clip_depth--;
             for (uint k = 0; k < CHUNK; k++) {
                 uint d = min(clip_depth, MAX_BLEND_STACK - 1);
                 mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
                 mediump vec4 fg = rgba[k] * area[k];
-                rgba[k] = bg * (1.0 - fg.a) + fg;
+                vec3 blend = mix_blend(bg.rgb, fg.rgb, blend_mode);
+                // Apply the blend color only where the foreground and background overlap.
+                fg.rgb = mix(fg.rgb, blend, float((fg.a * bg.a) > 0.0));
+                rgba[k] = mix_compose(bg.rgb, fg.rgb, bg.a, fg.a, comp_mode);
             }
-            cmd_ref.offset += 4;
+            cmd_ref.offset += 4 + CmdEndClip_size;
             break;
         case Cmd_Jump:
             cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 936c431..9b9b341 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -26,6 +26,10 @@ struct CmdAlphaRef {
     uint offset;
 };
 
+struct CmdEndClipRef { // typed reference to a CmdEndClip record
+    uint offset; // the read/write helpers turn this into a word index with `>> 2`
+};
+
 struct CmdJumpRef {
     uint offset;
 };
@@ -100,6 +104,16 @@ CmdAlphaRef CmdAlpha_index(CmdAlphaRef ref, uint index) {
     return CmdAlphaRef(ref.offset + index * CmdAlpha_size);
 }
 
+struct CmdEndClip { // payload stored after the Cmd_EndClip tag word
+    uint blend; // packed word; kernel4 reads the blend mode as `>> 8` and the composition mode as `& 0xFF`
+};
+
+#define CmdEndClip_size 4 // the record is a single uint
+
+CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) { // reference to the index-th record after `ref`
+    return CmdEndClipRef(ref.offset + index * CmdEndClip_size); // step by whole records
+}
+
 struct CmdJump {
     uint new_ref;
 };
@@ -228,6 +242,19 @@ void CmdAlpha_write(Alloc a, CmdAlphaRef ref, CmdAlpha s) {
     write_mem(a, ix + 0, floatBitsToUint(s.alpha));
 }
 
+// Loads the CmdEndClip record that `ref` points to inside allocation `a`.
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) {
+    uint word_ix = ref.offset >> 2;
+    CmdEndClip result;
+    result.blend = read_mem(a, word_ix);
+    return result;
+}
+
+// Stores `s` into the record that `ref` points to inside allocation `a`.
+void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) {
+    write_mem(a, ref.offset >> 2, s.blend);
+}
+
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = read_mem(a, ix + 0);
@@ -270,6 +297,10 @@ CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
     return CmdImage_read(a, CmdImageRef(ref.offset + 4));
 }
 
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) { // reads the payload of a Cmd_EndClip command
+    return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4)); // +4 steps past the tag word at ref.offset
+}
+
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
     return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
 }
@@ -316,8 +347,9 @@ void Cmd_BeginClip_write(Alloc a, CmdRef ref) {
     write_mem(a, ref.offset >> 2, Cmd_BeginClip);
 }
 
-void Cmd_EndClip_write(Alloc a, CmdRef ref) {
+void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) { // writes the tag, then the payload `s`
     write_mem(a, ref.offset >> 2, Cmd_EndClip);
+    CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s); // payload goes 4 bytes after the tag word
 }
 
 void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h
index 254d4aa..3e74b69 100644
--- a/piet-gpu/shader/scene.h
+++ b/piet-gpu/shader/scene.h
@@ -138,9 +138,10 @@ TransformRef Transform_index(TransformRef ref, uint index) {
 
 struct Clip {
     vec4 bbox;
+    uint blend; // Clip_read loads this from word 4 of the record, right after the four bbox words
 };
 
-#define Clip_size 16
+#define Clip_size 20
 
 ClipRef Clip_index(ClipRef ref, uint index) {
     return ClipRef(ref.offset + index * Clip_size);
@@ -286,6 +287,7 @@ Clip Clip_read(ClipRef ref) {
     uint raw3 = scene[ix + 3];
     Clip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.blend = scene[ix + 4];
     return s;
 }
 
diff --git a/piet-gpu/src/blend.rs b/piet-gpu/src/blend.rs
new file mode 100644
index 0000000..6f1e791
--- /dev/null
+++ b/piet-gpu/src/blend.rs
@@ -0,0 +1,99 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+#[repr(C)] // NOTE(review): names look like the W3C compositing blend modes; confirm against blend.h
+pub enum BlendMode { // `Blend::pack` shifts this discriminant left by 8 bits
+    Normal = 0, // the mode chosen by `Blend::default()`
+    Multiply = 1,
+    Screen = 2,
+    Overlay = 3,
+    Darken = 4,
+    Lighten = 5,
+    ColorDodge = 6,
+    ColorBurn = 7,
+    HardLight = 8,
+    SoftLight = 9,
+    Difference = 10,
+    Exclusion = 11,
+    Hue = 12,
+    Saturation = 13,
+    Color = 14,
+    Luminosity = 15,
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+#[repr(C)] // NOTE(review): names look like Porter-Duff style operators; confirm against blend.h
+pub enum CompositionMode { // `Blend::pack` ORs this discriminant in unshifted (below bit 8)
+    Clear = 0,
+    Copy = 1,
+    Dest = 2,
+    SrcOver = 3, // used by `Blend::default()` and by `From<BlendMode>`
+    DestOver = 4,
+    SrcIn = 5,
+    DestIn = 6,
+    SrcOut = 7,
+    DestOut = 8,
+    SrcAtop = 9,
+    DestAtop = 10,
+    Xor = 11,
+    Plus = 12,
+    PlusDarker = 13,
+    PlusLighter = 14,
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub struct Blend { // a blend mode plus a composition mode; `pack` folds both into one u32
+    pub mode: BlendMode, // placed from bit 8 upward by `pack`
+    pub composition_mode: CompositionMode, // placed below bit 8 by `pack`
+}
+
+impl Blend {
+    pub fn new(mode: BlendMode, composition_mode: CompositionMode) -> Self {
+        Blend { mode, composition_mode } // simply pairs the two modes
+    }
+    /// Packs both modes into one word: `mode` from bit 8 upward, `composition_mode` below it.
+    pub(crate) fn pack(&self) -> u32 {
+        ((self.mode as u32) << 8) | (self.composition_mode as u32)
+    }
+}
+
+impl Default for Blend {
+    /// The default pairs `BlendMode::Normal` with `CompositionMode::SrcOver`.
+    fn default() -> Self {
+        let mode = BlendMode::Normal;
+        let composition_mode = CompositionMode::SrcOver;
+        Self { mode, composition_mode }
+    }
+}
+
+impl From<BlendMode> for Blend {
+    /// Builds a blend from a blend mode alone; the composition mode is
+    /// fixed at `CompositionMode::SrcOver`.
+    fn from(mode: BlendMode) -> Self {
+        let composition_mode = CompositionMode::SrcOver;
+        Blend { mode, composition_mode }
+    }
+}
+
+impl From<CompositionMode> for Blend {
+    /// Builds a blend from a composition mode alone; the blend mode is
+    /// fixed at `BlendMode::Normal`.
+    fn from(mode: CompositionMode) -> Self {
+        let composition_mode = mode;
+        Blend { mode: BlendMode::Normal, composition_mode }
+    }
+}
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index 767f4ba..c24615e 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -16,6 +16,7 @@
 
 //! Low-level scene encoding.
 
+use crate::Blend;
 use bytemuck::{Pod, Zeroable};
 use piet_gpu_hal::BufWrite;
 
@@ -87,7 +88,8 @@ pub struct FillLinGradient {
 pub struct Clip {
     tag: u32,
     bbox: [f32; 4],
-    padding: [u32; 4],
+    blend: u32, // filled with `Blend::pack()` by `begin_clip` / `end_clip`
+    padding: [u32; 3], // one word shorter than before, so the total size is unchanged
 }
 
 impl Encoder {
@@ -151,10 +153,11 @@ impl Encoder {
     }
 
     /// Start a clip and return a save point to be filled in later.
-    pub fn begin_clip(&mut self) -> usize {
+    pub fn begin_clip(&mut self, blend: Option<Blend>) -> usize {
         let saved = self.drawobj_stream.len();
         let element = Clip {
             tag: ELEMENT_BEGINCLIP,
+            blend: blend.unwrap_or(Blend::default()).pack(),
             ..Default::default()
         };
         self.drawobj_stream.extend(bytemuck::bytes_of(&element));
@@ -162,10 +165,11 @@ impl Encoder {
         saved
     }
 
-    pub fn end_clip(&mut self, bbox: [f32; 4], save_point: usize) {
+    pub fn end_clip(&mut self, bbox: [f32; 4], blend: Option<Blend>, save_point: usize) {
         let element = Clip {
             tag: ELEMENT_ENDCLIP,
             bbox,
+            blend: blend.unwrap_or(Blend::default()).pack(),
             ..Default::default()
         };
         self.drawobj_stream[save_point + 4..save_point + 20]
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index b8b7532..bd26d45 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -1,3 +1,4 @@
+mod blend;
 mod encoder;
 pub mod glyph_render;
 mod gradient;
@@ -9,6 +10,7 @@ mod text;
 
 use std::convert::TryInto;
 
+pub use blend::{Blend, BlendMode, CompositionMode};
 pub use render_ctx::PietGpuRenderContext;
 
 use piet::kurbo::Vec2;
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index ef0a3a7..1fe1ce9 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -16,6 +16,7 @@ use piet_gpu_types::scene::Element;
 use crate::gradient::{LinearGradient, RampCache};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
+use crate::Blend;
 
 pub struct PietGpuImage;
 
@@ -66,6 +67,7 @@ struct ClipElement {
     /// Byte offset of BeginClip element in element vec, for bbox fixup.
     save_point: usize,
     bbox: Option<Rect>,
+    blend: Option<Blend>,
 }
 
 const TOLERANCE: f64 = 0.25;
@@ -230,13 +232,14 @@ impl RenderContext for PietGpuRenderContext {
         self.encode_linewidth(-1.0);
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
-        let save_point = self.new_encoder.begin_clip();
+        let save_point = self.new_encoder.begin_clip(None);
         if self.clip_stack.len() >= MAX_BLEND_STACK {
             panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
         }
         self.clip_stack.push(ClipElement {
             bbox: None,
             save_point,
+            blend: None,
         });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
@@ -333,6 +336,25 @@ impl RenderContext for PietGpuRenderContext {
 }
 
 impl PietGpuRenderContext {
+    /// Encodes `shape` as a clip path and pushes a clip-stack entry carrying `blend`.
+    /// Panics when the clip stack already holds `MAX_BLEND_STACK` entries.
+    pub fn blend(&mut self, shape: impl Shape, blend: Blend) {
+        self.encode_linewidth(-1.0);
+        self.encode_path(shape.path_elements(TOLERANCE), true);
+        let blend = Some(blend);
+        let save_point = self.new_encoder.begin_clip(blend);
+        let depth = self.clip_stack.len();
+        if depth >= MAX_BLEND_STACK {
+            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
+        }
+        let entry = ClipElement { bbox: None, save_point, blend };
+        self.clip_stack.push(entry);
+        self.accumulate_bbox(|| shape.bounding_box());
+        if let Some(top) = self.state_stack.last_mut() {
+            top.n_clip += 1;
+        }
+    }
+
     fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
         if is_fill {
             self.encode_path_inner(
@@ -386,7 +408,7 @@ impl PietGpuRenderContext {
         let tos = self.clip_stack.pop().unwrap();
         let bbox = tos.bbox.unwrap_or_default();
         let bbox_f32_4 = rect_to_f32_4(bbox);
-        self.new_encoder.end_clip(bbox_f32_4, tos.save_point);
+        self.new_encoder.end_clip(bbox_f32_4, tos.blend, tos.save_point);
         if let Some(bbox) = tos.bbox {
             self.union_bbox(bbox);
         }
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index 47ace66..118b727 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,7 +2,8 @@
 
 use rand::{Rng, RngCore};
 
-use piet::kurbo::{BezPath, Circle, Line, Point, Rect, Shape};
+use crate::{PietGpuRenderContext, Blend, BlendMode, CompositionMode};
+use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
     Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
 };
@@ -11,6 +12,18 @@ use crate::{PicoSvg, RenderContext, Vec2};
 
 const N_CIRCLES: usize = 0;
 
+/// Fills a blue backdrop square, then a red square rotated by `0.01 * i` inside a `blend` clip.
+pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend) {
+    let backdrop = Rect::new(400., 400., 800., 800.);
+    rc.fill(backdrop, &Color::rgb8(0, 0, 200));
+    rc.save().unwrap();
+    rc.blend(Rect::new(0., 0., 1000., 1000.), blend);
+    let angle = 0.01 * i as f64;
+    rc.transform(Affine::translate(Vec2::new(600., 600.)) * Affine::rotate(angle));
+    rc.fill(Rect::new(0., 0., 400., 400.), &Color::rgba8(255, 0, 0, 255));
+    rc.restore().unwrap();
+}
+
 pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
     let xml_str = std::fs::read_to_string(filename).unwrap();
     let start = std::time::Instant::now();