Initial implementation of blend modes

* Add blend and composition mode enums to API
* Mirror these in the shaders
* Add new public blend function to PietGpuRenderContext that mirrors clip
* Plumb the modes through the pipeline from scene to kernel4
2025-01-10 12:41:30 +11:00 · 2022-02-28 12:38:14 -05:00 · 2022-02-28 12:38:14 -05:00 · d3b08e4c52
parent d81e5cb4ee
commit d3b08e4c52
25 changed files with 2161 additions and 554 deletions
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@ -57,16 +57,27 @@ fn main() -> Result<(), Error> {
        let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();

        let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
+        let mut mode = 0usize;

        event_loop.run(move |event, _, control_flow| {
            *control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event

            match event {
                Event::WindowEvent { event, window_id } if window_id == window.id() => {
+                    use winit::event::{ElementState, VirtualKeyCode};
                    match event {
                        WindowEvent::CloseRequested => {
                            *control_flow = ControlFlow::Exit;
                        }
+                        WindowEvent::KeyboardInput { input, .. } => {
+                            if input.state == ElementState::Pressed {
+                                match input.virtual_keycode {
+                                    Some(VirtualKeyCode::Left) => mode = mode.wrapping_sub(1),
+                                    Some(VirtualKeyCode::Right) => mode = mode.wrapping_add(1),
+                                    _ => {}
+                                }
+                            }
+                        }
                        _ => (),
                    }
                }
@ -105,7 +116,41 @@ fn main() -> Result<(), Error> {
                        }
                        test_scenes::render_svg(&mut ctx, input, scale);
                    } else {
-                        test_scenes::render_anim_frame(&mut ctx, current_frame);
+                        use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
+                        let blends = [
+                            Blend::new(Normal, SrcOver),
+                            Blend::new(Multiply, SrcOver),
+                            Blend::new(Screen, SrcOver),
+                            Blend::new(Overlay, SrcOver),
+                            Blend::new(Darken, SrcOver),
+                            Blend::new(Lighten, SrcOver),
+                            Blend::new(ColorDodge, SrcOver),
+                            Blend::new(ColorBurn, SrcOver),
+                            Blend::new(HardLight, SrcOver),
+                            Blend::new(SoftLight, SrcOver),
+                            Blend::new(Difference, SrcOver),
+                            Blend::new(Exclusion, SrcOver),
+                            Blend::new(Hue, SrcOver),
+                            Blend::new(Saturation, SrcOver),
+                            Blend::new(Color, SrcOver),
+                            Blend::new(Luminosity, SrcOver),
+                            Blend::new(Normal, Clear),
+                            Blend::new(Normal, Copy),
+                            Blend::new(Normal, Dest),
+                            Blend::new(Normal, SrcOver),
+                            Blend::new(Normal, DestOver),
+                            Blend::new(Normal, SrcIn),
+                            Blend::new(Normal, DestIn),
+                            Blend::new(Normal, SrcOut),
+                            Blend::new(Normal, DestOut),
+                            Blend::new(Normal, SrcAtop),
+                            Blend::new(Normal, DestAtop),
+                            Blend::new(Normal, Xor),
+                            Blend::new(Normal, Plus),
+                        ];
+                        let blend = blends[mode % blends.len()];
+                        test_scenes::render_blend_test(&mut ctx, current_frame, blend);
+                        info_string = format!("{:?}", blend);
                    }
                    render_info_string(&mut ctx, &info_string);
                    if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@ -69,9 +69,10 @@ AnnoLinGradientRef AnnoLinGradient_index(AnnoLinGradientRef ref, uint index) {
 struct AnnoBeginClip {
    vec4 bbox;
    float linewidth;
+    uint blend;
 };

-#define AnnoBeginClip_size 20
+#define AnnoBeginClip_size 24

 AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {
    return AnnoBeginClipRef(ref.offset + index * AnnoBeginClip_size);
@ -79,9 +80,10 @@ AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {

 struct AnnoEndClip {
    vec4 bbox;
+    uint blend;
 };

-#define AnnoEndClip_size 16
+#define AnnoEndClip_size 20

 AnnoEndClipRef AnnoEndClip_index(AnnoEndClipRef ref, uint index) {
    return AnnoEndClipRef(ref.offset + index * AnnoEndClip_size);
@ -198,9 +200,11 @@ AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) {
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
    AnnoBeginClip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.linewidth = uintBitsToFloat(raw4);
+    s.blend = raw5;
    return s;
 }

@ -211,6 +215,7 @@ void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s) {
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
+    write_mem(a, ix + 5, s.blend);
 }

 AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
@ -219,8 +224,10 @@ AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
    uint raw1 = read_mem(a, ix + 1);
    uint raw2 = read_mem(a, ix + 2);
    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
    AnnoEndClip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.blend = raw4;
    return s;
 }

@ -230,6 +237,7 @@ void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s) {
    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
+    write_mem(a, ix + 4, s.blend);
 }

 AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) {
@ -281,8 +289,8 @@ void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginC
    AnnoBeginClip_write(a, AnnoBeginClipRef(ref.offset + 4), s);
 }

-void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s) {
-    write_mem(a, ref.offset >> 2, Annotated_EndClip);
+void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoEndClip s) {
+    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_EndClip);
    AnnoEndClip_write(a, AnnoEndClipRef(ref.offset + 4), s);
 }

--- a/piet-gpu/shader/blend.h
+++ b/piet-gpu/shader/blend.h
@ -0,0 +1,260 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Mode definitions and functions for blending and composition.
+
+// Blend mode identifiers accepted as the `mode` argument of mix_blend().
+// Blend_Normal has no dedicated case there; it is handled by the default
+// branch, which returns the source color unchanged.
+// NOTE(review): presumably these values must stay in sync with the blend mode
+// enum on the Rust side - confirm against the API definition.
+#define Blend_Normal 0
+#define Blend_Multiply 1
+#define Blend_Screen 2
+#define Blend_Overlay 3
+#define Blend_Darken 4
+#define Blend_Lighten 5
+#define Blend_ColorDodge 6
+#define Blend_ColorBurn 7
+#define Blend_HardLight 8
+#define Blend_SoftLight 9
+#define Blend_Difference 10
+#define Blend_Exclusion 11
+#define Blend_Hue 12
+#define Blend_Saturation 13
+#define Blend_Color 14
+#define Blend_Luminosity 15
+
+// Screen blend, evaluated per channel: cb + cs - cb * cs.
+// cb is the backdrop color, cs the source color.
+vec3 screen(vec3 cb, vec3 cs) {
+	vec3 product = cb * cs;
+	return cb + cs - product;
+}
+
+// Color-dodge blend for a single channel.
+// cb is the backdrop channel, cs the source channel.
+// The two early returns handle a zero backdrop and a source of exactly 1.0
+// before the division, so the quotient is never formed with a zero divisor.
+float color_dodge(float cb, float cs) {
+    if (cb == 0.0) {
+        return 0.0;
+    }
+    if (cs == 1.0) {
+        return 1.0;
+    }
+    float ratio = cb / (1.0 - cs);
+    return min(1.0, ratio);
+}
+
+// Color-burn blend for a single channel.
+// cb is the backdrop channel, cs the source channel.
+// The two early returns handle a backdrop of exactly 1.0 and a zero source
+// before the division, so the quotient is never formed with a zero divisor.
+float color_burn(float cb, float cs) {
+    if (cb == 1.0) {
+        return 1.0;
+    }
+    if (cs == 0.0) {
+        return 0.0;
+    }
+    float ratio = (1.0 - cb) / cs;
+    return 1.0 - min(1.0, ratio);
+}
+
+// Hard-light blend, evaluated per channel.
+// Channels with cs <= 0.5 take the multiply form (cb * 2 * cs); the remaining
+// channels take screen(cb, 2 * cs - 1). The selection weight is exactly 0.0 or
+// 1.0 per channel.
+vec3 hard_light(vec3 cb, vec3 cs) {
+	vec3 screened = screen(cb, 2.0 * cs - 1.0);
+	vec3 multiplied = cb * 2.0 * cs;
+	vec3 pick_multiplied = vec3(lessThanEqual(cs, vec3(0.5)));
+	return mix(screened, multiplied, pick_multiplied);
+}
+
+// Soft-light blend, evaluated per channel.
+// `d` is the auxiliary term: a cubic polynomial in cb where cb <= 0.25 and
+// sqrt(cb) elsewhere. Channels with cs <= 0.5 use the darkening form, the
+// others the lightening form built on `d`.
+vec3 soft_light(vec3 cb, vec3 cs) {
+	vec3 d_root = sqrt(cb);
+	vec3 d_poly = ((16.0 * cb - vec3(12.0)) * cb + vec3(4.0)) * cb;
+	vec3 pick_poly = vec3(lessThanEqual(cb, vec3(0.25)));
+	vec3 d = mix(d_root, d_poly, pick_poly);
+
+	vec3 lightened = cb + (2.0 * cs - vec3(1.0)) * (d - cb);
+	vec3 darkened = cb - (vec3(1.0) - 2.0 * cs) * cb * (vec3(1.0) - cb);
+	vec3 pick_darkened = vec3(lessThanEqual(cs, vec3(0.5)));
+	return mix(lightened, darkened, pick_darkened);
+}
+
+// Saturation of a color: largest channel minus smallest channel.
+float sat(vec3 c) {
+    float hi = max(c.r, max(c.g, c.b));
+    float lo = min(c.r, min(c.g, c.b));
+    return hi - lo;
+}
+
+// Luminosity of a color: weighted channel sum with weights (0.3, 0.59, 0.11).
+float lum(vec3 c) {
+    return dot(c, vec3(0.3, 0.59, 0.11));
+}
+
+// Pulls a color toward its own luminosity when a channel lies outside [0, 1].
+// The channel minimum and maximum are both sampled from the incoming color,
+// before either correction is applied; the second correction therefore tests
+// the original maximum, not the maximum after the first correction.
+vec3 clip_color(vec3 c) {
+    float luma = lum(c);
+    float lo = min(c.r, min(c.g, c.b));
+    float hi = max(c.r, max(c.g, c.b));
+    if (lo < 0.0) {
+        c = luma + (((c - luma) * luma) / (luma - lo));
+    }
+    if (hi > 1.0) {
+        c = luma + (((c - luma) * (1.0 - luma)) / (hi - luma));
+    }
+    return c;
+}
+
+// Shifts every channel of `c` by the same amount so that its luminosity
+// becomes `l`, then passes the result through clip_color().
+vec3 set_lum(vec3 c, float l) {
+    float shift = l - lum(c);
+    return clip_color(c + shift);
+}
+
+// Rescales three channels, supplied in ascending order, to saturation `s`.
+// On return cmin is 0.0. When cmax > cmin on entry, cmax becomes `s` and cmid
+// is placed proportionally between them; otherwise cmid and cmax are both 0.0.
+void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) {
+    bool has_spread = cmax > cmin;
+    // cmid must be updated first: it reads the incoming cmin and cmax.
+    cmid = has_spread ? (((cmid - cmin) * s) / (cmax - cmin)) : 0.0;
+    cmax = has_spread ? s : 0.0;
+    cmin = 0.0;
+}
+
+// Returns `c` rescaled to saturation `s`.
+// The three channels are ranked with pairwise comparisons and handed to
+// set_sat_inner() in (min, mid, max) order; that helper rewrites them in place.
+vec3 set_sat(vec3 c, float s) {
+    // All comparisons are taken up front, before any channel is modified.
+    bool r_le_g = c.r <= c.g;
+    bool g_le_b = c.g <= c.b;
+    bool r_le_b = c.r <= c.b;
+
+    if (r_le_g && g_le_b) {
+        set_sat_inner(c.r, c.g, c.b, s);
+    } else if (r_le_g && r_le_b) {
+        set_sat_inner(c.r, c.b, c.g, s);
+    } else if (r_le_g) {
+        set_sat_inner(c.b, c.r, c.g, s);
+    } else if (r_le_b) {
+        set_sat_inner(c.g, c.r, c.b, s);
+    } else if (g_le_b) {
+        set_sat_inner(c.g, c.b, c.r, s);
+    } else {
+        set_sat_inner(c.b, c.g, c.r, s);
+    }
+    return c;
+}
+
+// Applies the blend function selected by `mode` to a backdrop/source pair.
+//
+// cb:   backdrop color
+// cs:   source color
+// mode: one of the Blend_* identifiers
+//
+// Returns the blended color. Any mode without a dedicated case, which includes
+// Blend_Normal, yields the source color unchanged.
+vec3 mix_blend(vec3 cb, vec3 cs, uint mode) {
+	vec3 b = vec3(0.0);
+	switch (mode) {
+	case Blend_Multiply:
+		b = cb * cs;
+		break;
+	case Blend_Screen:
+		b = screen(cb, cs);
+		break;
+	case Blend_Overlay:
+		// Overlay is hard-light with backdrop and source swapped.
+		b = hard_light(cs, cb);
+		break;
+	case Blend_Darken:
+		b = min(cb, cs);
+		break;
+	case Blend_Lighten:
+		b = max(cb, cs);
+		break;
+	case Blend_ColorDodge:
+		b = vec3(color_dodge(cb.x, cs.x), color_dodge(cb.y, cs.y), color_dodge(cb.z, cs.z));
+		break;
+	case Blend_ColorBurn:
+		b = vec3(color_burn(cb.x, cs.x), color_burn(cb.y, cs.y), color_burn(cb.z, cs.z));
+		break;
+	case Blend_HardLight:
+		b = hard_light(cb, cs);
+		break;
+	case Blend_SoftLight:
+		b = soft_light(cb, cs);
+		break;
+	case Blend_Difference:
+		b = abs(cb - cs);
+		break;
+	case Blend_Exclusion:
+		// Float literal: avoids relying on implicit int -> float conversion,
+		// matching the 2.0 used by the other blend functions in this file.
+		b = cb + cs - 2.0 * cb * cs;
+		break;
+	case Blend_Hue:
+		b = set_lum(set_sat(cs, sat(cb)), lum(cb));
+		break;
+	case Blend_Saturation:
+		b = set_lum(set_sat(cb, sat(cs)), lum(cb));
+		break;
+	case Blend_Color:
+		b = set_lum(cs, lum(cb));
+		break;
+	case Blend_Luminosity:
+		b = set_lum(cb, lum(cs));
+		break;
+	default:
+		b = cs;
+		break;
+	}
+	return b;
+}
+
+// Composition mode identifiers accepted as the `mode` argument of
+// mix_compose(). Comp_Clear has no dedicated case there; it is handled by the
+// default branch, which leaves both coverage factors at 0.0.
+// NOTE(review): presumably these values must stay in sync with the composition
+// mode enum on the Rust side - confirm against the API definition.
+#define Comp_Clear 0
+#define Comp_Copy 1
+#define Comp_Dest 2
+#define Comp_SrcOver 3
+#define Comp_DestOver 4
+#define Comp_SrcIn 5
+#define Comp_DestIn 6
+#define Comp_SrcOut 7
+#define Comp_DestOut 8
+#define Comp_SrcAtop 9
+#define Comp_DestAtop 10
+#define Comp_Xor 11
+#define Comp_Plus 12
+#define Comp_PlusDarker 13
+#define Comp_PlusLighter 14
+
+// Composites a source over a backdrop using the operator selected by `mode`.
+//
+// cb, ab: backdrop color and alpha
+// cs, as: source color and alpha
+// mode:   one of the Comp_* identifiers
+//
+// For every operator except PlusDarker / PlusLighter the result is
+//   color = as * fa * cs + ab * fb * cb
+//   alpha = as * fa      + ab * fb
+// where fa and fb are the per-operator source and backdrop factors chosen in
+// the switch. Modes without a dedicated case (including Comp_Clear) keep
+// fa = fb = 0.0 and therefore return vec4(0.0).
+vec4 mix_compose(vec3 cb, vec3 cs, float ab, float as, uint mode) {
+	float fa = 0.0;
+	float fb = 0.0;
+	switch (mode) {
+	case Comp_Copy:
+		fa = 1.0;
+		fb = 0.0;
+		break;
+	case Comp_Dest:
+		fa = 0.0;
+		fb = 1.0;
+		break;
+	case Comp_SrcOver:
+		fa = 1.0;
+		fb = 1.0 - as;
+		break;
+	case Comp_DestOver:
+		fa = 1.0 - ab;
+		fb = 1.0;
+		break;
+	case Comp_SrcIn:
+		fa = ab;
+		fb = 0.0;
+		break;
+	case Comp_DestIn:
+		fa = 0.0;
+		fb = as;
+		break;
+	case Comp_SrcOut:
+		fa = 1.0 - ab;
+		fb = 0.0;
+		break;
+	case Comp_DestOut:
+		fa = 0.0;
+		fb = 1.0 - as;
+		break;
+	case Comp_SrcAtop:
+		fa = ab;
+		fb = 1.0 - as;
+		break;
+	case Comp_DestAtop:
+		fa = 1.0 - ab;
+		fb = as;
+		break;
+	case Comp_Xor:
+		fa = 1.0 - ab;
+		fb = 1.0 - as;
+		break;
+	case Comp_Plus:
+		fa = 1.0;
+		fb = 1.0;
+		break;
+	case Comp_PlusDarker:
+		// These two operators do not fit the fa/fb form and return directly.
+		return vec4(max(vec3(0.0), 1.0 - as * cs + 1.0 - ab * cb),
+			    max(0.0, 1.0 - as + 1.0 - ab));
+	case Comp_PlusLighter:
+		return vec4(min(vec3(1.0), as * cs + ab * cb),
+			    min(1.0, as + ab));
+	default:
+		break;
+	}
+	// Color and alpha are combined separately. Scaling vec4(cs, as) by
+	// as * fa would weight the alpha channel by alpha a second time and
+	// produce as * as * fa + ab * ab * fb instead of as * fa + ab * fb.
+	float as_fa = as * fa;
+	float ab_fb = ab * fb;
+	vec3 co = as_fa * cs + ab_fb * cb;
+	return vec4(co, as_fa + ab_fb);
+}
+
+// Packed (blend, composition) word for the default pair (Normal, SrcOver):
+// the blend mode is shifted left by 8 bits and OR-ed with the composition mode.
+#define BlendComp_default (Blend_Normal << 8 | Comp_SrcOver)
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -53,7 +53,7 @@ build gen/coarse.hlsl: hlsl gen/coarse.spv
 build gen/coarse.dxil: dxil gen/coarse.hlsl
 build gen/coarse.msl: msl gen/coarse.spv

-build gen/kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
+build gen/kernel4.spv: glsl kernel4.comp | blend.h ptcl.h setup.h
 build gen/kernel4.hlsl: hlsl gen/kernel4.spv
 build gen/kernel4.dxil: dxil gen/kernel4.hlsl
 build gen/kernel4.msl: msl gen/kernel4.spv
@ -114,7 +114,7 @@ build gen/draw_root.hlsl: hlsl gen/draw_root.spv
 build gen/draw_root.dxil: dxil gen/draw_root.hlsl
 build gen/draw_root.msl: msl gen/draw_root.spv

-build gen/draw_leaf.spv: glsl draw_leaf.comp | scene.h drawtag.h annotated.h setup.h mem.h
+build gen/draw_leaf.spv: glsl draw_leaf.comp | blend.h scene.h drawtag.h annotated.h setup.h mem.h
 build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
 build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
 build gen/draw_leaf.msl: msl gen/draw_leaf.spv
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@ -273,7 +273,8 @@ void main() {
                }
            }
            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
-            uint tag = Annotated_tag(conf.anno_alloc, ref).tag;
+            AnnotatedTag anno_tag = Annotated_tag(conf.anno_alloc, ref);
+            uint tag = anno_tag.tag;
            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + seq_ix % width;
@ -287,7 +288,10 @@ void main() {
                // For draws, include the tile if it is solid.
                // For clips, include the tile if it is empty - this way, logic
                // below will suppress the drawing of inner elements.
-                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip;
+                // For blends, include the tile if
+                // (blend_mode, composition_mode) != (Normal, SrcOver)
+                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
+                    || (is_clip && (anno_tag.flags & 0x2) != 0);
            }
            if (include_tile) {
                uint el_slice = el_ix / 32;
@ -387,13 +391,14 @@ void main() {
                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
                                     TileRef(sh_tile_base[element_ref_ix] +
                                             (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    AnnoEndClip end_clip = Annotated_EndClip_read(conf.anno_alloc, ref);
                    clip_depth--;
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    write_fill(cmd_alloc, cmd_ref, MODE_NONZERO, tile, 0.0);
-                    Cmd_EndClip_write(cmd_alloc, cmd_ref);
-                    cmd_ref.offset += 4;
+                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(end_clip.blend));
+                    cmd_ref.offset += 4 + CmdEndClip_size;
                    break;
                }
            } else {
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@ -28,6 +28,7 @@ layout(binding = 2) readonly buffer SceneBuf {
 #include "tile.h"
 #include "drawtag.h"
 #include "annotated.h"
+#include "blend.h"

 #define Monoid DrawMonoid

@ -149,17 +150,23 @@ void main() {
                Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
                break;
            case Element_BeginClip:
+                Clip begin_clip = Element_BeginClip_read(this_ref);
                AnnoBeginClip anno_begin_clip;
                anno_begin_clip.bbox = bbox;
                anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
-                Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip);
+                anno_begin_clip.blend = begin_clip.blend;
+                uint flags = uint(begin_clip.blend != BlendComp_default) << 1;
+                Annotated_BeginClip_write(conf.anno_alloc, out_ref, flags, anno_begin_clip);
                break;
            }
        } else if (tag_word == Element_EndClip) {
+            Clip end_clip = Element_BeginClip_read(this_ref);
            AnnoEndClip anno_end_clip;
            // The actual bbox will be reconstructed from clip stream output.
            anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9);
-            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
+            anno_end_clip.blend = end_clip.blend;
+            uint flags = uint(end_clip.blend != BlendComp_default) << 1;
+            Annotated_EndClip_write(conf.anno_alloc, out_ref, flags, anno_end_clip);
        }
        // Generate clip stream.
        if (tag_word == Element_BeginClip || tag_word == Element_EndClip) {
--- a/piet-gpu/shader/gen/binning.msl
+++ b/piet-gpu/shader/gen/binning.msl
@ -220,7 +220,7 @@ void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref,
 kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_94 [[buffer(0)]], const device ConfigBuf& v_202 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
    threadgroup uint bitmaps[8][256];
-    threadgroup short sh_alloc_failed;
+    threadgroup bool sh_alloc_failed;
    threadgroup uint count[8][256];
    threadgroup Alloc sh_chunk_alloc[256];
    constant uint& v_94BufferSize = spvBufferSizeConstants[0];
@ -232,7 +232,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
    }
    if (gl_LocalInvocationID.x == 0u)
    {
-        sh_alloc_failed = short(false);
+        sh_alloc_failed = false;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
@ -331,7 +331,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed)
        {
-            sh_alloc_failed = short(true);
+            sh_alloc_failed = true;
        }
    }
    uint out_ix = (v_202.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
@ -347,13 +347,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
    write_mem(param_16, param_17, param_18, v_94, v_94BufferSize);
    threadgroup_barrier(mem_flags::mem_threadgroup);
    bool _687;
-    if (!bool(sh_alloc_failed))
+    if (!sh_alloc_failed)
    {
        _687 = v_94.mem_error != 0u;
    }
    else
    {
-        _687 = bool(sh_alloc_failed);
+        _687 = sh_alloc_failed;
    }
    if (_687)
    {
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@ -87,6 +87,17 @@ struct FillImage
    int2 offset;
 };

+struct ClipRef
+{
+    uint offset;
+};
+
+struct Clip
+{
+    float4 bbox;
+    uint blend;
+};
+
 struct ElementTag
 {
    uint tag;
@ -148,6 +159,7 @@ struct AnnoBeginClip
 {
    float4 bbox;
    float linewidth;
+    uint blend;
 };

 struct AnnoEndClipRef
@ -158,6 +170,7 @@ struct AnnoEndClipRef
 struct AnnoEndClip
 {
    float4 bbox;
+    uint blend;
 };

 struct AnnotatedRef
@ -228,9 +241,9 @@ struct ConfigBuf
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);

 static inline __attribute__((always_inline))
-ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_211)
+ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
-    uint tag_and_flags = v_211.scene[ref.offset >> uint(2)];
+    uint tag_and_flags = v_223.scene[ref.offset >> uint(2)];
    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }

@ -279,20 +292,20 @@ DrawMonoid tag_monoid_identity()
 }

 static inline __attribute__((always_inline))
-FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_211)
+FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_223)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
+    uint raw0 = v_223.scene[ix + 0u];
    FillColor s;
    s.rgba_color = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
    FillColorRef param = FillColorRef{ ref.offset + 4u };
-    return FillColor_read(param, v_211);
+    return FillColor_read(param, v_223);
 }

 static inline __attribute__((always_inline))
@ -302,7 +315,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }

 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_187)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_199)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -310,61 +323,61 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
    {
        return;
    }
-    v_187.memory[offset] = val;
+    v_199.memory[offset] = val;
 }

 static inline __attribute__((always_inline))
-void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_187)
+void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_199)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
    Alloc param_15 = a;
    uint param_16 = ix + 5u;
    uint param_17 = s.rgba_color;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
 }

 static inline __attribute__((always_inline))
-void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_187)
+void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_199)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 1u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u };
    AnnoColor param_5 = s;
-    AnnoColor_write(param_3, param_4, param_5, v_187);
+    AnnoColor_write(param_3, param_4, param_5, v_199);
 }

 static inline __attribute__((always_inline))
-FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_211)
+FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_223)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
-    uint raw1 = v_211.scene[ix + 1u];
-    uint raw2 = v_211.scene[ix + 2u];
-    uint raw3 = v_211.scene[ix + 3u];
-    uint raw4 = v_211.scene[ix + 4u];
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
+    uint raw2 = v_223.scene[ix + 2u];
+    uint raw3 = v_223.scene[ix + 3u];
+    uint raw4 = v_223.scene[ix + 4u];
    FillLinGradient s;
    s.index = raw0;
    s.p0 = float2(as_type<float>(raw1), as_type<float>(raw2));
@ -373,73 +386,73 @@ FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const
 }

 static inline __attribute__((always_inline))
-FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
    FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u };
-    return FillLinGradient_read(param, v_211);
+    return FillLinGradient_read(param, v_223);
 }

 static inline __attribute__((always_inline))
-void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_187)
+void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_199)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
    Alloc param_15 = a;
    uint param_16 = ix + 5u;
    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
    Alloc param_18 = a;
    uint param_19 = ix + 6u;
    uint param_20 = as_type<uint>(s.line_x);
-    write_mem(param_18, param_19, param_20, v_187);
+    write_mem(param_18, param_19, param_20, v_199);
    Alloc param_21 = a;
    uint param_22 = ix + 7u;
    uint param_23 = as_type<uint>(s.line_y);
-    write_mem(param_21, param_22, param_23, v_187);
+    write_mem(param_21, param_22, param_23, v_199);
    Alloc param_24 = a;
    uint param_25 = ix + 8u;
    uint param_26 = as_type<uint>(s.line_c);
-    write_mem(param_24, param_25, param_26, v_187);
+    write_mem(param_24, param_25, param_26, v_199);
 }

 static inline __attribute__((always_inline))
-void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_187)
+void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_199)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 2u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u };
    AnnoLinGradient param_5 = s;
-    AnnoLinGradient_write(param_3, param_4, param_5, v_187);
+    AnnoLinGradient_write(param_3, param_4, param_5, v_199);
 }

 static inline __attribute__((always_inline))
-FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_211)
+FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_223)
 {
    uint ix = ref.offset >> uint(2);
-    uint raw0 = v_211.scene[ix + 0u];
-    uint raw1 = v_211.scene[ix + 1u];
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
    FillImage s;
    s.index = raw0;
    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@ -447,140 +460,169 @@ FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf&
 }

 static inline __attribute__((always_inline))  // Returns the FillImage that FillImage_read decodes from the scene buffer for this element.
-FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_211)
+FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_223)
 {
    FillImageRef param = FillImageRef{ ref.offset + 4u };  // payload reference is 4 past the element's own offset
-    return FillImage_read(param, v_211);
+    return FillImage_read(param, v_223);
 }

 static inline __attribute__((always_inline))  // Stores an AnnoImage as seven consecutive words through write_mem, starting at index ref.offset / 4.
-void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_187)
+void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_199)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);  // float bit pattern is stored unchanged, not converted
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
    Alloc param_15 = a;
    uint param_16 = ix + 5u;
    uint param_17 = s.index;
-    write_mem(param_15, param_16, param_17, v_187);
+    write_mem(param_15, param_16, param_17, v_199);
    Alloc param_18 = a;
    uint param_19 = ix + 6u;
    uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));  // offset.x in the low 16 bits, offset.y in the high 16 bits
-    write_mem(param_18, param_19, param_20, v_187);
+    write_mem(param_18, param_19, param_20, v_199);
 }

 static inline __attribute__((always_inline))  // Emits a header word (flags in the upper 16 bits, constant 3 below) followed by the AnnoImage payload.
-void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_187)
+void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_199)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 3u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u };  // payload begins 4 past the header
    AnnoImage param_5 = s;
-    AnnoImage_write(param_3, param_4, param_5, v_187);
+    AnnoImage_write(param_3, param_4, param_5, v_199);
 }

 static inline __attribute__((always_inline))
-void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_187)
+Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_223)  // Decodes a Clip from five consecutive scene words starting at index ref.offset / 4.
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_223.scene[ix + 0u];
+    uint raw1 = v_223.scene[ix + 1u];
+    uint raw2 = v_223.scene[ix + 2u];
+    uint raw3 = v_223.scene[ix + 3u];
+    Clip s;
+    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));  // words 0..3 are reinterpreted as floats, not converted
+    s.blend = v_223.scene[ix + 4u];  // fifth word is copied into blend as-is
+    return s;
+}
+
+static inline __attribute__((always_inline))  // Returns the Clip that Clip_read decodes from the scene buffer for this element.
+Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_223)
+{
+    ClipRef param = ClipRef{ ref.offset + 4u };  // payload reference is 4 past the element's own offset
+    return Clip_read(param, v_223);
+}
+
+static inline __attribute__((always_inline))  // Stores an AnnoBeginClip as six consecutive words through write_mem, starting at index ref.offset / 4.
+void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_199)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);  // float bit pattern is stored unchanged, not converted
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
    Alloc param_12 = a;
    uint param_13 = ix + 4u;
    uint param_14 = as_type<uint>(s.linewidth);
-    write_mem(param_12, param_13, param_14, v_187);
+    write_mem(param_12, param_13, param_14, v_199);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;  // sixth word, appended after linewidth by this change
+    uint param_17 = s.blend;
+    write_mem(param_15, param_16, param_17, v_199);
 }

 static inline __attribute__((always_inline))  // Emits a header word (flags in the upper 16 bits, constant 4 below) followed by the AnnoBeginClip payload.
-void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_187)
+void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_199)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = (flags << uint(16)) | 4u;
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u };  // payload begins 4 past the header
    AnnoBeginClip param_5 = s;
-    AnnoBeginClip_write(param_3, param_4, param_5, v_187);
+    AnnoBeginClip_write(param_3, param_4, param_5, v_199);
 }

 static inline __attribute__((always_inline))  // Stores an AnnoEndClip as five consecutive words through write_mem, starting at index ref.offset / 4.
-void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
+void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_199)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
    uint param_2 = as_type<uint>(s.bbox.x);  // float bit pattern is stored unchanged, not converted
-    write_mem(param, param_1, param_2, v_187);
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    uint param_4 = ix + 1u;
    uint param_5 = as_type<uint>(s.bbox.y);
-    write_mem(param_3, param_4, param_5, v_187);
+    write_mem(param_3, param_4, param_5, v_199);
    Alloc param_6 = a;
    uint param_7 = ix + 2u;
    uint param_8 = as_type<uint>(s.bbox.z);
-    write_mem(param_6, param_7, param_8, v_187);
+    write_mem(param_6, param_7, param_8, v_199);
    Alloc param_9 = a;
    uint param_10 = ix + 3u;
    uint param_11 = as_type<uint>(s.bbox.w);
-    write_mem(param_9, param_10, param_11, v_187);
+    write_mem(param_9, param_10, param_11, v_199);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;  // fifth word, appended directly after the bbox by this change
+    uint param_14 = s.blend;
+    write_mem(param_12, param_13, param_14, v_199);
 }

 static inline __attribute__((always_inline))  // Emits a header word with constant 5 in the low bits, followed by the AnnoEndClip payload.
-void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_187)
+void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoEndClip& s, device Memory& v_199)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_187);
+    uint param_2 = (flags << uint(16)) | 5u;  // new version also packs the added flags parameter into the upper 16 bits
+    write_mem(param, param_1, param_2, v_199);
    Alloc param_3 = a;
    AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u };  // payload begins 4 past the header
    AnnoEndClip param_5 = s;
-    AnnoEndClip_write(param_3, param_4, param_5, v_187);
+    AnnoEndClip_write(param_3, param_4, param_5, v_199);
 }

-kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _968 [[buffer(1)]], const device SceneBuf& v_211 [[buffer(2)]], const device ParentBuf& _934 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& v_199 [[buffer(0)]], const device ConfigBuf& _1054 [[buffer(1)]], const device SceneBuf& v_223 [[buffer(2)]], const device ParentBuf& _1020 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
    threadgroup DrawMonoid sh_scratch[256];
    uint ix = gl_GlobalInvocationID.x * 8u;
    ElementRef ref = ElementRef{ ix * 36u };
    ElementRef param = ref;
-    uint tag_word = Element_tag(param, v_211).tag;
+    uint tag_word = Element_tag(param, v_223).tag;
    uint param_1 = tag_word;
    DrawMonoid agg = map_tag(param_1);
    spvUnsafeArray<DrawMonoid, 8> local;
@ -590,7 +632,7 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
        ElementRef param_2 = ref;
        uint param_3 = i;
        ElementRef param_4 = Element_index(param_2, param_3);
-        tag_word = Element_tag(param_4, v_211).tag;
+        tag_word = Element_tag(param_4, v_223).tag;
        uint param_5 = tag_word;
        DrawMonoid param_6 = agg;
        DrawMonoid param_7 = map_tag(param_5);
@ -615,9 +657,9 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
    DrawMonoid row = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0u)
    {
-        uint _937 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _934.parent[_937].path_ix;
-        row.clip_ix = _934.parent[_937].clip_ix;
+        uint _1023 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _1020.parent[_1023].path_ix;
+        row.clip_ix = _1020.parent[_1023].clip_ix;
    }
    if (gl_LocalInvocationID.x > 0u)
    {
@ -626,9 +668,9 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
        row = combine_tag_monoid(param_10, param_11);
    }
    uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_968.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
-    uint clip_out_base = _968.conf.clip_alloc.offset >> uint(2);
-    AnnotatedRef out_ref = AnnotatedRef{ _968.conf.anno_alloc.offset + (out_ix * 40u) };
+    uint out_base = (_1054.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u);
+    uint clip_out_base = _1054.conf.clip_alloc.offset >> uint(2);
+    AnnotatedRef out_ref = AnnotatedRef{ _1054.conf.anno_alloc.offset + (out_ix * 40u) };
    float4 mat;
    float2 translate;
    AnnoColor anno_fill;
@ -638,9 +680,9 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
    AnnoImage anno_img;
    Alloc param_28;
    AnnoBeginClip anno_begin_clip;
-    Alloc param_32;
+    Alloc param_33;
    AnnoEndClip anno_end_clip;
-    Alloc param_36;
+    Alloc param_38;
    for (uint i_2 = 0u; i_2 < 8u; i_2++)
    {
        DrawMonoid m = row;
@ -650,31 +692,31 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
            DrawMonoid param_13 = local[i_2 - 1u];
            m = combine_tag_monoid(param_12, param_13);
        }
-        v_187.memory[out_base + (i_2 * 2u)] = m.path_ix;
-        v_187.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
+        v_199.memory[out_base + (i_2 * 2u)] = m.path_ix;
+        v_199.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
        ElementRef param_14 = ref;
        uint param_15 = i_2;
        ElementRef this_ref = Element_index(param_14, param_15);
        ElementRef param_16 = this_ref;
-        tag_word = Element_tag(param_16, v_211).tag;
+        tag_word = Element_tag(param_16, v_223).tag;
        if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u))
        {
-            uint bbox_offset = (_968.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(v_187.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(v_187.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(v_187.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(v_187.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_1054.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(v_199.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(v_199.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(v_199.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(v_199.memory[bbox_offset + 3u]) - 32768.0;
            float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(v_187.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(v_199.memory[bbox_offset + 4u]);
            uint fill_mode = uint(linewidth >= 0.0);
            if ((linewidth >= 0.0) || (tag_word == 5u))
            {
-                uint trans_ix = v_187.memory[bbox_offset + 5u];
-                uint t = (_968.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(v_187.memory[t], v_187.memory[t + 1u], v_187.memory[t + 2u], v_187.memory[t + 3u]));
+                uint trans_ix = v_199.memory[bbox_offset + 5u];
+                uint t = (_1054.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(v_199.memory[t], v_199.memory[t + 1u], v_199.memory[t + 2u], v_199.memory[t + 3u]));
                if (tag_word == 5u)
                {
-                    translate = as_type<float2>(uint2(v_187.memory[t + 4u], v_187.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(v_199.memory[t + 4u], v_199.memory[t + 5u]));
                }
            }
            if (linewidth >= 0.0)
@ -687,21 +729,21 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
                case 4u:
                {
                    ElementRef param_17 = this_ref;
-                    FillColor fill = Element_FillColor_read(param_17, v_211);
+                    FillColor fill = Element_FillColor_read(param_17, v_223);
                    anno_fill.bbox = bbox;
                    anno_fill.linewidth = linewidth;
                    anno_fill.rgba_color = fill.rgba_color;
-                    param_18.offset = _968.conf.anno_alloc.offset;
+                    param_18.offset = _1054.conf.anno_alloc.offset;
                    AnnotatedRef param_19 = out_ref;
                    uint param_20 = fill_mode;
                    AnnoColor param_21 = anno_fill;
-                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_187);
+                    Annotated_Color_write(param_18, param_19, param_20, param_21, v_199);
                    break;
                }
                case 5u:
                {
                    ElementRef param_22 = this_ref;
-                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_211);
+                    FillLinGradient lin = Element_FillLinGradient_read(param_22, v_223);
                    anno_lin.bbox = bbox;
                    anno_lin.linewidth = linewidth;
                    anno_lin.index = lin.index;
@ -714,37 +756,41 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
                    anno_lin.line_x = line_x;
                    anno_lin.line_y = line_y;
                    anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    param_23.offset = _968.conf.anno_alloc.offset;
+                    param_23.offset = _1054.conf.anno_alloc.offset;
                    AnnotatedRef param_24 = out_ref;
                    uint param_25 = fill_mode;
                    AnnoLinGradient param_26 = anno_lin;
-                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_187);
+                    Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_199);
                    break;
                }
                case 6u:
                {
                    ElementRef param_27 = this_ref;
-                    FillImage fill_img = Element_FillImage_read(param_27, v_211);
+                    FillImage fill_img = Element_FillImage_read(param_27, v_223);
                    anno_img.bbox = bbox;
                    anno_img.linewidth = linewidth;
                    anno_img.index = fill_img.index;
                    anno_img.offset = fill_img.offset;
-                    param_28.offset = _968.conf.anno_alloc.offset;
+                    param_28.offset = _1054.conf.anno_alloc.offset;
                    AnnotatedRef param_29 = out_ref;
                    uint param_30 = fill_mode;
                    AnnoImage param_31 = anno_img;
-                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_187);
+                    Annotated_Image_write(param_28, param_29, param_30, param_31, v_199);
                    break;
                }
                case 9u:
                {
+                    ElementRef param_32 = this_ref;
+                    Clip begin_clip = Element_BeginClip_read(param_32, v_223);
                    anno_begin_clip.bbox = bbox;
                    anno_begin_clip.linewidth = 0.0;
-                    param_32.offset = _968.conf.anno_alloc.offset;
-                    AnnotatedRef param_33 = out_ref;
-                    uint param_34 = 0u;
-                    AnnoBeginClip param_35 = anno_begin_clip;
-                    Annotated_BeginClip_write(param_32, param_33, param_34, param_35, v_187);
+                    anno_begin_clip.blend = begin_clip.blend;
+                    uint flags = uint(begin_clip.blend != 3u) << uint(1);
+                    param_33.offset = _1054.conf.anno_alloc.offset;
+                    AnnotatedRef param_34 = out_ref;
+                    uint param_35 = flags;
+                    AnnoBeginClip param_36 = anno_begin_clip;
+                    Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_199);
                    break;
                }
            }
@ -753,11 +799,16 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
        {
            if (tag_word == 10u)
            {
+                ElementRef param_37 = this_ref;
+                Clip end_clip = Element_BeginClip_read(param_37, v_223);
                anno_end_clip.bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
-                param_36.offset = _968.conf.anno_alloc.offset;
-                AnnotatedRef param_37 = out_ref;
-                AnnoEndClip param_38 = anno_end_clip;
-                Annotated_EndClip_write(param_36, param_37, param_38, v_187);
+                anno_end_clip.blend = end_clip.blend;
+                uint flags_1 = uint(end_clip.blend != 3u) << uint(1);
+                param_38.offset = _1054.conf.anno_alloc.offset;
+                AnnotatedRef param_39 = out_ref;
+                uint param_40 = flags_1;
+                AnnoEndClip param_41 = anno_end_clip;
+                Annotated_EndClip_write(param_38, param_39, param_40, param_41, v_199);
            }
        }
        if ((tag_word == 9u) || (tag_word == 10u))
@ -767,7 +818,7 @@ kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _9
            {
                path_ix = m.path_ix;
            }
-            v_187.memory[clip_out_base + m.clip_ix] = path_ix;
+            v_199.memory[clip_out_base + m.clip_ix] = path_ix;
        }
        out_ref.offset += 40u;
    }
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@ -115,6 +115,16 @@ struct CmdAlpha
    float alpha;
 };

+struct CmdEndClipRef  // Reference to a CmdEndClip record; consumed by CmdEndClip_read.
+{
+    uint offset;  // CmdEndClip_read shifts this right by 2 to obtain the index it passes to read_mem
+};
+
+struct CmdEndClip  // Decoded form of the record produced by CmdEndClip_read.
+{
+    uint blend;  // set from the single word CmdEndClip_read fetches
+};
+
 struct CmdJumpRef
 {
    uint offset;
@ -208,7 +218,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }

 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -216,29 +226,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
    {
        return 0u;
    }
-    uint v = v_202.memory[offset];
+    uint v = v_278.memory[offset];
    return v;
 }

 static inline __attribute__((always_inline))  // Reads the word at index ref.offset / 4 and splits it into the two CmdTag members.
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_202);
+    uint tag_and_flags = read_mem(param, param_1, v_278);
    return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };  // first member: low 16 bits; second member: high 16 bits
 }

 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    CmdStroke s;
    s.tile_ref = raw0;
    s.half_width = as_type<float>(raw1);
@ -246,11 +256,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }

 static inline __attribute__((always_inline))  // Returns the CmdStroke that CmdStroke_read decodes for this command.
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };  // payload reference is 4 past the command's own offset
-    return CmdStroke_read(param, param_1, v_202);
+    return CmdStroke_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
@ -262,27 +272,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }

 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
    Alloc param_8 = a;
    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_202);
+    uint raw4 = read_mem(param_8, param_9, v_278);
    Alloc param_10 = a;
    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_202);
+    uint raw5 = read_mem(param_10, param_11, v_278);
    TileSeg s;
    s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
    s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@ -298,15 +308,15 @@ uint2 chunk_offset(thread const uint& i)
 }

 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    CmdFill s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
@ -314,51 +324,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }

 static inline __attribute__((always_inline))  // Returns the CmdFill that CmdFill_read decodes for this command.
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };  // payload reference is 4 past the command's own offset
-    return CmdFill_read(param, param_1, v_202);
+    return CmdFill_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))  // Decodes a CmdAlpha from the single word at index ref.offset / 4.
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    CmdAlpha s;
    s.alpha = as_type<float>(raw0);  // word is reinterpreted as a float, not converted
    return s;
 }

 static inline __attribute__((always_inline))  // Returns the CmdAlpha that CmdAlpha_read decodes for this command.
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };  // payload reference is 4 past the command's own offset
-    return CmdAlpha_read(param, param_1, v_202);
+    return CmdAlpha_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))  // Decodes a CmdColor from the single word at index ref.offset / 4.
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    CmdColor s;
    s.rgba_color = raw0;  // word is stored as-is, with no unpacking here
    return s;
 }

 static inline __attribute__((always_inline))  // Returns the CmdColor that CmdColor_read decodes for this command.
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };  // payload reference is 4 past the command's own offset
-    return CmdColor_read(param, param_1, v_202);
+    return CmdColor_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
@ -379,21 +389,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }

 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
    CmdLinGrad s;
    s.index = raw0;
    s.line_x = as_type<float>(raw1);
@ -403,23 +413,23 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }

 static inline __attribute__((always_inline))  // Returns the CmdLinGrad that CmdLinGrad_read decodes for this command.
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };  // payload reference is 4 past the command's own offset
-    return CmdLinGrad_read(param, param_1, v_202);
+    return CmdLinGrad_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    CmdImage s;
    s.index = raw0;
    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@ -427,11 +437,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }

 static inline __attribute__((always_inline))  // Returns the CmdImage that CmdImage_read decodes for this command.
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };  // payload reference is 4 past the command's own offset
-    return CmdImage_read(param, param_1, v_202);
+    return CmdImage_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
@ -444,10 +454,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas.read(uint2(uv));
        float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1493 = fromsRGB(param_1);
+        fg_rgba.x = _1493.x;
+        fg_rgba.y = _1493.y;
+        fg_rgba.z = _1493.z;
        rgba[i] = fg_rgba;
    }
    return rgba;
@ -471,30 +481,476 @@ uint packsRGB(thread float4& rgba)
 }

 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
+    CmdEndClip s;
+    s.blend = raw0;
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+{
+    Alloc param = a;
+    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
+    return CmdEndClip_read(param, param_1, v_278);
+}
+
+static inline __attribute__((always_inline))
+float3 screen(thread const float3& cb, thread const float3& cs)
+{
+    return (cb + cs) - (cb * cs);
+}
+
+static inline __attribute__((always_inline))
+float3 hard_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0) - float3(1.0);
+    return mix(screen(param, param_1), (cb * 2.0) * cs, select(float3(0.0), float3(1.0), cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float color_dodge(thread const float& cb, thread const float& cs)
+{
+    if (cb == 0.0)
+    {
+        return 0.0;
+    }
+    else
+    {
+        if (cs == 1.0)
+        {
+            return 1.0;
+        }
+        else
+        {
+            return fast::min(1.0, cb / (1.0 - cs));
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float color_burn(thread const float& cb, thread const float& cs)
+{
+    if (cb == 1.0)
+    {
+        return 1.0;
+    }
+    else
+    {
+        if (cs == 0.0)
+        {
+            return 0.0;
+        }
+        else
+        {
+            return 1.0 - fast::min(1.0, (1.0 - cb) / cs);
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+float3 soft_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, select(float3(0.0), float3(1.0), cb <= float3(0.25)));
+    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), select(float3(0.0), float3(1.0), cs <= float3(0.5)));
+}
+
+static inline __attribute__((always_inline))
+float sat(thread const float3& c)
+{
+    return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z));
+}
+
+static inline __attribute__((always_inline))
+void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s)
+{
+    if (cmax > cmin)
+    {
+        cmid = ((cmid - cmin) * s) / (cmax - cmin);
+        cmax = s;
+    }
+    else
+    {
+        cmid = 0.0;
+        cmax = 0.0;
+    }
+    cmin = 0.0;
+}
+
+static inline __attribute__((always_inline))
+float3 set_sat(thread float3& c, thread const float& s)
+{
+    if (c.x <= c.y)
+    {
+        if (c.y <= c.z)
+        {
+            float param = c.x;
+            float param_1 = c.y;
+            float param_2 = c.z;
+            float param_3 = s;
+            set_sat_inner(param, param_1, param_2, param_3);
+            c.x = param;
+            c.y = param_1;
+            c.z = param_2;
+        }
+        else
+        {
+            if (c.x <= c.z)
+            {
+                float param_4 = c.x;
+                float param_5 = c.z;
+                float param_6 = c.y;
+                float param_7 = s;
+                set_sat_inner(param_4, param_5, param_6, param_7);
+                c.x = param_4;
+                c.z = param_5;
+                c.y = param_6;
+            }
+            else
+            {
+                float param_8 = c.z;
+                float param_9 = c.x;
+                float param_10 = c.y;
+                float param_11 = s;
+                set_sat_inner(param_8, param_9, param_10, param_11);
+                c.z = param_8;
+                c.x = param_9;
+                c.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (c.x <= c.z)
+        {
+            float param_12 = c.y;
+            float param_13 = c.x;
+            float param_14 = c.z;
+            float param_15 = s;
+            set_sat_inner(param_12, param_13, param_14, param_15);
+            c.y = param_12;
+            c.x = param_13;
+            c.z = param_14;
+        }
+        else
+        {
+            if (c.y <= c.z)
+            {
+                float param_16 = c.y;
+                float param_17 = c.z;
+                float param_18 = c.x;
+                float param_19 = s;
+                set_sat_inner(param_16, param_17, param_18, param_19);
+                c.y = param_16;
+                c.z = param_17;
+                c.x = param_18;
+            }
+            else
+            {
+                float param_20 = c.z;
+                float param_21 = c.y;
+                float param_22 = c.x;
+                float param_23 = s;
+                set_sat_inner(param_20, param_21, param_22, param_23);
+                c.z = param_20;
+                c.y = param_21;
+                c.x = param_22;
+            }
+        }
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline))
+float lum(thread const float3& c)
+{
+    float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375);
+    return dot(c, f);
+}
+
+static inline __attribute__((always_inline))
+float3 clip_color(thread float3& c)
+{
+    float3 param = c;
+    float L = lum(param);
+    float n = fast::min(c.x, fast::min(c.y, c.z));
+    float x = fast::max(c.x, fast::max(c.y, c.z));
+    if (n < 0.0)
+    {
+        c = float3(L) + (((c - float3(L)) * L) / float3(L - n));
+    }
+    if (x > 1.0)
+    {
+        c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L));
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline))
+float3 set_lum(thread const float3& c, thread const float& l)
+{
+    float3 param = c;
+    float3 param_1 = c + float3(l - lum(param));
+    float3 _901 = clip_color(param_1);
+    return _901;
+}
+
+static inline __attribute__((always_inline))
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode)
+{
+    float3 b = float3(0.0);
+    switch (mode)
+    {
+        case 1u:
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u:
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u:
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u:
+        {
+            b = fast::min(cb, cs);
+            break;
+        }
+        case 5u:
+        {
+            b = fast::max(cb, cs);
+            break;
+        }
+        case 6u:
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u:
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u:
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u:
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u:
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u:
+        {
+            b = (cb + cs) - ((cb * 2.0) * cs);
+            break;
+        }
+        case 12u:
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs;
+            float param_22 = sat(param_20);
+            float3 _1192 = set_sat(param_21, param_22);
+            float3 param_23 = cb;
+            float3 param_24 = _1192;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u:
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb;
+            float param_28 = sat(param_26);
+            float3 _1206 = set_sat(param_27, param_28);
+            float3 param_29 = cb;
+            float3 param_30 = _1206;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u:
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u:
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default:
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+static inline __attribute__((always_inline))
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode)
+{
+    float fa = 0.0;
+    float fb = 0.0;
+    switch (mode)
+    {
+        case 1u:
+        {
+            fa = 1.0;
+            fb = 0.0;
+            break;
+        }
+        case 2u:
+        {
+            fa = 0.0;
+            fb = 1.0;
+            break;
+        }
+        case 3u:
+        {
+            fa = 1.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 4u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0;
+            break;
+        }
+        case 5u:
+        {
+            fa = ab;
+            fb = 0.0;
+            break;
+        }
+        case 6u:
+        {
+            fa = 0.0;
+            fb = as;
+            break;
+        }
+        case 7u:
+        {
+            fa = 1.0 - ab;
+            fb = 0.0;
+            break;
+        }
+        case 8u:
+        {
+            fa = 0.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 9u:
+        {
+            fa = ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 10u:
+        {
+            fa = 1.0 - ab;
+            fb = as;
+            break;
+        }
+        case 11u:
+        {
+            fa = 1.0 - ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 12u:
+        {
+            fa = 1.0;
+            fb = 1.0;
+            break;
+        }
+        case 13u:
+        {
+            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+        }
+        case 14u:
+        {
+            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+        }
+        default:
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_278);
    CmdJump s;
    s.new_ref = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_202);
+    return CmdJump_read(param, param_1, v_278);
 }

-kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
    Alloc param;
-    param.offset = _723.conf.ptcl_alloc.offset;
+    param.offset = _1521.conf.ptcl_alloc.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@ -507,7 +963,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
        rgba[i] = float4(0.0);
    }
    uint clip_depth = 0u;
-    bool mem_ok = v_202.mem_error == 0u;
+    bool mem_ok = v_278.mem_error == 0u;
    spvUnsafeArray<float, 8> df;
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
@ -516,7 +972,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
    {
        Alloc param_3 = cmd_alloc;
        CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_202).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
        if (tag == 0u)
        {
            break;
@ -527,7 +983,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_5 = cmd_alloc;
                CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
                for (uint k = 0u; k < 8u; k++)
                {
                    df[k] = 1000000000.0;
@ -540,7 +996,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    bool param_9 = mem_ok;
                    Alloc param_10 = new_alloc(param_7, param_8, param_9);
                    TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_202);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
                    float2 line_vec = seg.vector;
                    for (uint k_1 = 0u; k_1 < 8u; k_1++)
                    {
@ -563,7 +1019,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_13 = cmd_alloc;
                CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
                for (uint k_3 = 0u; k_3 < 8u; k_3++)
                {
                    area[k_3] = float(fill.backdrop);
@ -576,7 +1032,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    bool param_17 = mem_ok;
                    Alloc param_18 = new_alloc(param_15, param_16, param_17);
                    TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
                    for (uint k_4 = 0u; k_4 < 8u; k_4++)
                    {
                        uint param_20 = k_4;
@ -620,7 +1076,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_21 = cmd_alloc;
                CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
                for (uint k_7 = 0u; k_7 < 8u; k_7++)
                {
                    area[k_7] = alpha.alpha;
@ -632,7 +1088,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_23 = cmd_alloc;
                CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
                uint param_25 = color.rgba_color;
                float4 fg = unpacksRGB(param_25);
                for (uint k_8 = 0u; k_8 < 8u; k_8++)
@ -647,7 +1103,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_26 = cmd_alloc;
                CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
                float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                for (uint k_9 = 0u; k_9 < 8u; k_9++)
                {
@ -657,10 +1113,10 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                    float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2092 = fromsRGB(param_29);
+                    fg_rgba.x = _2092.x;
+                    fg_rgba.y = _2092.y;
+                    fg_rgba.z = _2092.z;
                    rgba[k_9] = fg_rgba;
                }
                cmd_ref.offset += 20u;
@ -670,7 +1126,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_30 = cmd_alloc;
                CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
                uint2 param_32 = xy_uint;
                CmdImage param_33 = fill_img;
                spvUnsafeArray<float4, 8> img;
@ -689,8 +1145,8 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                {
                    uint d_2 = min(clip_depth, 127u);
                    float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2184 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2184;
                    rgba[k_11] = float4(0.0);
                }
                clip_depth++;
@ -699,23 +1155,43 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            }
            case 9u:
            {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                clip_depth--;
                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                {
                    uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2251 = fg_1;
+                    float _2255 = fg_1.w;
+                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
+                    fg_1.x = _2262.x;
+                    fg_1.y = _2262.y;
+                    fg_1.z = _2262.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                break;
            }
            case 10u:
            {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -723,9 +1199,9 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_38 = i_1;
-        float3 param_39 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_39), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
    }
 }

--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@ -115,6 +115,16 @@ struct CmdAlpha
    float alpha;
 };

+struct CmdEndClipRef
+{
+    uint offset;
+};
+
+struct CmdEndClip
+{
+    uint blend;
+};
+
 struct CmdJumpRef
 {
    uint offset;
@ -208,7 +218,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }

 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -216,29 +226,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
    {
        return 0u;
    }
-    uint v = v_202.memory[offset];
+    uint v = v_278.memory[offset];
    return v;
 }

 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_202);
+    uint tag_and_flags = read_mem(param, param_1, v_278);
    return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }

 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    CmdStroke s;
    s.tile_ref = raw0;
    s.half_width = as_type<float>(raw1);
@ -246,11 +256,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }

 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_202);
+    return CmdStroke_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
@ -262,27 +272,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }

 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
    Alloc param_8 = a;
    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_202);
+    uint raw4 = read_mem(param_8, param_9, v_278);
    Alloc param_10 = a;
    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_202);
+    uint raw5 = read_mem(param_10, param_11, v_278);
    TileSeg s;
    s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
    s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@ -298,15 +308,15 @@ uint2 chunk_offset(thread const uint& i)
 }

 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    CmdFill s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
@ -314,51 +324,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }

 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_202);
+    return CmdFill_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    CmdAlpha s;
    s.alpha = as_type<float>(raw0);
    return s;
 }

 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_202);
+    return CmdAlpha_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    CmdColor s;
    s.rgba_color = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_202);
+    return CmdColor_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
@ -379,21 +389,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }

 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_202);
+    uint raw2 = read_mem(param_4, param_5, v_278);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_202);
+    uint raw3 = read_mem(param_6, param_7, v_278);
    CmdLinGrad s;
    s.index = raw0;
    s.line_x = as_type<float>(raw1);
@ -403,23 +413,23 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }

 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
 {
    Alloc param = a;
    CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_202);
+    return CmdLinGrad_read(param, param_1, v_278);
 }

 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_202);
+    uint raw1 = read_mem(param_2, param_3, v_278);
    CmdImage s;
    s.index = raw0;
    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@ -427,11 +437,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }

 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdImage payload located at ref.offset + 4u by delegating to CmdImage_read.
 {
    Alloc param = a;
    CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_202);
+    return CmdImage_read(param, param_1, v_278); // v_278 is the device Memory buffer the words are read from
 }

 static inline __attribute__((always_inline))
@ -444,10 +454,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas.read(uint2(uv));
        float3 param_1 = fg_rgba.xyz;
-        float3 _695 = fromsRGB(param_1);
-        fg_rgba.x = _695.x;
-        fg_rgba.y = _695.y;
-        fg_rgba.z = _695.z;
+        float3 _1495 = fromsRGB(param_1);
+        fg_rgba.x = _1495.x;
+        fg_rgba.y = _1495.y;
+        fg_rgba.z = _1495.z;
        rgba[i] = fg_rgba;
    }
    return rgba;
@ -471,30 +481,477 @@ uint packsRGB(thread float4& rgba)
 }

 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278) // Loads a CmdEndClip: a single 32-bit word at word index ref.offset >> 2.
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_202);
+    uint raw0 = read_mem(param, param_1, v_278); // the only word of the payload
+    CmdEndClip s;
+    s.blend = raw0; // stored unmodified; main0 later splits it with >> 8 (blend mode) and & 255 (compose mode)
+    return s;
+}
+
+static inline __attribute__((always_inline)) // Reads the CmdEndClip payload of the command at `ref`.
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+{
+    Alloc param = a;
+    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u }; // payload starts one 32-bit word after ref.offset (ptcl.h writes the command tag at ref.offset)
+    return CmdEndClip_read(param, param_1, v_278);
+}
+
+static inline __attribute__((always_inline)) // Screen blend of backdrop cb and source cs, evaluated per component.
+float3 screen(thread const float3& cb, thread const float3& cs)
+{
+    return (cb + cs) - (cb * cs); // cb + cs - cb*cs
+}
+
+static inline __attribute__((always_inline)) // Hard-light blend of backdrop cb and source cs, evaluated per component.
+float3 hard_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 param = cb;
+    float3 param_1 = (cs * 2.0) - float3(1.0); // 2*cs - 1, the source operand used on the screen branch
+    return mix(screen(param, param_1), (cb * 2.0) * cs, select(float3(0.0), float3(1.0), cs <= float3(0.5))); // components with cs <= 0.5 take 2*cb*cs, the others take screen(cb, 2*cs - 1)
+}
+
+static inline __attribute__((always_inline)) // Color-dodge blend for a single channel (backdrop cb, source cs).
+float color_dodge(thread const float& cb, thread const float& cs)
+{
+    if (cb == 0.0) // zero backdrop always yields 0, checked before the cs == 1 case
+    {
+        return 0.0;
+    }
+    else
+    {
+        if (cs == 1.0) // avoids the division by (1 - cs) == 0 below
+        {
+            return 1.0;
+        }
+        else
+        {
+            return fast::min(1.0, cb / (1.0 - cs)); // cb / (1 - cs), capped at 1
+        }
+    }
+}
+
+static inline __attribute__((always_inline)) // Color-burn blend for a single channel (backdrop cb, source cs).
+float color_burn(thread const float& cb, thread const float& cs)
+{
+    if (cb == 1.0) // full backdrop always yields 1, checked before the cs == 0 case
+    {
+        return 1.0;
+    }
+    else
+    {
+        if (cs == 0.0) // avoids the division by cs == 0 below
+        {
+            return 0.0;
+        }
+        else
+        {
+            return 1.0 - fast::min(1.0, (1.0 - cb) / cs); // 1 - min(1, (1 - cb) / cs)
+        }
+    }
+}
+
+static inline __attribute__((always_inline)) // Soft-light blend of backdrop cb and source cs, evaluated per component.
+float3 soft_light(thread const float3& cb, thread const float3& cs)
+{
+    float3 d = mix(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, select(float3(0.0), float3(1.0), cb <= float3(0.25))); // d = ((16*cb - 12)*cb + 4)*cb where cb <= 0.25, sqrt(cb) elsewhere
+    return mix(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), select(float3(0.0), float3(1.0), cs <= float3(0.5))); // cs <= 0.5: cb - (1 - 2*cs)*cb*(1 - cb); otherwise cb + (2*cs - 1)*(d - cb)
+}
+
+static inline __attribute__((always_inline)) // Saturation of a colour: largest component minus smallest component.
+float sat(thread const float3& c)
+{
+    return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z)); // max(x, y, z) - min(x, y, z)
+}
+
+static inline __attribute__((always_inline)) // Rescales three components, passed in ascending order, in place so that max - min becomes s.
+void SetSatInner(thread float& Cmin, thread float& Cmid, thread float& Cmax, thread const float& s)
+{
+    if (Cmax > Cmin)
+    {
+        Cmid = ((Cmid - Cmin) * s) / (Cmax - Cmin); // keep the middle component's relative position between min and max
+        Cmax = s;
+    }
+    else
+    {
+        Cmid = 0.0; // Cmax is not greater than Cmin: no spread to rescale, everything collapses to 0
+        Cmax = 0.0;
+    }
+    Cmin = 0.0; // the minimum is zeroed on both paths
+}
+
+static inline __attribute__((always_inline)) // Sets the saturation of C to s: orders the components by comparison and hands them to SetSatInner as (min, mid, max).
+float3 set_sat(thread float3& C, thread const float& s) // C is modified in place and also returned
+{
+    if (C.x <= C.y)
+    {
+        if (C.y <= C.z) // order: x <= y <= z
+        {
+            float param = C.x;
+            float param_1 = C.y;
+            float param_2 = C.z;
+            float param_3 = s;
+            SetSatInner(param, param_1, param_2, param_3);
+            C.x = param; // copy the rescaled temporaries back into the matching components
+            C.y = param_1;
+            C.z = param_2;
+        }
+        else
+        {
+            if (C.x <= C.z) // order: x <= z < y
+            {
+                float param_4 = C.x;
+                float param_5 = C.z;
+                float param_6 = C.y;
+                float param_7 = s;
+                SetSatInner(param_4, param_5, param_6, param_7);
+                C.x = param_4;
+                C.z = param_5;
+                C.y = param_6;
+            }
+            else // order: z < x <= y
+            {
+                float param_8 = C.z;
+                float param_9 = C.x;
+                float param_10 = C.y;
+                float param_11 = s;
+                SetSatInner(param_8, param_9, param_10, param_11);
+                C.z = param_8;
+                C.x = param_9;
+                C.y = param_10;
+            }
+        }
+    }
+    else
+    {
+        if (C.x <= C.z) // order: y < x <= z
+        {
+            float param_12 = C.y;
+            float param_13 = C.x;
+            float param_14 = C.z;
+            float param_15 = s;
+            SetSatInner(param_12, param_13, param_14, param_15);
+            C.y = param_12;
+            C.x = param_13;
+            C.z = param_14;
+        }
+        else
+        {
+            if (C.y <= C.z) // order: y <= z < x
+            {
+                float param_16 = C.y;
+                float param_17 = C.z;
+                float param_18 = C.x;
+                float param_19 = s;
+                SetSatInner(param_16, param_17, param_18, param_19);
+                C.y = param_16;
+                C.z = param_17;
+                C.x = param_18;
+            }
+            else // order: z < y < x
+            {
+                float param_20 = C.z;
+                float param_21 = C.y;
+                float param_22 = C.x;
+                float param_23 = s;
+                SetSatInner(param_20, param_21, param_22, param_23);
+                C.z = param_20;
+                C.y = param_21;
+                C.x = param_22;
+            }
+        }
+    }
+    return C;
+}
+
+static inline __attribute__((always_inline)) // Luminosity of a colour as a weighted sum of its components.
+float lum(thread const float3& c)
+{
+    float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375); // float representations of the weights 0.3, 0.59, 0.11
+    return dot(c, f);
+}
+
+static inline __attribute__((always_inline)) // Pulls a colour with components below 0 or above 1 back towards its luminosity; c is modified in place and returned.
+float3 clip_color(thread float3& c)
+{
+    float3 param = c;
+    float L = lum(param); // L, n and x are all computed once from the incoming colour
+    float n = fast::min(c.x, fast::min(c.y, c.z)); // smallest component
+    float x = fast::max(c.x, fast::max(c.y, c.z)); // largest component
+    if (n < 0.0)
+    {
+        c = float3(L) + (((c - float3(L)) * L) / float3(L - n)); // scale the offset from L by L / (L - n)
+    }
+    if (x > 1.0) // tested against x from the incoming colour, even if the branch above changed c
+    {
+        c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L)); // scale the offset from L by (1 - L) / (x - L)
+    }
+    return c;
+}
+
+static inline __attribute__((always_inline)) // Shifts colour c so its luminosity becomes l, then passes the result through clip_color.
+float3 set_lum(thread const float3& c, thread const float& l)
+{
+    float3 param = c;
+    float d = l - lum(param); // luminosity difference to add to every component
+    float3 param_1 = c + float3(d);
+    float3 _903 = clip_color(param_1); // clip_color mutates its argument, so it is given a local copy
+    return _903;
+}
+
+static inline __attribute__((always_inline)) // Blends backdrop colour cb with source colour cs using the blend function selected by `mode`.
+float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode) // returns the blended colour; modes without a case below return cs unchanged
+{
+    float3 b = float3(0.0);
+    switch (mode)
+    {
+        case 1u: // multiply: cb * cs
+        {
+            b = cb * cs;
+            break;
+        }
+        case 2u: // screen
+        {
+            float3 param = cb;
+            float3 param_1 = cs;
+            b = screen(param, param_1);
+            break;
+        }
+        case 3u: // hard_light with the operands swapped (cs passed as backdrop, cb as source)
+        {
+            float3 param_2 = cs;
+            float3 param_3 = cb;
+            b = hard_light(param_2, param_3);
+            break;
+        }
+        case 4u: // per-component minimum
+        {
+            b = fast::min(cb, cs);
+            break;
+        }
+        case 5u: // per-component maximum
+        {
+            b = fast::max(cb, cs);
+            break;
+        }
+        case 6u: // color_dodge applied to each channel separately
+        {
+            float param_4 = cb.x;
+            float param_5 = cs.x;
+            float param_6 = cb.y;
+            float param_7 = cs.y;
+            float param_8 = cb.z;
+            float param_9 = cs.z;
+            b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9));
+            break;
+        }
+        case 7u: // color_burn applied to each channel separately
+        {
+            float param_10 = cb.x;
+            float param_11 = cs.x;
+            float param_12 = cb.y;
+            float param_13 = cs.y;
+            float param_14 = cb.z;
+            float param_15 = cs.z;
+            b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15));
+            break;
+        }
+        case 8u: // hard_light(cb, cs)
+        {
+            float3 param_16 = cb;
+            float3 param_17 = cs;
+            b = hard_light(param_16, param_17);
+            break;
+        }
+        case 9u: // soft_light(cb, cs)
+        {
+            float3 param_18 = cb;
+            float3 param_19 = cs;
+            b = soft_light(param_18, param_19);
+            break;
+        }
+        case 10u: // absolute difference |cb - cs|
+        {
+            b = abs(cb - cs);
+            break;
+        }
+        case 11u: // cb + cs - 2*cb*cs
+        {
+            b = (cb + cs) - ((cb * 2.0) * cs);
+            break;
+        }
+        case 12u: // set_lum(set_sat(cs, sat(cb)), lum(cb)): source colour given the backdrop's saturation and luminosity
+        {
+            float3 param_20 = cb;
+            float3 param_21 = cs;
+            float param_22 = sat(param_20);
+            float3 _1194 = set_sat(param_21, param_22);
+            float3 param_23 = cb;
+            float3 param_24 = _1194;
+            float param_25 = lum(param_23);
+            b = set_lum(param_24, param_25);
+            break;
+        }
+        case 13u: // set_lum(set_sat(cb, sat(cs)), lum(cb)): backdrop colour given the source's saturation, backdrop luminosity kept
+        {
+            float3 param_26 = cs;
+            float3 param_27 = cb;
+            float param_28 = sat(param_26);
+            float3 _1208 = set_sat(param_27, param_28);
+            float3 param_29 = cb;
+            float3 param_30 = _1208;
+            float param_31 = lum(param_29);
+            b = set_lum(param_30, param_31);
+            break;
+        }
+        case 14u: // set_lum(cs, lum(cb)): source colour with the backdrop's luminosity
+        {
+            float3 param_32 = cb;
+            float3 param_33 = cs;
+            float param_34 = lum(param_32);
+            b = set_lum(param_33, param_34);
+            break;
+        }
+        case 15u: // set_lum(cb, lum(cs)): backdrop colour with the source's luminosity
+        {
+            float3 param_35 = cs;
+            float3 param_36 = cb;
+            float param_37 = lum(param_35);
+            b = set_lum(param_36, param_37);
+            break;
+        }
+        default: // mode 0 and any unlisted value: the source colour is used as-is
+        {
+            b = cs;
+            break;
+        }
+    }
+    return b;
+}
+
+static inline __attribute__((always_inline)) // Composites source (cs, as) over backdrop (cb, ab) using the compositing operator selected by `mode`.
+float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode) // returns float4(colour, alpha)
+{
+    float fa = 0.0; // weight applied to the source term in the final expression
+    float fb = 0.0; // weight applied to the backdrop term in the final expression
+    switch (mode)
+    {
+        case 1u: // (fa, fb) = (1, 0): source only
+        {
+            fa = 1.0;
+            fb = 0.0;
+            break;
+        }
+        case 2u: // (fa, fb) = (0, 1): backdrop only
+        {
+            fa = 0.0;
+            fb = 1.0;
+            break;
+        }
+        case 3u: // (fa, fb) = (1, 1 - as): Porter-Duff source-over coefficients
+        {
+            fa = 1.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 4u: // (fa, fb) = (1 - ab, 1): destination-over coefficients
+        {
+            fa = 1.0 - ab;
+            fb = 1.0;
+            break;
+        }
+        case 5u: // (fa, fb) = (ab, 0): source-in coefficients
+        {
+            fa = ab;
+            fb = 0.0;
+            break;
+        }
+        case 6u: // (fa, fb) = (0, as): destination-in coefficients
+        {
+            fa = 0.0;
+            fb = as;
+            break;
+        }
+        case 7u: // (fa, fb) = (1 - ab, 0): source-out coefficients
+        {
+            fa = 1.0 - ab;
+            fb = 0.0;
+            break;
+        }
+        case 8u: // (fa, fb) = (0, 1 - as): destination-out coefficients
+        {
+            fa = 0.0;
+            fb = 1.0 - as;
+            break;
+        }
+        case 9u: // (fa, fb) = (ab, 1 - as): source-atop coefficients
+        {
+            fa = ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 10u: // (fa, fb) = (1 - ab, as): destination-atop coefficients
+        {
+            fa = 1.0 - ab;
+            fb = as;
+            break;
+        }
+        case 11u: // (fa, fb) = (1 - ab, 1 - as): xor coefficients
+        {
+            fa = 1.0 - ab;
+            fb = 1.0 - as;
+            break;
+        }
+        case 12u: // (fa, fb) = (1, 1): plain sum of both terms, no clamping
+        {
+            fa = 1.0;
+            fb = 1.0;
+            break;
+        }
+        case 13u: // returns directly: colour = max(0, (1 - cs*as) + 1 - cb*ab), alpha = max(0, (1 - as) + 1 - ab)
+        {
+            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+        }
+        case 14u: // returns directly: colour = min(1, cs*as + cb*ab), alpha = min(1, as + ab)
+        {
+            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+        }
+        default: // mode 0 and any unlisted value keep fa = fb = 0, so the result below is all zeros
+        {
+            break;
+        }
+    }
+    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb)); // NOTE(review): the alpha lane is scaled too, giving as*as*fa + ab*ab*fb rather than as*fa + ab*fb - confirm this is intended
+}
+
+static inline __attribute__((always_inline)) // Loads a CmdJump: a single 32-bit word at word index ref.offset >> 2.
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_278); // the only word of the payload, stored below as new_ref
    CmdJump s;
    s.new_ref = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278) // Reads the CmdJump payload located at ref.offset + 4u by delegating to CmdJump_read.
 {
    Alloc param = a;
    CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_202);
+    return CmdJump_read(param, param_1, v_278); // v_278 is the device Memory buffer the word is read from
 }

-kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1523 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1523.conf.width_in_tiles) + gl_WorkGroupID.x;
    Alloc param;
-    param.offset = _723.conf.ptcl_alloc.offset;
+    param.offset = _1523.conf.ptcl_alloc.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@ -507,7 +964,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
        rgba[i] = float4(0.0);
    }
    uint clip_depth = 0u;
-    bool mem_ok = v_202.mem_error == 0u;
+    bool mem_ok = v_278.mem_error == 0u;
    spvUnsafeArray<float, 8> df;
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
@ -516,7 +973,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
    {
        Alloc param_3 = cmd_alloc;
        CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_202).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
        if (tag == 0u)
        {
            break;
@ -527,7 +984,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_5 = cmd_alloc;
                CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
                for (uint k = 0u; k < 8u; k++)
                {
                    df[k] = 1000000000.0;
@ -540,7 +997,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    bool param_9 = mem_ok;
                    Alloc param_10 = new_alloc(param_7, param_8, param_9);
                    TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_202);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
                    float2 line_vec = seg.vector;
                    for (uint k_1 = 0u; k_1 < 8u; k_1++)
                    {
@ -563,7 +1020,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_13 = cmd_alloc;
                CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
                for (uint k_3 = 0u; k_3 < 8u; k_3++)
                {
                    area[k_3] = float(fill.backdrop);
@ -576,7 +1033,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    bool param_17 = mem_ok;
                    Alloc param_18 = new_alloc(param_15, param_16, param_17);
                    TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
                    for (uint k_4 = 0u; k_4 < 8u; k_4++)
                    {
                        uint param_20 = k_4;
@ -620,7 +1077,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_21 = cmd_alloc;
                CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
                for (uint k_7 = 0u; k_7 < 8u; k_7++)
                {
                    area[k_7] = alpha.alpha;
@ -632,7 +1089,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_23 = cmd_alloc;
                CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
                uint param_25 = color.rgba_color;
                float4 fg = unpacksRGB(param_25);
                for (uint k_8 = 0u; k_8 < 8u; k_8++)
@ -647,7 +1104,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_26 = cmd_alloc;
                CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
                float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                for (uint k_9 = 0u; k_9 < 8u; k_9++)
                {
@ -657,10 +1114,10 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                    int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                    float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _1298 = fromsRGB(param_29);
-                    fg_rgba.x = _1298.x;
-                    fg_rgba.y = _1298.y;
-                    fg_rgba.z = _1298.z;
+                    float3 _2094 = fromsRGB(param_29);
+                    fg_rgba.x = _2094.x;
+                    fg_rgba.y = _2094.y;
+                    fg_rgba.z = _2094.z;
                    rgba[k_9] = fg_rgba;
                }
                cmd_ref.offset += 20u;
@ -670,7 +1127,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            {
                Alloc param_30 = cmd_alloc;
                CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
+                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
                uint2 param_32 = xy_uint;
                CmdImage param_33 = fill_img;
                spvUnsafeArray<float4, 8> img;
@ -689,8 +1146,8 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
                {
                    uint d_2 = min(clip_depth, 127u);
                    float4 param_34 = float4(rgba[k_11]);
-                    uint _1390 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _1390;
+                    uint _2186 = packsRGB(param_34);
+                    blend_stack[d_2][k_11] = _2186;
                    rgba[k_11] = float4(0.0);
                }
                clip_depth++;
@ -699,23 +1156,43 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
            }
            case 9u:
            {
+                Alloc param_35 = cmd_alloc;
+                CmdRef param_36 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                uint blend_mode = end_clip.blend >> uint(8);
+                uint comp_mode = end_clip.blend & 255u;
                clip_depth--;
                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                {
                    uint d_3 = min(clip_depth, 127u);
-                    uint param_35 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_35);
+                    uint param_37 = blend_stack[d_3][k_12];
+                    float4 bg = unpacksRGB(param_37);
                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
+                    float3 param_38 = bg.xyz;
+                    float3 param_39 = fg_1.xyz;
+                    uint param_40 = blend_mode;
+                    float3 blend = mix_blend(param_38, param_39, param_40);
+                    float4 _2253 = fg_1;
+                    float _2257 = fg_1.w;
+                    float3 _2264 = mix(_2253.xyz, blend, float3(float((_2257 * bg.w) > 0.0)));
+                    fg_1.x = _2264.x;
+                    fg_1.y = _2264.y;
+                    fg_1.z = _2264.z;
+                    float3 param_41 = bg.xyz;
+                    float3 param_42 = fg_1.xyz;
+                    float param_43 = bg.w;
+                    float param_44 = fg_1.w;
+                    uint param_45 = comp_mode;
+                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
                }
-                cmd_ref.offset += 4u;
+                cmd_ref.offset += 8u;
                break;
            }
            case 10u:
            {
-                Alloc param_36 = cmd_alloc;
-                CmdRef param_37 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -723,8 +1200,8 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_38 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
    }
 }

--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
--- a/piet-gpu/shader/gen/tile_alloc.msl
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@ -26,6 +26,7 @@ struct AnnoEndClipRef
 struct AnnoEndClip
 {
    float4 bbox;
+    uint blend; // filled by AnnoEndClip_read from the word at ix + 4u, directly after the four bbox words
 };

 struct AnnotatedRef
@ -145,8 +146,12 @@ AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef&
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
    uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_92, v_92BufferSize);
    AnnoEndClip s;
    s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.blend = raw4;
    return s;
 }

@ -221,20 +226,20 @@ void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const P
    write_mem(param_6, param_7, param_8, v_92, v_92BufferSize);
 }

-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _305 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _314 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
    threadgroup uint sh_tile_count[256];
    threadgroup MallocResult sh_tile_alloc;
    constant uint& v_92BufferSize = spvBufferSizeConstants[0];
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef{ _305.conf.tile_alloc.offset + (element_ix * 12u) };
-    AnnotatedRef ref = AnnotatedRef{ _305.conf.anno_alloc.offset + (element_ix * 40u) };
+    PathRef path_ref = PathRef{ _314.conf.tile_alloc.offset + (element_ix * 12u) };
+    AnnotatedRef ref = AnnotatedRef{ _314.conf.anno_alloc.offset + (element_ix * 40u) };
    uint tag = 0u;
-    if (element_ix < _305.conf.n_elements)
+    if (element_ix < _314.conf.n_elements)
    {
        Alloc param;
-        param.offset = _305.conf.anno_alloc.offset;
+        param.offset = _314.conf.anno_alloc.offset;
        AnnotatedRef param_1 = ref;
        tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag;
    }
@ -251,7 +256,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
        case 5u:
        {
            Alloc param_2;
-            param_2.offset = _305.conf.anno_alloc.offset;
+            param_2.offset = _314.conf.anno_alloc.offset;
            AnnotatedRef param_3 = ref;
            AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize);
            x0 = int(floor(clip.bbox.x * 0.0625));
@ -261,10 +266,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
            break;
        }
    }
-    x0 = clamp(x0, 0, int(_305.conf.width_in_tiles));
-    y0 = clamp(y0, 0, int(_305.conf.height_in_tiles));
-    x1 = clamp(x1, 0, int(_305.conf.width_in_tiles));
-    y1 = clamp(y1, 0, int(_305.conf.height_in_tiles));
+    x0 = clamp(x0, 0, int(_314.conf.width_in_tiles));
+    y0 = clamp(y0, 0, int(_314.conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(_314.conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(_314.conf.height_in_tiles));
    Path path;
    path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
    uint tile_count = uint((x1 - x0) * (y1 - y0));
@ -287,43 +292,43 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
    if (th_ix == 255u)
    {
        uint param_4 = total_tile_count * 8u;
-        MallocResult _476 = malloc(param_4, v_92, v_92BufferSize);
-        sh_tile_alloc = _476;
+        MallocResult _485 = malloc(param_4, v_92, v_92BufferSize);
+        sh_tile_alloc = _485;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    MallocResult alloc_start = sh_tile_alloc;
-    bool _487;
+    bool _496;
    if (!alloc_start.failed)
    {
-        _487 = v_92.mem_error != 0u;
+        _496 = v_92.mem_error != 0u;
    }
    else
    {
-        _487 = alloc_start.failed;
+        _496 = alloc_start.failed;
    }
-    if (_487)
+    if (_496)
    {
        return;
    }
-    if (element_ix < _305.conf.n_elements)
+    if (element_ix < _314.conf.n_elements)
    {
-        uint _500;
+        uint _509;
        if (th_ix > 0u)
        {
-            _500 = sh_tile_count[th_ix - 1u];
+            _509 = sh_tile_count[th_ix - 1u];
        }
        else
        {
-            _500 = 0u;
+            _509 = 0u;
        }
-        uint tile_subix = _500;
+        uint tile_subix = _509;
        Alloc param_5 = alloc_start.alloc;
        uint param_6 = 8u * tile_subix;
        uint param_7 = 8u * tile_count;
        Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
        path.tiles = TileRef{ tiles_alloc.offset };
        Alloc param_8;
-        param_8.offset = _305.conf.tile_alloc.offset;
+        param_8.offset = _314.conf.tile_alloc.offset;
        PathRef param_9 = path_ref;
        Path param_10 = path;
        Path_write(param_8, param_9, param_10, v_92, v_92BufferSize);
--- a/piet-gpu/shader/gen/tile_alloc.spv
+++ b/piet-gpu/shader/gen/tile_alloc.spv
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -35,6 +35,7 @@ layout(rgba8, set = 0, binding = 4) uniform restrict readonly image2D gradients;

 #include "ptcl.h"
 #include "tile.h"
+#include "blend.h"

 #define MAX_BLEND_STACK 128
 mediump vec3 tosRGB(mediump vec3 rgb) {
@ -216,14 +217,20 @@ void main() {
            cmd_ref.offset += 4;
            break;
        case Cmd_EndClip:
+            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
+            uint blend_mode = uint(end_clip.blend >> 8);
+            uint comp_mode = uint(end_clip.blend & 0xFF);
            clip_depth--;
            for (uint k = 0; k < CHUNK; k++) {
                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
                mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
                mediump vec4 fg = rgba[k] * area[k];
-                rgba[k] = bg * (1.0 - fg.a) + fg;
+                vec3 blend = mix_blend(bg.rgb, fg.rgb, blend_mode);
+                // Apply the blend color only where the foreground and background overlap.
+                fg.rgb = mix(fg.rgb, blend, float((fg.a * bg.a) > 0.0));
+                rgba[k] = mix_compose(bg.rgb, fg.rgb, bg.a, fg.a, comp_mode);
            }
-            cmd_ref.offset += 4;
+            cmd_ref.offset += 4 + CmdEndClip_size;
            break;
        case Cmd_Jump:
            cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@ -26,6 +26,10 @@ struct CmdAlphaRef {
    uint offset;
 };

+struct CmdEndClipRef { // Typed reference to a CmdEndClip stored in memory.
+    uint offset; // CmdEndClip_read / CmdEndClip_write shift this right by 2 to obtain a word index
+};
+
 struct CmdJumpRef { // Typed reference to a CmdJump stored in memory.
    uint offset; // CmdJump_read shifts this right by 2 to obtain a word index
 };
@ -100,6 +104,16 @@ CmdAlphaRef CmdAlpha_index(CmdAlphaRef ref, uint index) {
    return CmdAlphaRef(ref.offset + index * CmdAlpha_size);
 }

+struct CmdEndClip { // Payload of the Cmd_EndClip command.
+    uint blend; // kernel4.comp splits this into blend mode (blend >> 8) and compose mode (blend & 0xFF)
+};
+
+#define CmdEndClip_size 4 // size of the CmdEndClip payload; kernel4.comp advances by 4 + CmdEndClip_size after an EndClip command
+
+CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) { // Returns a reference to the index-th CmdEndClip counted from ref.
+    return CmdEndClipRef(ref.offset + index * CmdEndClip_size); // advance by index elements of CmdEndClip_size each
+}
+
 struct CmdJump { // Payload of the Cmd_Jump command.
    uint new_ref; // kernel4.comp builds the next CmdRef from this value
 };
@ -228,6 +242,19 @@ void CmdAlpha_write(Alloc a, CmdAlphaRef ref, CmdAlpha s) {
    write_mem(a, ix + 0, floatBitsToUint(s.alpha));
 }

+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) { // Loads a CmdEndClip from allocation a at ref.
+    uint ix = ref.offset >> 2; // convert the offset to a 32-bit word index
+    uint raw0 = read_mem(a, ix + 0); // the payload is a single word
+    CmdEndClip s;
+    s.blend = raw0;
+    return s;
+}
+
+void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) { // Stores a CmdEndClip into allocation a at ref.
+    uint ix = ref.offset >> 2; // convert the offset to a 32-bit word index
+    write_mem(a, ix + 0, s.blend); // the payload is the single blend word
+}
+
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
@ -270,6 +297,10 @@ CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
    return CmdImage_read(a, CmdImageRef(ref.offset + 4));
 }

+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) { // Reads the EndClip payload of the command located at ref.
+    return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4)); // Payload sits 4 past ref, where Cmd_EndClip_write places it after the tag word.
+}
+
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
    return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
 }
@ -316,8 +347,9 @@ void Cmd_BeginClip_write(Alloc a, CmdRef ref) {
    write_mem(a, ref.offset >> 2, Cmd_BeginClip);
 }

-void Cmd_EndClip_write(Alloc a, CmdRef ref) {
+void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) { // Writes an EndClip command at ref: the tag word, then payload s.
    write_mem(a, ref.offset >> 2, Cmd_EndClip);
+    CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s); // Payload goes 4 past the tag, the same place Cmd_EndClip_read looks.
 }

 void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
--- a/piet-gpu/shader/scene.h
+++ b/piet-gpu/shader/scene.h
@ -138,9 +138,10 @@ TransformRef Transform_index(TransformRef ref, uint index) {

 struct Clip {
    vec4 bbox;
+    uint blend; // Loaded by Clip_read from the scene word that follows the four bbox words.
 };

-#define Clip_size 16
+#define Clip_size 20

 ClipRef Clip_index(ClipRef ref, uint index) {
    return ClipRef(ref.offset + index * Clip_size);
@ -286,6 +287,7 @@ Clip Clip_read(ClipRef ref) {
    uint raw3 = scene[ix + 3];
    Clip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.blend = scene[ix + 4];
    return s;
 }

--- a/piet-gpu/src/blend.rs
+++ b/piet-gpu/src/blend.rs
@ -0,0 +1,99 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)] // Variant names appear to follow the W3C blend modes (TODO confirm against the shader's mix_blend).
+#[repr(C)] // NOTE(review): the commit description says these enums are mirrored in the shaders — keep the numbering in sync.
+pub enum BlendMode { // Color-mixing half of a `Blend`; `Blend::pack` stores the discriminant shifted left by 8.
+    Normal = 0, // Chosen by `Blend::default` and by `From<CompositionMode>`.
+    Multiply = 1,
+    Screen = 2,
+    Overlay = 3,
+    Darken = 4,
+    Lighten = 5,
+    ColorDodge = 6,
+    ColorBurn = 7,
+    HardLight = 8,
+    SoftLight = 9,
+    Difference = 10,
+    Exclusion = 11,
+    Hue = 12,
+    Saturation = 13,
+    Color = 14,
+    Luminosity = 15, // Highest discriminant in this enum.
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)] // Variant names look like Porter-Duff operators plus the Plus variants (TODO confirm against the shader's mix_compose).
+#[repr(C)] // NOTE(review): the commit description says these enums are mirrored in the shaders — keep the numbering in sync.
+pub enum CompositionMode { // Compositing half of a `Blend`; `Blend::pack` ORs the discriminant in unshifted.
+    Clear = 0,
+    Copy = 1,
+    Dest = 2,
+    SrcOver = 3, // Chosen by `Blend::default` and by `From<BlendMode>`.
+    DestOver = 4,
+    SrcIn = 5,
+    DestIn = 6,
+    SrcOut = 7,
+    DestOut = 8,
+    SrcAtop = 9,
+    DestAtop = 10,
+    Xor = 11,
+    Plus = 12,
+    PlusDarker = 13,
+    PlusLighter = 14, // Highest discriminant in this enum.
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub struct Blend { // A blend mode paired with a composition mode.
+    pub mode: BlendMode, // `pack` places this above the low 8 bits.
+    pub composition_mode: CompositionMode, // `pack` places this in the low bits.
+}
+
+impl Blend {
+    pub fn new(mode: BlendMode, composition_mode: CompositionMode) -> Self { // Builds a `Blend` from both modes given explicitly.
+        Self { mode, composition_mode }
+    }
+
+    pub(crate) fn pack(&self) -> u32 { // Encodes both modes into the single u32 that the encoder stores in its `Clip::blend` field.
+        (self.mode as u32) << 8 | self.composition_mode as u32 // `as` binds tightest, then `<<`, then `|`: (mode << 8) | composition_mode.
+    }
+}
+
+impl Default for Blend { // The value the encoder falls back to when a clip is opened or closed with `None`.
+    fn default() -> Self {
+        Self {
+            mode: BlendMode::Normal,
+            composition_mode: CompositionMode::SrcOver, // Packs to 3 together with Normal (0 << 8 | 3).
+        }
+    }
+}
+
+impl From<BlendMode> for Blend { // Lets a bare `BlendMode` be converted with `.into()`.
+    fn from(mode: BlendMode) -> Self {
+        Self {
+            mode,
+            composition_mode: CompositionMode::SrcOver, // Same composition mode as `Blend::default`.
+        }
+    }
+}
+
+impl From<CompositionMode> for Blend { // Lets a bare `CompositionMode` be converted with `.into()`.
+    fn from(mode: CompositionMode) -> Self { // Note: `mode` here is the composition mode, not a `BlendMode`.
+        Self {
+            mode: BlendMode::Normal, // Same blend mode as `Blend::default`.
+            composition_mode: mode,
+        }
+    }
+}
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@ -16,6 +16,7 @@

 //! Low-level scene encoding.

+use crate::Blend;
 use bytemuck::{Pod, Zeroable};
 use piet_gpu_hal::BufWrite;

@ -87,7 +88,8 @@ pub struct FillLinGradient {
 pub struct Clip {
    tag: u32,
    bbox: [f32; 4],
-    padding: [u32; 4],
+    blend: u32,
+    padding: [u32; 3],
 }

 impl Encoder {
@ -151,10 +153,11 @@ impl Encoder {
    }

    /// Start a clip and return a save point to be filled in later.
-    pub fn begin_clip(&mut self) -> usize {
+    pub fn begin_clip(&mut self, blend: Option<Blend>) -> usize {
        let saved = self.drawobj_stream.len();
        let element = Clip {
            tag: ELEMENT_BEGINCLIP,
+            blend: blend.unwrap_or(Blend::default()).pack(),
            ..Default::default()
        };
        self.drawobj_stream.extend(bytemuck::bytes_of(&element));
@ -162,10 +165,11 @@ impl Encoder {
        saved
    }

-    pub fn end_clip(&mut self, bbox: [f32; 4], save_point: usize) {
+    pub fn end_clip(&mut self, bbox: [f32; 4], blend: Option<Blend>, save_point: usize) {
        let element = Clip {
            tag: ELEMENT_ENDCLIP,
            bbox,
+            blend: blend.unwrap_or(Blend::default()).pack(),
            ..Default::default()
        };
        self.drawobj_stream[save_point + 4..save_point + 20]
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -1,3 +1,4 @@
+mod blend;
 mod encoder;
 pub mod glyph_render;
 mod gradient;
@ -9,6 +10,7 @@ mod text;

 use std::convert::TryInto;

+pub use blend::{Blend, BlendMode, CompositionMode};
 pub use render_ctx::PietGpuRenderContext;

 use piet::kurbo::Vec2;
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@ -16,6 +16,7 @@ use piet_gpu_types::scene::Element;
 use crate::gradient::{LinearGradient, RampCache};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
+use crate::Blend;

 pub struct PietGpuImage;

@ -66,6 +67,7 @@ struct ClipElement {
    /// Byte offset of BeginClip element in element vec, for bbox fixup.
    save_point: usize,
    bbox: Option<Rect>,
+    blend: Option<Blend>,
 }

 const TOLERANCE: f64 = 0.25;
@ -230,13 +232,14 @@ impl RenderContext for PietGpuRenderContext {
        self.encode_linewidth(-1.0);
        let path = shape.path_elements(TOLERANCE);
        self.encode_path(path, true);
-        let save_point = self.new_encoder.begin_clip();
+        let save_point = self.new_encoder.begin_clip(None);
        if self.clip_stack.len() >= MAX_BLEND_STACK {
            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
        }
        self.clip_stack.push(ClipElement {
            bbox: None,
            save_point,
+            blend: None,
        });
        if let Some(tos) = self.state_stack.last_mut() {
            tos.n_clip += 1;
@ -333,6 +336,25 @@ impl RenderContext for PietGpuRenderContext {
 }

 impl PietGpuRenderContext {
+    pub fn blend(&mut self, shape: impl Shape, blend: Blend) { // Opens a clip on `shape` that carries `blend`; `clip` does the same but passes None.
+        self.encode_linewidth(-1.0); // Same preamble as `clip`; -1.0 presumably flags a fill rather than a stroke (TODO confirm).
+        let path = shape.path_elements(TOLERANCE);
+        self.encode_path(path, true); // `true` is the is_fill argument.
+        let save_point = self.new_encoder.begin_clip(Some(blend)); // Kept so `end_clip` can patch the bbox into the BeginClip element later.
+        if self.clip_stack.len() >= MAX_BLEND_STACK { // NOTE: checked only after the BeginClip element has already been encoded.
+            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
+        }
+        self.clip_stack.push(ClipElement {
+            bbox: None,
+            save_point,
+            blend: Some(blend), // Handed to `end_clip` when this entry is popped.
+        });
+        self.accumulate_bbox(|| shape.bounding_box()); // NOTE(review): runs after the push, so it may feed the new entry's bbox — confirm.
+        if let Some(tos) = self.state_stack.last_mut() {
+            tos.n_clip += 1; // Counts clips opened under the current saved state; presumably consumed on restore (TODO confirm).
+        }
+    }
+
    fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
        if is_fill {
            self.encode_path_inner(
@ -386,7 +408,7 @@ impl PietGpuRenderContext {
        let tos = self.clip_stack.pop().unwrap();
        let bbox = tos.bbox.unwrap_or_default();
        let bbox_f32_4 = rect_to_f32_4(bbox);
-        self.new_encoder.end_clip(bbox_f32_4, tos.save_point);
+        self.new_encoder.end_clip(bbox_f32_4, tos.blend, tos.save_point);
        if let Some(bbox) = tos.bbox {
            self.union_bbox(bbox);
        }
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@ -2,7 +2,8 @@

 use rand::{Rng, RngCore};

-use piet::kurbo::{BezPath, Circle, Line, Point, Rect, Shape};
+use crate::{PietGpuRenderContext, Blend, BlendMode, CompositionMode};
+use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
    Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
 };
@ -11,6 +12,18 @@ use crate::{PicoSvg, RenderContext, Vec2};

 const N_CIRCLES: usize = 0;

+pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend) { // Test scene: a blue square with a rotating red square drawn over it inside a `blend` group.
+    rc.fill(
+        Rect::new(400., 400., 800., 800.),
+        &Color::rgb8(0, 0, 200), // Blue backdrop, drawn before the blend group is opened.
+    );
+    rc.save().unwrap();
+    rc.blend(Rect::new(0., 0., 1000., 1000.), blend); // Blend region is larger than both squares.
+    rc.transform(Affine::translate(Vec2::new(600., 600.)) * Affine::rotate(0.01 * i as f64)); // `as` binds before `*`: angle is 0.01 * (i as f64), so it grows with `i`.
+    rc.fill(Rect::new(0., 0., 400., 400.), &Color::rgba8(255, 0, 0, 255)); // Opaque red square, drawn in the transformed space.
+    rc.restore().unwrap(); // Presumably also closes the blend group opened after `save` (TODO confirm in `restore`).
+}
+
 pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
    let xml_str = std::fs::read_to_string(filename).unwrap();
    let start = std::time::Instant::now();