diff --git a/piet-gpu/shader/blend.h b/piet-gpu/shader/blend.h
index 1ac4bd6..c0ae6af 100644
--- a/piet-gpu/shader/blend.h
+++ b/piet-gpu/shader/blend.h
@@ -122,6 +122,8 @@ vec3 set_sat(vec3 c, float s) {
     return c;
 }
 
+// Blends two RGB colors together. The colors are assumed to be in sRGB
+// color space, and this function does not take alpha into account.
 vec3 mix_blend(vec3 cb, vec3 cs, uint mode) {
 	vec3 b = vec3(0.0);
 	switch (mode) {
@@ -190,9 +192,10 @@ vec3 mix_blend(vec3 cb, vec3 cs, uint mode) {
 #define Comp_DestAtop 10
 #define Comp_Xor 11
 #define Comp_Plus 12
-#define Comp_PlusDarker 13
-#define Comp_PlusLighter 14
+#define Comp_PlusLighter 13
 
+// Apply general compositing operation.
+// Inputs are separated colors and alpha, output is premultiplied.
 vec4 mix_compose(vec3 cb, vec3 cs, float ab, float as, uint mode) {
 	float fa = 0.0;
 	float fb = 0.0;
@@ -245,16 +248,41 @@ vec4 mix_compose(vec3 cb, vec3 cs, float ab, float as, uint mode) {
 		fa = 1.0;
 		fb = 1.0;
 		break;
-	case Comp_PlusDarker:
-		return vec4(max(vec4(0.0), 1.0 - as * vec4(cs, as) + 1.0 - ab * vec4(cb, ab)).xyz, 
-			    max(0.0, 1.0 - as + 1.0 - ab));
 	case Comp_PlusLighter:
-		return vec4(min(vec4(1.0), as * vec4(cs, as) + ab * vec4(cb, ab)).xyz,
-			    min(1.0, as + ab));
+		return min(vec4(1.0), vec4(as * cs + ab * cb, as + ab));
 	default:
 		break;
 	}
-	return as * fa * vec4(cs, as) + ab * fb * vec4(cb, ab);
+	float as_fa = as * fa;
+	float ab_fb = ab * fb;
+	vec3 co = as_fa * cs + ab_fb * cb;
+	return vec4(co, as_fa + ab_fb);
 }
 
 #define BlendComp_default (Blend_Normal << 8 | Comp_SrcOver)
+
+// This is added to alpha to prevent divide-by-zero
+#define EPSILON 1e-15
+
+// Apply blending and composition. Both input and output colors are
+// premultiplied RGB.
+vec4 mix_blend_compose(vec4 backdrop, vec4 src, uint mode) {
+	if (mode == BlendComp_default) {
+		return backdrop * (1.0 - src.a) + src;
+	}
+	// Un-premultiply colors for blending
+	float inv_src_a = 1.0 / (src.a + EPSILON);
+	vec3 cs = src.rgb * inv_src_a;
+	float inv_backdrop_a = 1.0 / (backdrop.a + EPSILON);
+	vec3 cb = backdrop.rgb * inv_backdrop_a;
+	uint blend_mode = mode >> 8;
+	vec3 blended = mix_blend(cs, cb, blend_mode);
+	cs = mix(cs, blended, backdrop.a);
+	uint comp_mode = mode * 0xff;
+	if (comp_mode == Comp_SrcOver) {
+		vec3 co = mix(backdrop.rgb, cs, src.a);
+		return vec4(co, src.a + backdrop.a * (1 - src.a));
+	} else {
+		return mix_compose(cb, cs, backdrop.a, src.a, comp_mode);
+	}
+}
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 60e5582..09b0683 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -58,7 +58,7 @@ build gen/kernel4.hlsl: hlsl gen/kernel4.spv
 build gen/kernel4.dxil: dxil gen/kernel4.hlsl
 build gen/kernel4.msl: msl gen/kernel4.spv
 
-build gen/kernel4_gray.spv: glsl kernel4.comp | ptcl.h setup.h mem.h
+build gen/kernel4_gray.spv: glsl kernel4.comp | blend.h ptcl.h setup.h mem.h
   flags = -DGRAY
 build gen/kernel4_gray.hlsl: hlsl gen/kernel4_gray.spv
 build gen/kernel4_gray.dxil: dxil gen/kernel4_gray.hlsl
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
index 0fb9622..df2be88 100644
Binary files a/piet-gpu/shader/gen/backdrop.dxil and b/piet-gpu/shader/gen/backdrop.dxil differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
index e24a6d3..81f9b65 100644
Binary files a/piet-gpu/shader/gen/backdrop_lg.dxil and b/piet-gpu/shader/gen/backdrop_lg.dxil differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 6655b7f..6b3efaf 100644
Binary files a/piet-gpu/shader/gen/bbox_clear.dxil and b/piet-gpu/shader/gen/bbox_clear.dxil differ
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
index 3050aa8..4a4f073 100644
Binary files a/piet-gpu/shader/gen/binning.dxil and b/piet-gpu/shader/gen/binning.dxil differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
index 29a158e..b681a65 100644
Binary files a/piet-gpu/shader/gen/clip_leaf.dxil and b/piet-gpu/shader/gen/clip_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil
index 0dff71b..0ccaac9 100644
Binary files a/piet-gpu/shader/gen/clip_reduce.dxil and b/piet-gpu/shader/gen/clip_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index fdab444..910925d 100644
Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index 200f169..6353f19 100644
Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
index be69aad..c101fc8 100644
Binary files a/piet-gpu/shader/gen/draw_reduce.dxil and b/piet-gpu/shader/gen/draw_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil
index 4ea23f7..873fa29 100644
Binary files a/piet-gpu/shader/gen/draw_root.dxil and b/piet-gpu/shader/gen/draw_root.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index e6eccc1..da6c563 100644
Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 92fe05b..5d6f839 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -161,8 +161,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _291 : register(u0, space0);
-ByteAddressBuffer _1666 : register(t1, space0);
+RWByteAddressBuffer _297 : register(u0, space0);
+ByteAddressBuffer _1749 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -189,8 +189,8 @@ float4 spvUnpackUnorm4x8(uint value)
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _304 = { a.offset + offset };
-    return _304;
+    Alloc _310 = { a.offset + offset };
+    return _310;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -206,7 +206,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _291.Load(offset * 4 + 8);
+    uint v = _297.Load(offset * 4 + 8);
     return v;
 }
 
@@ -215,8 +215,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _663;
+    CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _669;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -236,9 +236,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _679 = { ref.offset + 4u };
+    CmdStrokeRef _685 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _679;
+    CmdStrokeRef param_1 = _685;
     return CmdStroke_read(param, param_1);
 }
 
@@ -274,8 +274,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _820 = { raw5 };
-    s.next = _820;
+    TileSegRef _826 = { raw5 };
+    s.next = _826;
     return s;
 }
 
@@ -301,9 +301,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _669 = { ref.offset + 4u };
+    CmdFillRef _675 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _669;
+    CmdFillRef param_1 = _675;
     return CmdFill_read(param, param_1);
 }
 
@@ -320,9 +320,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _689 = { ref.offset + 4u };
+    CmdAlphaRef _695 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _689;
+    CmdAlphaRef param_1 = _695;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -339,9 +339,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _699 = { ref.offset + 4u };
+    CmdColorRef _705 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _699;
+    CmdColorRef param_1 = _705;
     return CmdColor_read(param, param_1);
 }
 
@@ -385,9 +385,9 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _709 = { ref.offset + 4u };
+    CmdLinGradRef _715 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _709;
+    CmdLinGradRef param_1 = _715;
     return CmdLinGrad_read(param, param_1);
 }
 
@@ -439,9 +439,9 @@ CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
 
 CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
 {
-    CmdRadGradRef _719 = { ref.offset + 4u };
+    CmdRadGradRef _725 = { ref.offset + 4u };
     Alloc param = a;
-    CmdRadGradRef param_1 = _719;
+    CmdRadGradRef param_1 = _725;
     return CmdRadGrad_read(param, param_1);
 }
 
@@ -462,9 +462,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _729 = { ref.offset + 4u };
+    CmdImageRef _735 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _729;
+    CmdImageRef param_1 = _735;
     return CmdImage_read(param, param_1);
 }
 
@@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1721 = fromsRGB(param_1);
+        fg_rgba.x = _1721.x;
+        fg_rgba.y = _1721.y;
+        fg_rgba.z = _1721.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -514,9 +514,9 @@ CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _739 = { ref.offset + 4u };
+    CmdEndClipRef _745 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _739;
+    CmdEndClipRef param_1 = _745;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -706,8 +706,8 @@ float3 set_lum(float3 c, float l)
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1052 = clip_color(param_1);
+    return _1052;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -795,9 +795,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1343 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1343;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -807,9 +807,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1357 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1357;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -919,18 +919,56 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
         }
         case 13u:
         {
-            return float4(max(0.0f.xxxx, ((1.0f.xxxx - (float4(cs, as) * as)) + 1.0f.xxxx) - (float4(cb, ab) * ab)).xyz, max(0.0f, ((1.0f - as) + 1.0f) - ab));
+            float rev_as = 1.0f - as;
+            float rev_ab = 1.0f - ab;
+            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
         }
         case 14u:
         {
-            return float4(min(1.0f.xxxx, (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, min(1.0f, as + ab));
+            return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
+}
+
+float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
+{
+    if (mode == 3u)
+    {
+        return (backdrop * (1.0f - src.w)) + src;
+    }
+    float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cs;
+    float3 param_1 = cb;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = lerp(cs, blended, backdrop.w.xxx);
+    uint comp_mode = mode * 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
+        return float4(co, src.w + (backdrop.w * (1.0f - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
 }
 
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
@@ -946,24 +984,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _749 = { ref.offset + 4u };
+    CmdJumpRef _755 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _749;
+    CmdJumpRef param_1 = _755;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1681;
-    _1681.offset = _1666.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1764;
+    _1764.offset = _1749.Load(24);
     Alloc param;
-    param.offset = _1681.offset;
+    param.offset = _1764.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1690 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1690;
+    CmdRef _1773 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1773;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -972,7 +1010,7 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _291.Load(4) == 0u;
+    bool mem_ok = _297.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -997,8 +1035,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1784 = { stroke.tile_ref };
-                tile_seg_ref = _1784;
+                TileSegRef _1867 = { stroke.tile_ref };
+                tile_seg_ref = _1867;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1034,8 +1072,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1904 = { fill.tile_ref };
-                tile_seg_ref = _1904;
+                TileSegRef _1987 = { fill.tile_ref };
+                tile_seg_ref = _1987;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1124,10 +1162,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2321 = fromsRGB(param_29);
+                    fg_rgba.x = _2321.x;
+                    fg_rgba.y = _2321.y;
+                    fg_rgba.z = _2321.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1150,10 +1188,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2431 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2431.x;
+                    fg_rgba_1.y = _2431.y;
+                    fg_rgba_1.z = _2431.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1167,9 +1205,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2391[8];
-                fillImage(_2391, param_36, param_37);
-                float4 img[8] = _2391;
+                float4 _2474[8];
+                fillImage(_2474, param_36, param_37);
+                float4 img[8] = _2474;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1184,8 +1222,8 @@ void comp_main()
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2537 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2537;
                     rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -1206,32 +1244,20 @@ void comp_main()
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
-                cmd_ref = _2569;
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
+                cmd_ref = _2615;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1239,9 +1265,9 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        float3 param_53 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_52))] = float4(tosRGB(param_53), rgba[i_1].w);
+        uint param_47 = i_1;
+        float3 param_48 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 6489563..796043b 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -237,7 +237,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -245,29 +245,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_291.memory[offset];
+    uint v = v_297.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_291);
+    uint tag_and_flags = read_mem(param, param_1, v_297);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -275,11 +275,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_291);
+    return CmdStroke_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -291,27 +291,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -327,15 +327,15 @@ uint2 chunk_offset(thread const uint& i)
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -343,51 +343,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_291);
+    return CmdFill_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_291);
+    return CmdAlpha_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_291);
+    return CmdColor_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -408,21 +408,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -432,50 +432,50 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_291);
+    return CmdLinGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_291);
+    uint raw6 = read_mem(param_12, param_13, v_297);
     Alloc param_14 = a;
     uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_291);
+    uint raw7 = read_mem(param_14, param_15, v_297);
     Alloc param_16 = a;
     uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_291);
+    uint raw8 = read_mem(param_16, param_17, v_297);
     Alloc param_18 = a;
     uint param_19 = ix + 9u;
-    uint raw9 = read_mem(param_18, param_19, v_291);
+    uint raw9 = read_mem(param_18, param_19, v_297);
     Alloc param_20 = a;
     uint param_21 = ix + 10u;
-    uint raw10 = read_mem(param_20, param_21, v_291);
+    uint raw10 = read_mem(param_20, param_21, v_297);
     CmdRadGrad s;
     s.index = raw0;
     s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
@@ -487,23 +487,23 @@ CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
-    return CmdRadGrad_read(param, param_1, v_291);
+    return CmdRadGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -511,11 +511,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_291);
+    return CmdImage_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -528,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1721 = fromsRGB(param_1);
+        fg_rgba.x = _1721.x;
+        fg_rgba.y = _1721.y;
+        fg_rgba.z = _1721.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -555,23 +555,23 @@ uint packsRGB(thread float4& rgba)
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_291);
+    return CmdEndClip_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -771,8 +771,8 @@ float3 set_lum(thread const float3& c, thread const float& l)
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1052 = clip_color(param_1);
+    return _1052;
 }
 
 static inline __attribute__((always_inline))
@@ -861,9 +861,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1343 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1343;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -873,9 +873,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1357 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1357;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -986,45 +986,84 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
         }
         case 13u:
         {
-            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+            float rev_as = 1.0 - as;
+            float rev_ab = 1.0 - ab;
+            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
         }
         case 14u:
         {
-            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+            return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
+float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode)
+{
+    if (mode == 3u)
+    {
+        return (backdrop * (1.0 - src.w)) + src;
+    }
+    float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cs;
+    float3 param_1 = cb;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = mix(cs, blended, float3(backdrop.w));
+    uint comp_mode = mode * 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = mix(backdrop.xyz, cs, float3(src.w));
+        return float4(co, src.w + (backdrop.w * (1.0 - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_291);
+    return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1666.conf.ptcl_alloc.offset;
+    param.offset = _1749.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -1037,7 +1076,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_291.mem_error == 0u;
+    bool mem_ok = v_297.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -1046,7 +1085,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_297).tag;
         if (tag == 0u)
         {
             break;
@@ -1057,7 +1096,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1070,7 +1109,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_297);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1093,7 +1132,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1106,7 +1145,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1150,7 +1189,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1162,7 +1201,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_297);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1177,7 +1216,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1187,10 +1226,10 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2321 = fromsRGB(param_29);
+                    fg_rgba.x = _2321.x;
+                    fg_rgba.y = _2321.y;
+                    fg_rgba.z = _2321.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1201,7 +1240,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
                     uint param_32 = k_10;
@@ -1213,10 +1252,10 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2431 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2431.x;
+                    fg_rgba_1.y = _2431.y;
+                    fg_rgba_1.z = _2431.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1227,7 +1266,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_34 = cmd_alloc;
                 CmdRef param_35 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -1246,8 +1285,8 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2537 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2537;
                     rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
@@ -1258,7 +1297,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_39 = cmd_alloc;
                 CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
@@ -1268,31 +1307,19 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1300,9 +1327,9 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        float3 param_53 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_53), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
+        uint param_47 = i_1;
+        float3 param_48 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 7061263..b145245 100644
Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 046045f..abe1d22 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index 019a73c..f402268 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -161,8 +161,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _291 : register(u0, space0);
-ByteAddressBuffer _1666 : register(t1, space0);
+RWByteAddressBuffer _297 : register(u0, space0);
+ByteAddressBuffer _1749 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -189,8 +189,8 @@ float4 spvUnpackUnorm4x8(uint value)
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _304 = { a.offset + offset };
-    return _304;
+    Alloc _310 = { a.offset + offset };
+    return _310;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -206,7 +206,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _291.Load(offset * 4 + 8);
+    uint v = _297.Load(offset * 4 + 8);
     return v;
 }
 
@@ -215,8 +215,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _663;
+    CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _669;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -236,9 +236,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _679 = { ref.offset + 4u };
+    CmdStrokeRef _685 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _679;
+    CmdStrokeRef param_1 = _685;
     return CmdStroke_read(param, param_1);
 }
 
@@ -274,8 +274,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _820 = { raw5 };
-    s.next = _820;
+    TileSegRef _826 = { raw5 };
+    s.next = _826;
     return s;
 }
 
@@ -301,9 +301,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _669 = { ref.offset + 4u };
+    CmdFillRef _675 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _669;
+    CmdFillRef param_1 = _675;
     return CmdFill_read(param, param_1);
 }
 
@@ -320,9 +320,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _689 = { ref.offset + 4u };
+    CmdAlphaRef _695 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _689;
+    CmdAlphaRef param_1 = _695;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -339,9 +339,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _699 = { ref.offset + 4u };
+    CmdColorRef _705 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _699;
+    CmdColorRef param_1 = _705;
     return CmdColor_read(param, param_1);
 }
 
@@ -385,9 +385,9 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _709 = { ref.offset + 4u };
+    CmdLinGradRef _715 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _709;
+    CmdLinGradRef param_1 = _715;
     return CmdLinGrad_read(param, param_1);
 }
 
@@ -439,9 +439,9 @@ CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
 
 CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
 {
-    CmdRadGradRef _719 = { ref.offset + 4u };
+    CmdRadGradRef _725 = { ref.offset + 4u };
     Alloc param = a;
-    CmdRadGradRef param_1 = _719;
+    CmdRadGradRef param_1 = _725;
     return CmdRadGrad_read(param, param_1);
 }
 
@@ -462,9 +462,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _729 = { ref.offset + 4u };
+    CmdImageRef _735 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _729;
+    CmdImageRef param_1 = _735;
     return CmdImage_read(param, param_1);
 }
 
@@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1721 = fromsRGB(param_1);
+        fg_rgba.x = _1721.x;
+        fg_rgba.y = _1721.y;
+        fg_rgba.z = _1721.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -514,9 +514,9 @@ CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _739 = { ref.offset + 4u };
+    CmdEndClipRef _745 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _739;
+    CmdEndClipRef param_1 = _745;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -706,8 +706,8 @@ float3 set_lum(float3 c, float l)
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1052 = clip_color(param_1);
+    return _1052;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -795,9 +795,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1343 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1343;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -807,9 +807,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1357 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1357;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -919,18 +919,56 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
         }
         case 13u:
         {
-            return float4(max(0.0f.xxxx, ((1.0f.xxxx - (float4(cs, as) * as)) + 1.0f.xxxx) - (float4(cb, ab) * ab)).xyz, max(0.0f, ((1.0f - as) + 1.0f) - ab));
+            float rev_as = 1.0f - as;
+            float rev_ab = 1.0f - ab;
+            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
         }
         case 14u:
         {
-            return float4(min(1.0f.xxxx, (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, min(1.0f, as + ab));
+            return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
+}
+
+float4 mix_blend_compose(float4 backdrop, float4 src, uint mode)
+{
+    if (mode == 3u)
+    {
+        return (backdrop * (1.0f - src.w)) + src;
+    }
+    float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cs;
+    float3 param_1 = cb;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = lerp(cs, blended, backdrop.w.xxx);
+    uint comp_mode = mode * 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = lerp(backdrop.xyz, cs, src.w.xxx);
+        return float4(co, src.w + (backdrop.w * (1.0f - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
 }
 
 CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
@@ -946,24 +984,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _749 = { ref.offset + 4u };
+    CmdJumpRef _755 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _749;
+    CmdJumpRef param_1 = _755;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1681;
-    _1681.offset = _1666.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1764;
+    _1764.offset = _1749.Load(24);
     Alloc param;
-    param.offset = _1681.offset;
+    param.offset = _1764.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1690 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1690;
+    CmdRef _1773 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1773;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -972,7 +1010,7 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _291.Load(4) == 0u;
+    bool mem_ok = _297.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -997,8 +1035,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1784 = { stroke.tile_ref };
-                tile_seg_ref = _1784;
+                TileSegRef _1867 = { stroke.tile_ref };
+                tile_seg_ref = _1867;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1034,8 +1072,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1904 = { fill.tile_ref };
-                tile_seg_ref = _1904;
+                TileSegRef _1987 = { fill.tile_ref };
+                tile_seg_ref = _1987;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1124,10 +1162,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2321 = fromsRGB(param_29);
+                    fg_rgba.x = _2321.x;
+                    fg_rgba.y = _2321.y;
+                    fg_rgba.z = _2321.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1150,10 +1188,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2431 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2431.x;
+                    fg_rgba_1.y = _2431.y;
+                    fg_rgba_1.z = _2431.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1167,9 +1205,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2391[8];
-                fillImage(_2391, param_36, param_37);
-                float4 img[8] = _2391;
+                float4 _2474[8];
+                fillImage(_2474, param_36, param_37);
+                float4 img[8] = _2474;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1184,8 +1222,8 @@ void comp_main()
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2537 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2537;
                     rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
@@ -1206,32 +1244,20 @@ void comp_main()
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
-                cmd_ref = _2569;
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
+                cmd_ref = _2615;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1239,8 +1265,8 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        image[int2(xy_uint + chunk_offset(param_52))] = rgba[i_1].w.x;
+        uint param_47 = i_1;
+        image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 6402c6f..9647001 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -237,7 +237,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -245,29 +245,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_291.memory[offset];
+    uint v = v_297.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_291);
+    uint tag_and_flags = read_mem(param, param_1, v_297);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -275,11 +275,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_291);
+    return CmdStroke_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -291,27 +291,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -327,15 +327,15 @@ uint2 chunk_offset(thread const uint& i)
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -343,51 +343,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_291);
+    return CmdFill_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_291);
+    return CmdAlpha_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_291);
+    return CmdColor_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -408,21 +408,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -432,50 +432,50 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_291);
+    return CmdLinGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_291);
+    uint raw2 = read_mem(param_4, param_5, v_297);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_291);
+    uint raw3 = read_mem(param_6, param_7, v_297);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_291);
+    uint raw4 = read_mem(param_8, param_9, v_297);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_291);
+    uint raw5 = read_mem(param_10, param_11, v_297);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_291);
+    uint raw6 = read_mem(param_12, param_13, v_297);
     Alloc param_14 = a;
     uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_291);
+    uint raw7 = read_mem(param_14, param_15, v_297);
     Alloc param_16 = a;
     uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_291);
+    uint raw8 = read_mem(param_16, param_17, v_297);
     Alloc param_18 = a;
     uint param_19 = ix + 9u;
-    uint raw9 = read_mem(param_18, param_19, v_291);
+    uint raw9 = read_mem(param_18, param_19, v_297);
     Alloc param_20 = a;
     uint param_21 = ix + 10u;
-    uint raw10 = read_mem(param_20, param_21, v_291);
+    uint raw10 = read_mem(param_20, param_21, v_297);
     CmdRadGrad s;
     s.index = raw0;
     s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
@@ -487,23 +487,23 @@ CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
-    return CmdRadGrad_read(param, param_1, v_291);
+    return CmdRadGrad_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_291);
+    uint raw1 = read_mem(param_2, param_3, v_297);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -511,11 +511,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_291);
+    return CmdImage_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -528,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1638 = fromsRGB(param_1);
-        fg_rgba.x = _1638.x;
-        fg_rgba.y = _1638.y;
-        fg_rgba.z = _1638.z;
+        float3 _1721 = fromsRGB(param_1);
+        fg_rgba.x = _1721.x;
+        fg_rgba.y = _1721.y;
+        fg_rgba.z = _1721.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -555,23 +555,23 @@ uint packsRGB(thread float4& rgba)
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_291);
+    return CmdEndClip_read(param, param_1, v_297);
 }
 
 static inline __attribute__((always_inline))
@@ -771,8 +771,8 @@ float3 set_lum(thread const float3& c, thread const float& l)
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _1046 = clip_color(param_1);
-    return _1046;
+    float3 _1052 = clip_color(param_1);
+    return _1052;
 }
 
 static inline __attribute__((always_inline))
@@ -861,9 +861,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1337 = set_sat(param_21, param_22);
+            float3 _1343 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1337;
+            float3 param_24 = _1343;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -873,9 +873,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1351 = set_sat(param_27, param_28);
+            float3 _1357 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1351;
+            float3 param_30 = _1357;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -986,45 +986,84 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
         }
         case 13u:
         {
-            return float4(fast::max(float4(0.0), ((float4(1.0) - (float4(cs, as) * as)) + float4(1.0)) - (float4(cb, ab) * ab)).xyz, fast::max(0.0, ((1.0 - as) + 1.0) - ab));
+            float rev_as = 1.0 - as;
+            float rev_ab = 1.0 - ab;
+            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
         }
         case 14u:
         {
-            return float4(fast::min(float4(1.0), (float4(cs, as) * as) + (float4(cb, ab) * ab)).xyz, fast::min(1.0, as + ab));
+            return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
         default:
         {
             break;
         }
     }
-    return (float4(cs, as) * (as * fa)) + (float4(cb, ab) * (ab * fb));
+    float as_fa = as * fa;
+    float ab_fb = ab * fb;
+    float3 co = (cs * as_fa) + (cb * ab_fb);
+    return float4(co, as_fa + ab_fb);
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
+float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode)
+{
+    if (mode == 3u)
+    {
+        return (backdrop * (1.0 - src.w)) + src;
+    }
+    float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15);
+    float3 cs = src.xyz * inv_src_a;
+    float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15);
+    float3 cb = backdrop.xyz * inv_backdrop_a;
+    uint blend_mode = mode >> uint(8);
+    float3 param = cs;
+    float3 param_1 = cb;
+    uint param_2 = blend_mode;
+    float3 blended = mix_blend(param, param_1, param_2);
+    cs = mix(cs, blended, float3(backdrop.w));
+    uint comp_mode = mode * 255u;
+    if (comp_mode == 3u)
+    {
+        float3 co = mix(backdrop.xyz, cs, float3(src.w));
+        return float4(co, src.w + (backdrop.w * (1.0 - src.w)));
+    }
+    else
+    {
+        float3 param_3 = cb;
+        float3 param_4 = cs;
+        float param_5 = backdrop.w;
+        float param_6 = src.w;
+        uint param_7 = comp_mode;
+        return mix_compose(param_3, param_4, param_5, param_6, param_7);
+    }
+}
+
+static inline __attribute__((always_inline))
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_291);
+    uint raw0 = read_mem(param, param_1, v_297);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_291);
+    return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1666.conf.ptcl_alloc.offset;
+    param.offset = _1749.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -1037,7 +1076,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_291.mem_error == 0u;
+    bool mem_ok = v_297.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -1046,7 +1085,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_297).tag;
         if (tag == 0u)
         {
             break;
@@ -1057,7 +1096,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1070,7 +1109,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_297);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1093,7 +1132,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1106,7 +1145,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1150,7 +1189,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1162,7 +1201,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_297);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1177,7 +1216,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1187,10 +1226,10 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2238 = fromsRGB(param_29);
-                    fg_rgba.x = _2238.x;
-                    fg_rgba.y = _2238.y;
-                    fg_rgba.z = _2238.z;
+                    float3 _2321 = fromsRGB(param_29);
+                    fg_rgba.x = _2321.x;
+                    fg_rgba.y = _2321.y;
+                    fg_rgba.z = _2321.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1201,7 +1240,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
                     uint param_32 = k_10;
@@ -1213,10 +1252,10 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2348 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2348.x;
-                    fg_rgba_1.y = _2348.y;
-                    fg_rgba_1.z = _2348.z;
+                    float3 _2431 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2431.x;
+                    fg_rgba_1.y = _2431.y;
+                    fg_rgba_1.z = _2431.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1227,7 +1266,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_34 = cmd_alloc;
                 CmdRef param_35 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
                 spvUnsafeArray<float4, 8> img;
@@ -1246,8 +1285,8 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                 {
                     uint d_2 = min(clip_depth, 127u);
                     float4 param_38 = float4(rgba[k_12]);
-                    uint _2454 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2454;
+                    uint _2537 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2537;
                     rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
@@ -1258,7 +1297,7 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_39 = cmd_alloc;
                 CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
@@ -1268,31 +1307,19 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
                     uint param_41 = blend_stack[d_3][k_13];
                     float4 bg = unpacksRGB(param_41);
                     float4 fg_1 = rgba[k_13] * area[k_13];
-                    float3 param_42 = bg.xyz;
-                    float3 param_43 = fg_1.xyz;
-                    uint param_44 = blend_mode;
-                    float3 blend = mix_blend(param_42, param_43, param_44);
-                    float4 _2521 = fg_1;
-                    float _2525 = fg_1.w;
-                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
-                    fg_1.x = _2532.x;
-                    fg_1.y = _2532.y;
-                    fg_1.z = _2532.z;
-                    float3 param_45 = bg.xyz;
-                    float3 param_46 = fg_1.xyz;
-                    float param_47 = bg.w;
-                    float param_48 = fg_1.w;
-                    uint param_49 = comp_mode;
-                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
+                    float4 param_42 = bg;
+                    float4 param_43 = fg_1;
+                    uint param_44 = end_clip.blend;
+                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_50 = cmd_alloc;
-                CmdRef param_51 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
+                Alloc param_45 = cmd_alloc;
+                CmdRef param_46 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1300,8 +1327,8 @@ kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_52 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
+        uint param_47 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 4633401..2dd46c0 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
index 9fd593c..b6c9398 100644
Binary files a/piet-gpu/shader/gen/path_coarse.dxil and b/piet-gpu/shader/gen/path_coarse.dxil differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 6130712..7ce4684 100644
Binary files a/piet-gpu/shader/gen/pathseg.dxil and b/piet-gpu/shader/gen/pathseg.dxil differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index 4c2bd23..ff544b8 100644
Binary files a/piet-gpu/shader/gen/pathtag_reduce.dxil and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil
index 77f12e6..48584bd 100644
Binary files a/piet-gpu/shader/gen/pathtag_root.dxil and b/piet-gpu/shader/gen/pathtag_root.dxil differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
index 7759910..7b130e0 100644
Binary files a/piet-gpu/shader/gen/tile_alloc.dxil and b/piet-gpu/shader/gen/tile_alloc.dxil differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index f9f31e6..0c1e376 100644
Binary files a/piet-gpu/shader/gen/transform_leaf.dxil and b/piet-gpu/shader/gen/transform_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index 978dd98..fc3a311 100644
Binary files a/piet-gpu/shader/gen/transform_reduce.dxil and b/piet-gpu/shader/gen/transform_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil
index 5b4f059..a33ff7f 100644
Binary files a/piet-gpu/shader/gen/transform_root.dxil and b/piet-gpu/shader/gen/transform_root.dxil differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index c49e2fa..a0710d2 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -242,10 +242,7 @@ void main() {
                 uint d = min(clip_depth, MAX_BLEND_STACK - 1);
                 mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
                 mediump vec4 fg = rgba[k] * area[k];
-                vec3 blend = mix_blend(bg.rgb, fg.rgb, blend_mode);
-                // Apply the blend color only where the foreground and background overlap.
-                fg.rgb = mix(fg.rgb, blend, float((fg.a * bg.a) > 0.0));
-                rgba[k] = mix_compose(bg.rgb, fg.rgb, bg.a, fg.a, comp_mode);
+                rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
             }
             cmd_ref.offset += 4 + CmdEndClip_size;
             break;
diff --git a/piet-gpu/src/blend.rs b/piet-gpu/src/blend.rs
index aacf597..7edcb4e 100644
--- a/piet-gpu/src/blend.rs
+++ b/piet-gpu/src/blend.rs
@@ -51,8 +51,7 @@ pub enum CompositionMode {
     DestAtop = 10,
     Xor = 11,
     Plus = 12,
-    PlusDarker = 13,
-    PlusLighter = 14,
+    PlusLighter = 13,
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]