diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 3abb2e0..c93d002 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -151,6 +151,11 @@ void main() {
     uint part_start_ix = 0;
     uint ready_ix = 0;
 
+    cmd_ref.offset += 4;
+    // Accounting for allocation of blend memory
+    uint render_blend_depth = 0;
+    uint max_blend_depth = 0;
+
     uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
     uint drawtag_start = conf.drawtag_offset >> 2;
     uint drawdata_start = conf.drawdata_offset >> 2;
@@ -414,6 +419,8 @@ void main() {
                         }
                         Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
+                        render_blend_depth++;
+                        max_blend_depth = max(max_blend_depth, render_blend_depth);
                     }
                     clip_depth++;
                     break;
@@ -426,6 +433,7 @@ void main() {
                     uint blend = scene[dd];
                     Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                     cmd_ref.offset += 4 + CmdEndClip_size;
+                    render_blend_depth--;
                     break;
                 }
             } else {
@@ -451,5 +459,8 @@ void main() {
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
         Cmd_End_write(cmd_alloc, cmd_ref);
+        if (max_blend_depth > BLEND_STACK_SPLIT) {
+            // TODO: allocate blend memory and write result
+        }
     }
 }
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index 910925d..cbebec0 100644
Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index 04529bb..0519a63 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -177,7 +177,7 @@ static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
 RWByteAddressBuffer _260 : register(u0, space0);
 ByteAddressBuffer _1005 : register(t1, space0);
-ByteAddressBuffer _1372 : register(t2, space0);
+ByteAddressBuffer _1378 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -681,6 +681,9 @@ void comp_main()
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
     uint drawmonoid_start = _1005.Load(44) >> uint(2);
     uint drawtag_start = _1005.Load(100) >> uint(2);
     uint drawdata_start = _1005.Load(104) >> uint(2);
@@ -688,11 +691,11 @@ void comp_main()
     bool mem_ok = _260.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1304;
+    uint _1310;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1605;
+    uint _1611;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -702,34 +705,34 @@ void comp_main()
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1362;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1168 = _1160;
                 }
-                if (_1162)
+                if (_1168)
                 {
                     uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1179;
-                    _1179.offset = _1005.Load(20);
-                    param_3.offset = _1179.offset;
+                    Alloc _1185;
+                    _1185.offset = _1005.Load(20);
+                    param_3.offset = _1185.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1190;
-                    _1190.offset = _1005.Load(20);
-                    param_5.offset = _1190.offset;
+                    Alloc _1196;
+                    _1196.offset = _1005.Load(20);
+                    param_5.offset = _1196.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -775,16 +778,16 @@ void comp_main()
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1310;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1323 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1323;
+                BinInstanceRef _1329 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1329;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -794,16 +797,16 @@ void comp_main()
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1362 = _1352;
             }
-            if (_1356)
+            if (_1362)
             {
                 continue;
             }
@@ -816,7 +819,7 @@ void comp_main()
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1378.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
@@ -829,11 +832,11 @@ void comp_main()
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
                 uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
-                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
-                Alloc _1400;
-                _1400.offset = _1005.Load(16);
-                param_14.offset = _1400.offset;
-                PathRef param_15 = _1397;
+                PathRef _1403 = { _1005.Load(16) + (path_ix * 12u) };
+                Alloc _1406;
+                _1406.offset = _1005.Load(16);
+                param_14.offset = _1406.offset;
+                PathRef param_15 = _1403;
                 Path path = Path_read(param_14, param_15);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
@@ -889,16 +892,16 @@ void comp_main()
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1378.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1611 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -907,9 +910,9 @@ void comp_main()
             {
                 uint param_21 = el_ix;
                 bool param_22 = mem_ok;
-                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1663 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                 Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1657;
+                TileRef param_24 = _1663;
                 Tile tile = Tile_read(param_23, param_24);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
@@ -918,27 +921,27 @@ void comp_main()
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                     uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.Load(dd * 4 + 0);
+                    uint blend = _1378.Load(dd * 4 + 0);
                     is_blend = blend != 3u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1698 = tile.tile.offset != 0u;
+                bool _1707;
+                if (!_1698)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1707 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1707 = _1698;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1707 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
+                uint _1729;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1729);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -962,14 +965,14 @@ void comp_main()
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1378.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1806 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1800;
+                TileRef param_28 = _1806;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                 uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@@ -984,11 +987,11 @@ void comp_main()
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1854 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1854)
                         {
                             break;
                         }
@@ -998,11 +1001,11 @@ void comp_main()
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
-                        uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1871 = { rgba };
+                        uint rgba = _1378.Load(dd_1 * 4 + 0);
+                        CmdColor _1877 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1871;
+                        CmdColor param_38 = _1877;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -1012,11 +1015,11 @@ void comp_main()
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1895 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1895)
                         {
                             break;
                         }
@@ -1027,7 +1030,7 @@ void comp_main()
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_lin.index = _1378.Load(dd_1 * 4 + 0);
                         cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
                         cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
                         cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
@@ -1043,11 +1046,11 @@ void comp_main()
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1959 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1959)
                         {
                             break;
                         }
@@ -1058,7 +1061,7 @@ void comp_main()
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55);
                         cmd_ref = param_53;
-                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_rad.index = _1378.Load(dd_1 * 4 + 0);
                         cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
                         cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
                         cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
@@ -1077,11 +1080,11 @@ void comp_main()
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
+                        bool _2065 = alloc_cmd(param_59, param_60, param_61);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2065)
                         {
                             break;
                         }
@@ -1091,30 +1094,30 @@ void comp_main()
                         float param_65 = linewidth;
                         write_fill(param_62, param_63, param_64, param_65);
                         cmd_ref = param_63;
-                        uint index = _1372.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        uint index = _1378.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1378.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2098 = { index, offset_1 };
+                        CmdImage _2104 = { index, offset_1 };
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2098;
+                        CmdImage param_68 = _2104;
                         Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2118 = tile_1.tile.offset == 0u;
+                        bool _2124;
+                        if (_2118)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2124 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2124 = _2118;
                         }
-                        if (_2118)
+                        if (_2124)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1123,11 +1126,11 @@ void comp_main()
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
+                            bool _2136 = alloc_cmd(param_69, param_70, param_71);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2136)
                             {
                                 break;
                             }
@@ -1135,6 +1138,8 @@ void comp_main()
                             CmdRef param_73 = cmd_ref;
                             Cmd_BeginClip_write(param_72, param_73);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1145,11 +1150,11 @@ void comp_main()
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
+                        bool _2169 = alloc_cmd(param_74, param_75, param_76);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2169)
                         {
                             break;
                         }
@@ -1159,13 +1164,14 @@ void comp_main()
                         float param_80 = -1.0f;
                         write_fill(param_77, param_78, param_79, param_80);
                         cmd_ref = param_78;
-                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2181 = { blend_1 };
+                        uint blend_1 = _1378.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2192 = { blend_1 };
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2181;
+                        CmdEndClip param_83 = _2192;
                         Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1198,21 +1204,24 @@ void comp_main()
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2237;
-    if (_2228)
+    bool _2241 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2250;
+    if (_2241)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2250 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _2237 = _2228;
+        _2250 = _2241;
     }
-    if (_2237)
+    if (_2250)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
         Cmd_End_write(param_84, param_85);
+        if (max_blend_depth > 4u)
+        {
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 55812d4..578fa37 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -677,7 +677,7 @@ void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memor
     write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1378 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -713,6 +713,9 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
     uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
     uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
     uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
@@ -720,11 +723,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     bool mem_ok = v_260.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1304;
+    uint _1310;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1605;
+    uint _1611;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -734,24 +737,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1362;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1168 = _1160;
                 }
-                if (_1162)
+                if (_1168)
                 {
                     uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
                     param_3.offset = _1005.conf.bin_alloc.offset;
@@ -803,13 +806,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1310;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
@@ -821,16 +824,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1362 = _1352;
             }
-            if (_1356)
+            if (_1362)
             {
                 continue;
             }
@@ -843,7 +846,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.scene[drawtag_start + element_ix];
+            tag = _1378.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
@@ -913,16 +916,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1378.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1611 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -941,26 +944,26 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                     uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.scene[dd];
+                    uint blend = _1378.scene[dd];
                     is_blend = blend != 3u;
                 }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1698 = tile.tile.offset != 0u;
+                bool _1707;
+                if (!_1698)
                 {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1707 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1701 = _1692;
+                    _1707 = _1698;
                 }
-                include_tile = _1701 || is_blend;
+                include_tile = _1707 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1729 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -984,7 +987,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1378.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
@@ -1005,11 +1008,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
+                        bool _1854 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1854)
                         {
                             break;
                         }
@@ -1019,7 +1022,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
                         cmd_ref = param_33;
-                        uint rgba = _1372.scene[dd_1];
+                        uint rgba = _1378.scene[dd_1];
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
                         CmdColor param_38 = CmdColor{ rgba };
@@ -1032,11 +1035,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
+                        bool _1895 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1895)
                         {
                             break;
                         }
@@ -1047,7 +1050,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1372.scene[dd_1];
+                        cmd_lin.index = _1378.scene[dd_1];
                         cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
                         cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
                         cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
@@ -1063,11 +1066,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
+                        bool _1959 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1959)
                         {
                             break;
                         }
@@ -1078,7 +1081,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
                         cmd_ref = param_53;
-                        cmd_rad.index = _1372.scene[dd_1];
+                        cmd_rad.index = _1378.scene[dd_1];
                         cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
                         cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
                         cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
@@ -1097,11 +1100,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         Alloc param_59 = cmd_alloc;
                         CmdRef param_60 = cmd_ref;
                         uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        bool _2065 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                         cmd_alloc = param_59;
                         cmd_ref = param_60;
                         cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2065)
                         {
                             break;
                         }
@@ -1111,8 +1114,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         float param_65 = linewidth;
                         write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
                         cmd_ref = param_63;
-                        uint index = _1372.scene[dd_1];
-                        uint raw1 = _1372.scene[dd_1 + 1u];
+                        uint index = _1378.scene[dd_1];
+                        uint raw1 = _1378.scene[dd_1 + 1u];
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
                         Alloc param_66 = cmd_alloc;
                         CmdRef param_67 = cmd_ref;
@@ -1123,17 +1126,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     }
                     case 5u:
                     {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2118 = tile_1.tile.offset == 0u;
+                        bool _2124;
+                        if (_2118)
                         {
-                            _2118 = tile_1.backdrop == 0;
+                            _2124 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2118 = _2112;
+                            _2124 = _2118;
                         }
-                        if (_2118)
+                        if (_2124)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
@@ -1142,11 +1145,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                             Alloc param_69 = cmd_alloc;
                             CmdRef param_70 = cmd_ref;
                             uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            bool _2136 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                             cmd_alloc = param_69;
                             cmd_ref = param_70;
                             cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2136)
                             {
                                 break;
                             }
@@ -1154,6 +1157,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                             CmdRef param_73 = cmd_ref;
                             Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1164,11 +1169,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         Alloc param_74 = cmd_alloc;
                         CmdRef param_75 = cmd_ref;
                         uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        bool _2169 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                         cmd_alloc = param_74;
                         cmd_ref = param_75;
                         cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2169)
                         {
                             break;
                         }
@@ -1178,12 +1183,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         float param_80 = -1.0;
                         write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
                         cmd_ref = param_78;
-                        uint blend_1 = _1372.scene[dd_1];
+                        uint blend_1 = _1378.scene[dd_1];
                         Alloc param_81 = cmd_alloc;
                         CmdRef param_82 = cmd_ref;
                         CmdEndClip param_83 = CmdEndClip{ blend_1 };
                         Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1216,21 +1222,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             break;
         }
     }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2237;
-    if (_2228)
+    bool _2241 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2250;
+    if (_2241)
     {
-        _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2250 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _2237 = _2228;
+        _2250 = _2241;
     }
-    if (_2237)
+    if (_2250)
     {
         Alloc param_84 = cmd_alloc;
         CmdRef param_85 = cmd_ref;
         Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
+        if (max_blend_depth > 4u)
+        {
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 6d33ee7..718acca 100644
Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index da6c563..5617c51 100644
Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 5d6f839..30779b7 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -162,7 +162,7 @@ struct Config
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1725 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -918,12 +918,6 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
             break;
         }
         case 13u:
-        {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
         {
             return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
@@ -992,16 +986,18 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1740;
+    _1740.offset = _1725.Load(24);
     Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1740.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1749 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1749;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1014,7 +1010,9 @@ void comp_main()
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1035,8 +1033,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1854 = { stroke.tile_ref };
+                tile_seg_ref = _1854;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1072,8 +1070,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1974 = { fill.tile_ref };
+                tile_seg_ref = _1974;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1162,10 +1160,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1188,10 +1186,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1205,9 +1203,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2461[8];
+                fillImage(_2461, param_36, param_37);
+                float4 img[8] = _2461;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1218,13 +1216,26 @@ void comp_main()
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2566);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1232,32 +1243,41 @@ void comp_main()
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2665;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1265,9 +1285,9 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w);
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 796043b..6325914 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -528,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -985,12 +985,6 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
             break;
         }
         case 13u:
-        {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
         {
             return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
@@ -1059,15 +1053,17 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1725.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1080,7 +1076,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1226,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1252,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1281,13 +1279,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2566;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1295,31 +1306,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1327,9 +1347,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index b145245..978e0a2 100644
Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index abe1d22..37fe62c 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index f402268..5bd7b3b 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -162,7 +162,7 @@ struct Config
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1725 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -918,12 +918,6 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
             break;
         }
         case 13u:
-        {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
         {
             return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
         }
@@ -992,16 +986,18 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1740;
+    _1740.offset = _1725.Load(24);
     Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1740.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1749 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1749;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1014,7 +1010,9 @@ void comp_main()
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1035,8 +1033,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1854 = { stroke.tile_ref };
+                tile_seg_ref = _1854;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1072,8 +1070,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1974 = { fill.tile_ref };
+                tile_seg_ref = _1974;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1162,10 +1160,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1188,10 +1186,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1205,9 +1203,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2461[8];
+                fillImage(_2461, param_36, param_37);
+                float4 img[8] = _2461;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1218,13 +1216,26 @@ void comp_main()
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2566);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1232,32 +1243,41 @@ void comp_main()
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2665;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1265,8 +1285,8 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x;
+        uint param_48 = i_1;
+        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 9647001..2b550b8 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -528,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -985,12 +985,6 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
             break;
         }
         case 13u:
-        {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
         {
             return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
         }
@@ -1059,15 +1053,17 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1725.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1080,7 +1076,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1226,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1252,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1281,13 +1279,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2566;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1295,31 +1306,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1327,8 +1347,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 2dd46c0..bacd9a8 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index a0710d2..c9b5dd3 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -87,11 +87,14 @@ void main() {
     Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
 
+    uint blend_offset = memory[cmd_ref.offset >> 2];
+    cmd_ref.offset += 4;
+
     uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
                           gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
     mediump vec4 rgba[CHUNK];
-    uint blend_stack[MAX_BLEND_STACK][CHUNK];
+    uint blend_stack[BLEND_STACK_SPLIT][CHUNK];
     for (uint i = 0; i < CHUNK; i++) {
         rgba[i] = vec4(0.0);
     }
@@ -223,24 +226,38 @@ void main() {
             cmd_ref.offset += 4 + CmdImage_size;
             break;
         case Cmd_BeginClip:
-            for (uint k = 0; k < CHUNK; k++) {
-                // We reject any inputs that might overflow in render_ctx.rs.
-                // The following is a sanity check so we don't corrupt memory should there be malformed inputs.
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                blend_stack[d][k] = packsRGB(vec4(rgba[k]));
-                rgba[k] = vec4(0.0);
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
+            } else {
+                uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+                for (uint k = 0; k < CHUNK; k++) {
+                    memory[base_ix + k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
             }
             clip_depth++;
             cmd_ref.offset += 4;
             break;
         case Cmd_EndClip:
             CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
-            uint blend_mode = uint(end_clip.blend >> 8);
-            uint comp_mode = uint(end_clip.blend & 0xFF);
             clip_depth--;
+            uint base_ix;
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+            }
             for (uint k = 0; k < CHUNK; k++) {
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
+                uint bg_rgba;
+                if (clip_depth < BLEND_STACK_SPLIT) {
+                    bg_rgba = blend_stack[clip_depth][k];
+                } else {
+                    bg_rgba = memory[base_ix + k];
+                }
+                mediump vec4 bg = unpacksRGB(bg_rgba);
                 mediump vec4 fg = rgba[k] * area[k];
                 rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
             }
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index ec17188..21206e5 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -27,6 +27,10 @@
 
 #define GRADIENT_WIDTH 512
 
+// We allocate this many blend stack entries in registers, and spill
+// to memory for the overflow.
+#define BLEND_STACK_SPLIT 4
+
 #ifdef ERR_MALLOC_FAILED
 struct Config {
     uint n_elements; // paths
@@ -91,7 +95,7 @@ struct Config {
 #define MODE_STROKE 1
 
 // Size of kernel4 clip state, in words.
-#define CLIP_STATE_SIZE 2
+#define CLIP_STATE_SIZE 1
 
 // fill_mode_from_flags extracts the fill mode from tag flags.
 uint fill_mode_from_flags(uint flags) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index d32a9c5..8915de4 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -34,8 +34,6 @@ const TILE_H: usize = 16;
 
 const PTCL_INITIAL_ALLOC: usize = 1024;
 
-const MAX_BLEND_STACK: usize = 128;
-
 #[allow(unused)]
 fn dump_scene(buf: &[u8]) {
     for i in 0..(buf.len() / 4) {
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index dca03eb..f78e8ab 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -2,7 +2,6 @@ use std::borrow::Cow;
 
 use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
-use crate::MAX_BLEND_STACK;
 use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
 use piet::{
     Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@@ -230,9 +229,6 @@ impl RenderContext for PietGpuRenderContext {
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(None);
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: None });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
@@ -334,9 +330,6 @@ impl PietGpuRenderContext {
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(Some(blend));
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: Some(blend) });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;