diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 1b3f252..fc6df21 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -151,6 +151,12 @@ void main() {
     uint part_start_ix = 0;
     uint ready_ix = 0;
 
+    Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size);
+    cmd_ref.offset += 4;
+    // Accounting for allocation of blend memory
+    uint render_blend_depth = 0;
+    uint max_blend_depth = 0;
+
     uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
     uint drawtag_start = conf.drawtag_offset >> 2;
     uint drawdata_start = conf.drawdata_offset >> 2;
@@ -414,6 +420,8 @@ void main() {
                         }
                         Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
+                        render_blend_depth++;
+                        max_blend_depth = max(max_blend_depth, render_blend_depth);
                     }
                     clip_depth++;
                     break;
@@ -426,6 +434,7 @@ void main() {
                     uint blend = scene[dd];
                     Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                     cmd_ref.offset += 4 + CmdEndClip_size;
+                    render_blend_depth--;
                     break;
                 }
             } else {
@@ -451,5 +460,10 @@ void main() {
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
         Cmd_End_write(cmd_alloc, cmd_ref);
+        if (max_blend_depth > BLEND_STACK_SPLIT) {
+            uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
+            MallocResult scratch = malloc(scratch_size);
+            alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
+        }
     }
 }
diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
index df2be88..0fb9622 100644
Binary files a/piet-gpu/shader/gen/backdrop.dxil and b/piet-gpu/shader/gen/backdrop.dxil differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
index 81f9b65..e24a6d3 100644
Binary files a/piet-gpu/shader/gen/backdrop_lg.dxil and b/piet-gpu/shader/gen/backdrop_lg.dxil differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 6b3efaf..6655b7f 100644
Binary files a/piet-gpu/shader/gen/bbox_clear.dxil and b/piet-gpu/shader/gen/bbox_clear.dxil differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
index b681a65..29a158e 100644
Binary files a/piet-gpu/shader/gen/clip_leaf.dxil and b/piet-gpu/shader/gen/clip_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil
index 0ccaac9..0dff71b 100644
Binary files a/piet-gpu/shader/gen/clip_reduce.dxil and b/piet-gpu/shader/gen/clip_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index c91fcdf..f71cc04 100644
Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index 0331e33..a7f769f 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -175,9 +175,9 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _260 : register(u0, space0);
-ByteAddressBuffer _1005 : register(t1, space0);
-ByteAddressBuffer _1372 : register(t2, space0);
+RWByteAddressBuffer _266 : register(u0, space0);
+ByteAddressBuffer _1020 : register(t1, space0);
+ByteAddressBuffer _1399 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -200,8 +200,8 @@ groupshared uint sh_tile_count[256];
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _337 = { a.offset + offset };
-    return _337;
+    Alloc _343 = { a.offset + offset };
+    return _343;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -217,7 +217,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _260.Load(offset * 4 + 8);
+    uint v = _266.Load(offset * 4 + 8);
     return v;
 }
 
@@ -230,8 +230,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok)
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _346 = { ref.offset + (index * 4u) };
-    return _346;
+    BinInstanceRef _361 = { ref.offset + (index * 4u) };
+    return _361;
 }
 
 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@@ -259,8 +259,8 @@ Path Path_read(Alloc a, PathRef ref)
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _409 = { raw2 };
-    s.tiles = _409;
+    TileRef _424 = { raw2 };
+    s.tiles = _424;
     return s;
 }
 
@@ -270,11 +270,11 @@ void write_tile_alloc(uint el_ix, Alloc a)
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _892;
-    _260.GetDimensions(_892);
-    _892 = (_892 - 8) / 4;
+    uint _907;
+    _266.GetDimensions(_907);
+    _907 = (_907 - 8) / 4;
     uint param = 0u;
-    uint param_1 = uint(int(_892) * 4);
+    uint param_1 = uint(int(_907) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
@@ -288,31 +288,31 @@ Tile Tile_read(Alloc a, TileRef ref)
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
     uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _434 = { raw0 };
+    TileSegRef _449 = { raw0 };
     Tile s;
-    s.tile = _434;
+    s.tile = _449;
     s.backdrop = int(raw1);
     return s;
 }
 
 MallocResult malloc(uint size)
 {
-    uint _266;
-    _260.InterlockedAdd(0, size, _266);
-    uint offset = _266;
-    uint _273;
-    _260.GetDimensions(_273);
-    _273 = (_273 - 8) / 4;
+    uint _272;
+    _266.InterlockedAdd(0, size, _272);
+    uint offset = _272;
+    uint _279;
+    _266.GetDimensions(_279);
+    _279 = (_279 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_273) * 4);
+    r.failed = (offset + size) > uint(int(_279) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _295;
-        _260.InterlockedMax(4, 1u, _295);
+        uint _301;
+        _266.InterlockedMax(4, 1u, _301);
         return r;
     }
     return r;
@@ -326,7 +326,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _260.Store(offset * 4 + 8, val);
+    _266.Store(offset * 4 + 8, val);
 }
 
 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@@ -344,9 +344,9 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 11u;
     write_mem(param, param_1, param_2);
-    CmdJumpRef _885 = { ref.offset + 4u };
+    CmdJumpRef _900 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdJumpRef param_4 = _885;
+    CmdJumpRef param_4 = _900;
     CmdJump param_5 = s;
     CmdJump_write(param_3, param_4, param_5);
 }
@@ -358,21 +358,21 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
         return true;
     }
     uint param = 1024u;
-    MallocResult _913 = malloc(param);
-    MallocResult new_cmd = _913;
+    MallocResult _928 = malloc(param);
+    MallocResult new_cmd = _928;
     if (new_cmd.failed)
     {
         return false;
     }
-    CmdJump _923 = { new_cmd.alloc.offset };
-    CmdJump jump = _923;
+    CmdJump _938 = { new_cmd.alloc.offset };
+    CmdJump jump = _938;
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
     Cmd_Jump_write(param_1, param_2, param_3);
     cmd_alloc = new_cmd.alloc;
-    CmdRef _935 = { cmd_alloc.offset };
-    cmd_ref = _935;
+    CmdRef _950 = { cmd_alloc.offset };
+    cmd_ref = _950;
     cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
     return true;
 }
@@ -396,9 +396,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
     write_mem(param, param_1, param_2);
-    CmdFillRef _742 = { ref.offset + 4u };
+    CmdFillRef _757 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdFillRef param_4 = _742;
+    CmdFillRef param_4 = _757;
     CmdFill param_5 = s;
     CmdFill_write(param_3, param_4, param_5);
 }
@@ -430,9 +430,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
     write_mem(param, param_1, param_2);
-    CmdStrokeRef _760 = { ref.offset + 4u };
+    CmdStrokeRef _775 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdStrokeRef param_4 = _760;
+    CmdStrokeRef param_4 = _775;
     CmdStroke param_5 = s;
     CmdStroke_write(param_3, param_4, param_5);
 }
@@ -443,8 +443,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
     {
         if (tile.tile.offset != 0u)
         {
-            CmdFill _958 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _958;
+            CmdFill _973 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _973;
             Alloc param = alloc;
             CmdRef param_1 = cmd_ref;
             CmdFill param_2 = cmd_fill;
@@ -461,8 +461,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
     }
     else
     {
-        CmdStroke _988 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _988;
+        CmdStroke _1003 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _1003;
         Alloc param_5 = alloc;
         CmdRef param_6 = cmd_ref;
         CmdStroke param_7 = cmd_stroke;
@@ -486,9 +486,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
     write_mem(param, param_1, param_2);
-    CmdColorRef _786 = { ref.offset + 4u };
+    CmdColorRef _801 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdColorRef param_4 = _786;
+    CmdColorRef param_4 = _801;
     CmdColor param_5 = s;
     CmdColor_write(param_3, param_4, param_5);
 }
@@ -520,9 +520,9 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
     write_mem(param, param_1, param_2);
-    CmdLinGradRef _804 = { ref.offset + 4u };
+    CmdLinGradRef _819 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdLinGradRef param_4 = _804;
+    CmdLinGradRef param_4 = _819;
     CmdLinGrad param_5 = s;
     CmdLinGrad_write(param_3, param_4, param_5);
 }
@@ -582,9 +582,9 @@ void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
     write_mem(param, param_1, param_2);
-    CmdRadGradRef _822 = { ref.offset + 4u };
+    CmdRadGradRef _837 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdRadGradRef param_4 = _822;
+    CmdRadGradRef param_4 = _837;
     CmdRadGrad param_5 = s;
     CmdRadGrad_write(param_3, param_4, param_5);
 }
@@ -608,9 +608,9 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
     write_mem(param, param_1, param_2);
-    CmdImageRef _840 = { ref.offset + 4u };
+    CmdImageRef _855 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdImageRef param_4 = _840;
+    CmdImageRef param_4 = _855;
     CmdImage param_5 = s;
     CmdImage_write(param_3, param_4, param_5);
 }
@@ -638,9 +638,9 @@ void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
     write_mem(param, param_1, param_2);
-    CmdEndClipRef _866 = { ref.offset + 4u };
+    CmdEndClipRef _881 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdEndClipRef param_4 = _866;
+    CmdEndClipRef param_4 = _881;
     CmdEndClip param_5 = s;
     CmdEndClip_write(param_3, param_4, param_5);
 }
@@ -653,27 +653,35 @@ void Cmd_End_write(Alloc a, CmdRef ref)
     write_mem(param, param_1, param_2);
 }
 
+void alloc_write(Alloc a, uint offset, Alloc alloc)
+{
+    Alloc param = a;
+    uint param_1 = offset >> uint(2);
+    uint param_2 = alloc.offset;
+    write_mem(param, param_1, param_2);
+}
+
 void comp_main()
 {
-    uint width_in_bins = ((_1005.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_1020.Load(8) + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1005.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1020.Load(0) + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _1070;
-    _1070.offset = _1005.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _1085;
+    _1085.offset = _1020.Load(24);
     Alloc param;
-    param.offset = _1070.offset;
+    param.offset = _1085.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1079 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1079;
+    CmdRef _1094 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1094;
     uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
@@ -681,18 +689,25 @@ void comp_main()
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    uint drawmonoid_start = _1005.Load(44) >> uint(2);
-    uint drawtag_start = _1005.Load(100) >> uint(2);
-    uint drawdata_start = _1005.Load(104) >> uint(2);
-    uint drawinfo_start = _1005.Load(68) >> uint(2);
-    bool mem_ok = _260.Load(4) == 0u;
-    Alloc param_3;
-    Alloc param_5;
-    uint _1304;
+    Alloc param_3 = cmd_alloc;
+    uint param_4 = 0u;
+    uint param_5 = 8u;
+    Alloc scratch_alloc = slice_mem(param_3, param_4, param_5);
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
+    uint drawmonoid_start = _1020.Load(44) >> uint(2);
+    uint drawtag_start = _1020.Load(100) >> uint(2);
+    uint drawdata_start = _1020.Load(104) >> uint(2);
+    uint drawinfo_start = _1020.Load(68) >> uint(2);
+    bool mem_ok = _266.Load(4) == 0u;
+    Alloc param_6;
+    Alloc param_8;
+    uint _1331;
     uint element_ix;
-    Alloc param_14;
+    Alloc param_17;
     uint tile_count;
-    uint _1605;
+    uint _1632;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -702,40 +717,40 @@ void comp_main()
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1383;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1181 = th_ix < 256u;
+                bool _1189;
+                if (_1181)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1189 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1189 = _1181;
                 }
-                if (_1162)
+                if (_1189)
                 {
-                    uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1179;
-                    _1179.offset = _1005.Load(20);
-                    param_3.offset = _1179.offset;
-                    uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4);
-                    Alloc _1190;
-                    _1190.offset = _1005.Load(20);
-                    param_5.offset = _1190.offset;
-                    uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6);
-                    uint param_7 = offset;
-                    uint param_8 = count * 4u;
-                    bool param_9 = mem_ok;
-                    sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9);
+                    uint in_ix = (_1020.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1206;
+                    _1206.offset = _1020.Load(20);
+                    param_6.offset = _1206.offset;
+                    uint param_7 = in_ix;
+                    count = read_mem(param_6, param_7);
+                    Alloc _1217;
+                    _1217.offset = _1020.Load(20);
+                    param_8.offset = _1217.offset;
+                    uint param_9 = in_ix + 1u;
+                    uint offset = read_mem(param_8, param_9);
+                    uint param_10 = offset;
+                    uint param_11 = count * 4u;
+                    bool param_12 = mem_ok;
+                    sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12);
                 }
                 for (uint i_1 = 0u; i_1 < 8u; i_1++)
                 {
@@ -775,35 +790,35 @@ void comp_main()
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1331 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1331 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1331;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1323 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1323;
-                BinInstanceRef param_10 = inst_ref;
-                uint param_11 = ix;
-                Alloc param_12 = bin_alloc;
-                BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13);
+                BinInstanceRef _1350 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1350;
+                BinInstanceRef param_13 = inst_ref;
+                uint param_14 = ix;
+                Alloc param_15 = bin_alloc;
+                BinInstanceRef param_16 = BinInstance_index(param_13, param_14);
+                BinInstance inst = BinInstance_read(param_15, param_16);
                 sh_elements[th_ix] = inst.element_ix;
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1373 = (wr_ix - rd_ix) < 256u;
+            if (_1373)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1383 = _1373;
             }
-            if (_1356)
+            if (_1383)
             {
                 continue;
             }
@@ -816,7 +831,7 @@ void comp_main()
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1399.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
@@ -828,13 +843,13 @@ void comp_main()
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
-                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
-                Alloc _1400;
-                _1400.offset = _1005.Load(16);
-                param_14.offset = _1400.offset;
-                PathRef param_15 = _1397;
-                Path path = Path_read(param_14, param_15);
+                uint path_ix = _266.Load(drawmonoid_base * 4 + 8);
+                PathRef _1424 = { _1020.Load(16) + (path_ix * 12u) };
+                Alloc _1427;
+                _1427.offset = _1020.Load(16);
+                param_17.offset = _1427.offset;
+                PathRef param_18 = _1424;
+                Path path = Path_read(param_17, param_18);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -849,13 +864,13 @@ void comp_main()
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_16 = path.tiles.offset;
-                uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_18 = mem_ok;
-                Alloc path_alloc = new_alloc(param_16, param_17, param_18);
-                uint param_19 = th_ix;
-                Alloc param_20 = path_alloc;
-                write_tile_alloc(param_19, param_20);
+                uint param_19 = path.tiles.offset;
+                uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_21 = mem_ok;
+                Alloc path_alloc = new_alloc(param_19, param_20, param_21);
+                uint param_22 = th_ix;
+                Alloc param_23 = path_alloc;
+                write_tile_alloc(param_22, param_23);
                 break;
             }
             default:
@@ -889,56 +904,56 @@ void comp_main()
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1399.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1632 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1632 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1632;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
             if (mem_ok)
             {
-                uint param_21 = el_ix;
-                bool param_22 = mem_ok;
-                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1657;
-                Tile tile = Tile_read(param_23, param_24);
+                uint param_24 = el_ix;
+                bool param_25 = mem_ok;
+                TileRef _1684 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                Alloc param_26 = read_tile_alloc(param_24, param_25);
+                TileRef param_27 = _1684;
+                Tile tile = Tile_read(param_26, param_27);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
                 if (is_clip)
                 {
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
+                    uint scene_offset = _266.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.Load(dd * 4 + 0);
+                    uint blend = _1399.Load(dd * 4 + 0);
                     is_blend = blend != 32771u;
                 }
-                bool _1693 = tile.tile.offset != 0u;
-                bool _1702;
-                if (!_1693)
+                bool _1720 = tile.tile.offset != 0u;
+                bool _1729;
+                if (!_1720)
                 {
-                    _1702 = (tile.backdrop == 0) == is_clip;
+                    _1729 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1702 = _1693;
+                    _1729 = _1720;
                 }
-                include_tile = _1702 || is_blend;
+                include_tile = _1729 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1724;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1724);
+                uint _1751;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1751);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -962,179 +977,181 @@ void comp_main()
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1399.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
-                uint param_25 = element_ref_ix;
-                bool param_26 = mem_ok;
-                TileRef _1801 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1801;
-                Tile tile_1 = Tile_read(param_27, param_28);
+                uint param_28 = element_ref_ix;
+                bool param_29 = mem_ok;
+                TileRef _1828 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Alloc param_30 = read_tile_alloc(param_28, param_29);
+                TileRef param_31 = _1828;
+                Tile tile_1 = Tile_read(param_30, param_31);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
-                uint info_offset = _260.Load((drawmonoid_base_2 + 3u) * 4 + 8);
+                uint scene_offset_1 = _266.Load((drawmonoid_base_2 + 2u) * 4 + 8);
+                uint info_offset = _266.Load((drawmonoid_base_2 + 3u) * 4 + 8);
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = asfloat(_260.Load(di * 4 + 8));
-                        Alloc param_29 = cmd_alloc;
-                        CmdRef param_30 = cmd_ref;
-                        uint param_31 = cmd_limit;
-                        bool _1849 = alloc_cmd(param_29, param_30, param_31);
-                        cmd_alloc = param_29;
-                        cmd_ref = param_30;
-                        cmd_limit = param_31;
-                        if (!_1849)
+                        linewidth = asfloat(_266.Load(di * 4 + 8));
+                        Alloc param_32 = cmd_alloc;
+                        CmdRef param_33 = cmd_ref;
+                        uint param_34 = cmd_limit;
+                        bool _1876 = alloc_cmd(param_32, param_33, param_34);
+                        cmd_alloc = param_32;
+                        cmd_ref = param_33;
+                        cmd_limit = param_34;
+                        if (!_1876)
                         {
                             break;
                         }
-                        Alloc param_32 = cmd_alloc;
-                        CmdRef param_33 = cmd_ref;
-                        Tile param_34 = tile_1;
-                        float param_35 = linewidth;
-                        write_fill(param_32, param_33, param_34, param_35);
-                        cmd_ref = param_33;
-                        uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1872 = { rgba };
-                        Alloc param_36 = cmd_alloc;
-                        CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1872;
-                        Cmd_Color_write(param_36, param_37, param_38);
+                        Alloc param_35 = cmd_alloc;
+                        CmdRef param_36 = cmd_ref;
+                        Tile param_37 = tile_1;
+                        float param_38 = linewidth;
+                        write_fill(param_35, param_36, param_37, param_38);
+                        cmd_ref = param_36;
+                        uint rgba = _1399.Load(dd_1 * 4 + 0);
+                        CmdColor _1899 = { rgba };
+                        Alloc param_39 = cmd_alloc;
+                        CmdRef param_40 = cmd_ref;
+                        CmdColor param_41 = _1899;
+                        Cmd_Color_write(param_39, param_40, param_41);
                         cmd_ref.offset += 8u;
                         break;
                     }
                     case 276u:
                     {
-                        Alloc param_39 = cmd_alloc;
-                        CmdRef param_40 = cmd_ref;
-                        uint param_41 = cmd_limit;
-                        bool _1890 = alloc_cmd(param_39, param_40, param_41);
-                        cmd_alloc = param_39;
-                        cmd_ref = param_40;
-                        cmd_limit = param_41;
-                        if (!_1890)
+                        Alloc param_42 = cmd_alloc;
+                        CmdRef param_43 = cmd_ref;
+                        uint param_44 = cmd_limit;
+                        bool _1917 = alloc_cmd(param_42, param_43, param_44);
+                        cmd_alloc = param_42;
+                        cmd_ref = param_43;
+                        cmd_limit = param_44;
+                        if (!_1917)
                         {
                             break;
                         }
-                        linewidth = asfloat(_260.Load(di * 4 + 8));
-                        Alloc param_42 = cmd_alloc;
-                        CmdRef param_43 = cmd_ref;
-                        Tile param_44 = tile_1;
-                        float param_45 = linewidth;
-                        write_fill(param_42, param_43, param_44, param_45);
-                        cmd_ref = param_43;
-                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
-                        cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
-                        cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
-                        cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
-                        Alloc param_46 = cmd_alloc;
-                        CmdRef param_47 = cmd_ref;
-                        CmdLinGrad param_48 = cmd_lin;
-                        Cmd_LinGrad_write(param_46, param_47, param_48);
+                        linewidth = asfloat(_266.Load(di * 4 + 8));
+                        Alloc param_45 = cmd_alloc;
+                        CmdRef param_46 = cmd_ref;
+                        Tile param_47 = tile_1;
+                        float param_48 = linewidth;
+                        write_fill(param_45, param_46, param_47, param_48);
+                        cmd_ref = param_46;
+                        cmd_lin.index = _1399.Load(dd_1 * 4 + 0);
+                        cmd_lin.line_x = asfloat(_266.Load((di + 1u) * 4 + 8));
+                        cmd_lin.line_y = asfloat(_266.Load((di + 2u) * 4 + 8));
+                        cmd_lin.line_c = asfloat(_266.Load((di + 3u) * 4 + 8));
+                        Alloc param_49 = cmd_alloc;
+                        CmdRef param_50 = cmd_ref;
+                        CmdLinGrad param_51 = cmd_lin;
+                        Cmd_LinGrad_write(param_49, param_50, param_51);
                         cmd_ref.offset += 20u;
                         break;
                     }
                     case 732u:
                     {
-                        Alloc param_49 = cmd_alloc;
-                        CmdRef param_50 = cmd_ref;
-                        uint param_51 = cmd_limit;
-                        bool _1954 = alloc_cmd(param_49, param_50, param_51);
-                        cmd_alloc = param_49;
-                        cmd_ref = param_50;
-                        cmd_limit = param_51;
-                        if (!_1954)
+                        Alloc param_52 = cmd_alloc;
+                        CmdRef param_53 = cmd_ref;
+                        uint param_54 = cmd_limit;
+                        bool _1981 = alloc_cmd(param_52, param_53, param_54);
+                        cmd_alloc = param_52;
+                        cmd_ref = param_53;
+                        cmd_limit = param_54;
+                        if (!_1981)
                         {
                             break;
                         }
-                        linewidth = asfloat(_260.Load(di * 4 + 8));
-                        Alloc param_52 = cmd_alloc;
-                        CmdRef param_53 = cmd_ref;
-                        Tile param_54 = tile_1;
-                        float param_55 = linewidth;
-                        write_fill(param_52, param_53, param_54, param_55);
-                        cmd_ref = param_53;
-                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
-                        cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
-                        cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
-                        cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
-                        cmd_rad.ra = asfloat(_260.Load((di + 9u) * 4 + 8));
-                        cmd_rad.roff = asfloat(_260.Load((di + 10u) * 4 + 8));
-                        Alloc param_56 = cmd_alloc;
-                        CmdRef param_57 = cmd_ref;
-                        CmdRadGrad param_58 = cmd_rad;
-                        Cmd_RadGrad_write(param_56, param_57, param_58);
+                        linewidth = asfloat(_266.Load(di * 4 + 8));
+                        Alloc param_55 = cmd_alloc;
+                        CmdRef param_56 = cmd_ref;
+                        Tile param_57 = tile_1;
+                        float param_58 = linewidth;
+                        write_fill(param_55, param_56, param_57, param_58);
+                        cmd_ref = param_56;
+                        cmd_rad.index = _1399.Load(dd_1 * 4 + 0);
+                        cmd_rad.mat = asfloat(uint4(_266.Load((di + 1u) * 4 + 8), _266.Load((di + 2u) * 4 + 8), _266.Load((di + 3u) * 4 + 8), _266.Load((di + 4u) * 4 + 8)));
+                        cmd_rad.xlat = asfloat(uint2(_266.Load((di + 5u) * 4 + 8), _266.Load((di + 6u) * 4 + 8)));
+                        cmd_rad.c1 = asfloat(uint2(_266.Load((di + 7u) * 4 + 8), _266.Load((di + 8u) * 4 + 8)));
+                        cmd_rad.ra = asfloat(_266.Load((di + 9u) * 4 + 8));
+                        cmd_rad.roff = asfloat(_266.Load((di + 10u) * 4 + 8));
+                        Alloc param_59 = cmd_alloc;
+                        CmdRef param_60 = cmd_ref;
+                        CmdRadGrad param_61 = cmd_rad;
+                        Cmd_RadGrad_write(param_59, param_60, param_61);
                         cmd_ref.offset += 48u;
                         break;
                     }
                     case 72u:
                     {
-                        linewidth = asfloat(_260.Load(di * 4 + 8));
-                        Alloc param_59 = cmd_alloc;
-                        CmdRef param_60 = cmd_ref;
-                        uint param_61 = cmd_limit;
-                        bool _2060 = alloc_cmd(param_59, param_60, param_61);
-                        cmd_alloc = param_59;
-                        cmd_ref = param_60;
-                        cmd_limit = param_61;
-                        if (!_2060)
+                        linewidth = asfloat(_266.Load(di * 4 + 8));
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        uint param_64 = cmd_limit;
+                        bool _2087 = alloc_cmd(param_62, param_63, param_64);
+                        cmd_alloc = param_62;
+                        cmd_ref = param_63;
+                        cmd_limit = param_64;
+                        if (!_2087)
                         {
                             break;
                         }
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        Tile param_64 = tile_1;
-                        float param_65 = linewidth;
-                        write_fill(param_62, param_63, param_64, param_65);
-                        cmd_ref = param_63;
-                        uint index = _1372.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        Alloc param_65 = cmd_alloc;
+                        CmdRef param_66 = cmd_ref;
+                        Tile param_67 = tile_1;
+                        float param_68 = linewidth;
+                        write_fill(param_65, param_66, param_67, param_68);
+                        cmd_ref = param_66;
+                        uint index = _1399.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1399.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2099 = { index, offset_1 };
-                        Alloc param_66 = cmd_alloc;
-                        CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2099;
-                        Cmd_Image_write(param_66, param_67, param_68);
+                        CmdImage _2126 = { index, offset_1 };
+                        Alloc param_69 = cmd_alloc;
+                        CmdRef param_70 = cmd_ref;
+                        CmdImage param_71 = _2126;
+                        Cmd_Image_write(param_69, param_70, param_71);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2113 = tile_1.tile.offset == 0u;
-                        bool _2119;
-                        if (_2113)
+                        bool _2140 = tile_1.tile.offset == 0u;
+                        bool _2146;
+                        if (_2140)
                         {
-                            _2119 = tile_1.backdrop == 0;
+                            _2146 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2119 = _2113;
+                            _2146 = _2140;
                         }
-                        if (_2119)
+                        if (_2146)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_69 = cmd_alloc;
-                            CmdRef param_70 = cmd_ref;
-                            uint param_71 = cmd_limit;
-                            bool _2131 = alloc_cmd(param_69, param_70, param_71);
-                            cmd_alloc = param_69;
-                            cmd_ref = param_70;
-                            cmd_limit = param_71;
-                            if (!_2131)
+                            Alloc param_72 = cmd_alloc;
+                            CmdRef param_73 = cmd_ref;
+                            uint param_74 = cmd_limit;
+                            bool _2158 = alloc_cmd(param_72, param_73, param_74);
+                            cmd_alloc = param_72;
+                            cmd_ref = param_73;
+                            cmd_limit = param_74;
+                            if (!_2158)
                             {
                                 break;
                             }
-                            Alloc param_72 = cmd_alloc;
-                            CmdRef param_73 = cmd_ref;
-                            Cmd_BeginClip_write(param_72, param_73);
+                            Alloc param_75 = cmd_alloc;
+                            CmdRef param_76 = cmd_ref;
+                            Cmd_BeginClip_write(param_75, param_76);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1142,30 +1159,31 @@ void comp_main()
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_74 = cmd_alloc;
-                        CmdRef param_75 = cmd_ref;
-                        uint param_76 = cmd_limit;
-                        bool _2159 = alloc_cmd(param_74, param_75, param_76);
-                        cmd_alloc = param_74;
-                        cmd_ref = param_75;
-                        cmd_limit = param_76;
-                        if (!_2159)
+                        Alloc param_77 = cmd_alloc;
+                        CmdRef param_78 = cmd_ref;
+                        uint param_79 = cmd_limit;
+                        bool _2191 = alloc_cmd(param_77, param_78, param_79);
+                        cmd_alloc = param_77;
+                        cmd_ref = param_78;
+                        cmd_limit = param_79;
+                        if (!_2191)
                         {
                             break;
                         }
-                        Alloc param_77 = cmd_alloc;
-                        CmdRef param_78 = cmd_ref;
-                        Tile param_79 = tile_1;
-                        float param_80 = -1.0f;
-                        write_fill(param_77, param_78, param_79, param_80);
-                        cmd_ref = param_78;
-                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2182 = { blend_1 };
-                        Alloc param_81 = cmd_alloc;
-                        CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2182;
-                        Cmd_EndClip_write(param_81, param_82, param_83);
+                        Alloc param_80 = cmd_alloc;
+                        CmdRef param_81 = cmd_ref;
+                        Tile param_82 = tile_1;
+                        float param_83 = -1.0f;
+                        write_fill(param_80, param_81, param_82, param_83);
+                        cmd_ref = param_81;
+                        uint blend_1 = _1399.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2214 = { blend_1 };
+                        Alloc param_84 = cmd_alloc;
+                        CmdRef param_85 = cmd_ref;
+                        CmdEndClip param_86 = _2214;
+                        Cmd_EndClip_write(param_84, param_85, param_86);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1198,21 +1216,32 @@ void comp_main()
             break;
         }
     }
-    bool _2229 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2238;
-    if (_2229)
+    bool _2263 = (bin_tile_x + tile_x) < _1020.Load(8);
+    bool _2272;
+    if (_2263)
     {
-        _2238 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2272 = (bin_tile_y + tile_y) < _1020.Load(12);
     }
     else
     {
-        _2238 = _2229;
+        _2272 = _2263;
     }
-    if (_2238)
+    if (_2272)
     {
-        Alloc param_84 = cmd_alloc;
-        CmdRef param_85 = cmd_ref;
-        Cmd_End_write(param_84, param_85);
+        Alloc param_87 = cmd_alloc;
+        CmdRef param_88 = cmd_ref;
+        Cmd_End_write(param_87, param_88);
+        if (max_blend_depth > 4u)
+        {
+            uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u;
+            uint param_89 = scratch_size;
+            MallocResult _2293 = malloc(param_89);
+            MallocResult scratch = _2293;
+            Alloc param_90 = scratch_alloc;
+            uint param_91 = scratch_alloc.offset;
+            Alloc param_92 = scratch.alloc;
+            alloc_write(param_90, param_91, param_92);
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 854d243..d84add1 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -226,7 +226,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_260, constant uint& v_260BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -234,7 +234,7 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_260.memory[offset];
+    uint v = v_266.memory[offset];
     return v;
 }
 
@@ -253,30 +253,30 @@ BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
+    uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
+    uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_260, v_260BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_266, v_266BufferSize);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -289,24 +289,24 @@ void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a)
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_260, constant uint& v_260BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_260BufferSize - 8) / 4) * 4);
+    uint param_1 = uint(int((v_266BufferSize - 8) / 4) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
+    uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -314,26 +314,26 @@ Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory&
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_260, constant uint& v_260BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_266, constant uint& v_266BufferSize)
 {
-    uint _266 = atomic_fetch_add_explicit((device atomic_uint*)&v_260.mem_offset, size, memory_order_relaxed);
-    uint offset = _266;
+    uint _272 = atomic_fetch_add_explicit((device atomic_uint*)&v_266.mem_offset, size, memory_order_relaxed);
+    uint offset = _272;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_260BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_266BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _295 = atomic_fetch_max_explicit((device atomic_uint*)&v_260.mem_error, 1u, memory_order_relaxed);
+        uint _301 = atomic_fetch_max_explicit((device atomic_uint*)&v_266.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_260, constant uint& v_260BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -341,42 +341,42 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_260.memory[offset] = val;
+    v_266.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 11u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_260, constant uint& v_260BufferSize)
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_266, constant uint& v_266BufferSize)
 {
     if (cmd_ref.offset < cmd_limit)
     {
         return true;
     }
     uint param = 1024u;
-    MallocResult _913 = malloc(param, v_260, v_260BufferSize);
-    MallocResult new_cmd = _913;
+    MallocResult _928 = malloc(param, v_266, v_266BufferSize);
+    MallocResult new_cmd = _928;
     if (new_cmd.failed)
     {
         return false;
@@ -385,7 +385,7 @@ bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_260, v_260BufferSize);
+    Cmd_Jump_write(param_1, param_2, param_3, v_266, v_266BufferSize);
     cmd_alloc = new_cmd.alloc;
     cmd_ref = CmdRef{ cmd_alloc.offset };
     cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
@@ -393,70 +393,70 @@ bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd
 }
 
 static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_260, constant uint& v_260BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_266, constant uint& v_266BufferSize)
 {
     if (linewidth < 0.0)
     {
@@ -466,14 +466,14 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const
             Alloc param = alloc;
             CmdRef param_1 = cmd_ref;
             CmdFill param_2 = cmd_fill;
-            Cmd_Fill_write(param, param_1, param_2, v_260, v_260BufferSize);
+            Cmd_Fill_write(param, param_1, param_2, v_266, v_266BufferSize);
             cmd_ref.offset += 12u;
         }
         else
         {
             Alloc param_3 = alloc;
             CmdRef param_4 = cmd_ref;
-            Cmd_Solid_write(param_3, param_4, v_260, v_260BufferSize);
+            Cmd_Solid_write(param_3, param_4, v_266, v_266BufferSize);
             cmd_ref.offset += 4u;
         }
     }
@@ -483,201 +483,210 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const
         Alloc param_5 = alloc;
         CmdRef param_6 = cmd_ref;
         CmdStroke param_7 = cmd_stroke;
-        Cmd_Stroke_write(param_5, param_6, param_7, v_260, v_260BufferSize);
+        Cmd_Stroke_write(param_5, param_6, param_7, v_266, v_266BufferSize);
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_260, v_260BufferSize);
+    write_mem(param_6, param_7, param_8, v_266, v_266BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_260, v_260BufferSize);
+    write_mem(param_9, param_10, param_11, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.mat.x);
-    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.mat.y);
-    write_mem(param_6, param_7, param_8, v_260, v_260BufferSize);
+    write_mem(param_6, param_7, param_8, v_266, v_266BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.mat.z);
-    write_mem(param_9, param_10, param_11, v_260, v_260BufferSize);
+    write_mem(param_9, param_10, param_11, v_266, v_266BufferSize);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.mat.w);
-    write_mem(param_12, param_13, param_14, v_260, v_260BufferSize);
+    write_mem(param_12, param_13, param_14, v_266, v_266BufferSize);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = as_type<uint>(s.xlat.x);
-    write_mem(param_15, param_16, param_17, v_260, v_260BufferSize);
+    write_mem(param_15, param_16, param_17, v_266, v_266BufferSize);
     Alloc param_18 = a;
     uint param_19 = ix + 6u;
     uint param_20 = as_type<uint>(s.xlat.y);
-    write_mem(param_18, param_19, param_20, v_260, v_260BufferSize);
+    write_mem(param_18, param_19, param_20, v_266, v_266BufferSize);
     Alloc param_21 = a;
     uint param_22 = ix + 7u;
     uint param_23 = as_type<uint>(s.c1.x);
-    write_mem(param_21, param_22, param_23, v_260, v_260BufferSize);
+    write_mem(param_21, param_22, param_23, v_266, v_266BufferSize);
     Alloc param_24 = a;
     uint param_25 = ix + 8u;
     uint param_26 = as_type<uint>(s.c1.y);
-    write_mem(param_24, param_25, param_26, v_260, v_260BufferSize);
+    write_mem(param_24, param_25, param_26, v_266, v_266BufferSize);
     Alloc param_27 = a;
     uint param_28 = ix + 9u;
     uint param_29 = as_type<uint>(s.ra);
-    write_mem(param_27, param_28, param_29, v_260, v_260BufferSize);
+    write_mem(param_27, param_28, param_29, v_266, v_266BufferSize);
     Alloc param_30 = a;
     uint param_31 = ix + 10u;
     uint param_32 = as_type<uint>(s.roff);
-    write_mem(param_30, param_31, param_32, v_260, v_260BufferSize);
+    write_mem(param_30, param_31, param_32, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u };
     CmdRadGrad param_5 = s;
-    CmdRadGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdRadGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
     CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdImage_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.blend;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
     Alloc param_3 = a;
     CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
     CmdEndClip param_5 = s;
-    CmdEndClip_write(param_3, param_4, param_5, v_260, v_260BufferSize);
+    CmdEndClip_write(param_3, param_4, param_5, v_266, v_266BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+static inline __attribute__((always_inline))
+void alloc_write(thread const Alloc& a, thread const uint& offset, thread const Alloc& alloc, device Memory& v_266, constant uint& v_266BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = offset >> uint(2);
+    uint param_2 = alloc.offset;
+    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+}
+
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_266 [[buffer(0)]], const device ConfigBuf& _1020 [[buffer(1)]], const device SceneBuf& _1399 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -689,19 +698,19 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_260BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_1005.conf.width_in_tiles + 16u) - 1u) / 16u;
+    constant uint& v_266BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_1020.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1005.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1020.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.conf.width_in_tiles) + bin_tile_x) + tile_x;
     Alloc param;
-    param.offset = _1005.conf.ptcl_alloc.offset;
+    param.offset = _1020.conf.ptcl_alloc.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -713,18 +722,25 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
-    uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
-    uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
-    uint drawinfo_start = _1005.conf.drawinfo_alloc.offset >> uint(2);
-    bool mem_ok = v_260.mem_error == 0u;
-    Alloc param_3;
-    Alloc param_5;
-    uint _1304;
+    Alloc param_3 = cmd_alloc;
+    uint param_4 = 0u;
+    uint param_5 = 8u;
+    Alloc scratch_alloc = slice_mem(param_3, param_4, param_5);
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
+    uint drawmonoid_start = _1020.conf.drawmonoid_alloc.offset >> uint(2);
+    uint drawtag_start = _1020.conf.drawtag_offset >> uint(2);
+    uint drawdata_start = _1020.conf.drawdata_offset >> uint(2);
+    uint drawinfo_start = _1020.conf.drawinfo_alloc.offset >> uint(2);
+    bool mem_ok = v_266.mem_error == 0u;
+    Alloc param_6;
+    Alloc param_8;
+    uint _1331;
     uint element_ix;
-    Alloc param_14;
+    Alloc param_17;
     uint tile_count;
-    uint _1605;
+    uint _1632;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -734,36 +750,36 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1356;
+        bool _1383;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1181 = th_ix < 256u;
+                bool _1189;
+                if (_1181)
                 {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1189 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1162 = _1154;
+                    _1189 = _1181;
                 }
-                if (_1162)
+                if (_1189)
                 {
-                    uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_3.offset = _1005.conf.bin_alloc.offset;
-                    uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4, v_260, v_260BufferSize);
-                    param_5.offset = _1005.conf.bin_alloc.offset;
-                    uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6, v_260, v_260BufferSize);
-                    uint param_7 = offset;
-                    uint param_8 = count * 4u;
-                    bool param_9 = mem_ok;
-                    sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9);
+                    uint in_ix = (_1020.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_6.offset = _1020.conf.bin_alloc.offset;
+                    uint param_7 = in_ix;
+                    count = read_mem(param_6, param_7, v_266, v_266BufferSize);
+                    param_8.offset = _1020.conf.bin_alloc.offset;
+                    uint param_9 = in_ix + 1u;
+                    uint offset = read_mem(param_8, param_9, v_266, v_266BufferSize);
+                    uint param_10 = offset;
+                    uint param_11 = count * 4u;
+                    bool param_12 = mem_ok;
+                    sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12);
                 }
                 for (uint i_1 = 0u; i_1 < 8u; i_1++)
                 {
@@ -803,34 +819,34 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
                 if (part_ix > 0u)
                 {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1331 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1304 = part_start_ix;
+                    _1331 = part_start_ix;
                 }
-                ix -= _1304;
+                ix -= _1331;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
-                BinInstanceRef param_10 = inst_ref;
-                uint param_11 = ix;
-                Alloc param_12 = bin_alloc;
-                BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13, v_260, v_260BufferSize);
+                BinInstanceRef param_13 = inst_ref;
+                uint param_14 = ix;
+                Alloc param_15 = bin_alloc;
+                BinInstanceRef param_16 = BinInstance_index(param_13, param_14);
+                BinInstance inst = BinInstance_read(param_15, param_16, v_266, v_266BufferSize);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1373 = (wr_ix - rd_ix) < 256u;
+            if (_1373)
             {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1356 = _1346;
+                _1383 = _1373;
             }
-            if (_1356)
+            if (_1383)
             {
                 continue;
             }
@@ -843,7 +859,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1372.scene[drawtag_start + element_ix];
+            tag = _1399.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
@@ -855,10 +871,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = v_260.memory[drawmonoid_base];
-                param_14.offset = _1005.conf.tile_alloc.offset;
-                PathRef param_15 = PathRef{ _1005.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_14, param_15, v_260, v_260BufferSize);
+                uint path_ix = v_266.memory[drawmonoid_base];
+                param_17.offset = _1020.conf.tile_alloc.offset;
+                PathRef param_18 = PathRef{ _1020.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_17, param_18, v_266, v_266BufferSize);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -873,13 +889,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_16 = path.tiles.offset;
-                uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_18 = mem_ok;
-                Alloc path_alloc = new_alloc(param_16, param_17, param_18);
-                uint param_19 = th_ix;
-                Alloc param_20 = path_alloc;
-                write_tile_alloc(param_19, param_20);
+                uint param_19 = path.tiles.offset;
+                uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_21 = mem_ok;
+                Alloc path_alloc = new_alloc(param_19, param_20, param_21);
+                uint param_22 = th_ix;
+                Alloc param_23 = path_alloc;
+                write_tile_alloc(param_22, param_23);
                 break;
             }
             default:
@@ -913,54 +929,54 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1399.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1632 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1605 = 0u;
+                _1632 = 0u;
             }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1632;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
             if (mem_ok)
             {
-                uint param_21 = el_ix;
-                bool param_22 = mem_ok;
-                Alloc param_23 = read_tile_alloc(param_21, param_22, v_260, v_260BufferSize);
-                TileRef param_24 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_23, param_24, v_260, v_260BufferSize);
+                uint param_24 = el_ix;
+                bool param_25 = mem_ok;
+                Alloc param_26 = read_tile_alloc(param_24, param_25, v_266, v_266BufferSize);
+                TileRef param_27 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                Tile tile = Tile_read(param_26, param_27, v_266, v_266BufferSize);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
                 if (is_clip)
                 {
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
+                    uint scene_offset = v_266.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.scene[dd];
+                    uint blend = _1399.scene[dd];
                     is_blend = blend != 32771u;
                 }
-                bool _1693 = tile.tile.offset != 0u;
-                bool _1702;
-                if (!_1693)
+                bool _1720 = tile.tile.offset != 0u;
+                bool _1729;
+                if (!_1720)
                 {
-                    _1702 = (tile.backdrop == 0) == is_clip;
+                    _1729 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1702 = _1693;
+                    _1729 = _1720;
                 }
-                include_tile = _1702 || is_blend;
+                include_tile = _1729 || is_blend;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1724 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1751 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -984,176 +1000,178 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1399.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
-                uint param_25 = element_ref_ix;
-                bool param_26 = mem_ok;
-                Alloc param_27 = read_tile_alloc(param_25, param_26, v_260, v_260BufferSize);
-                TileRef param_28 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                Tile tile_1 = Tile_read(param_27, param_28, v_260, v_260BufferSize);
+                uint param_28 = element_ref_ix;
+                bool param_29 = mem_ok;
+                Alloc param_30 = read_tile_alloc(param_28, param_29, v_266, v_266BufferSize);
+                TileRef param_31 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Tile tile_1 = Tile_read(param_30, param_31, v_266, v_266BufferSize);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = v_260.memory[drawmonoid_base_2 + 2u];
-                uint info_offset = v_260.memory[drawmonoid_base_2 + 3u];
+                uint scene_offset_1 = v_266.memory[drawmonoid_base_2 + 2u];
+                uint info_offset = v_266.memory[drawmonoid_base_2 + 3u];
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = as_type<float>(v_260.memory[di]);
-                        Alloc param_29 = cmd_alloc;
-                        CmdRef param_30 = cmd_ref;
-                        uint param_31 = cmd_limit;
-                        bool _1849 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
-                        cmd_alloc = param_29;
-                        cmd_ref = param_30;
-                        cmd_limit = param_31;
-                        if (!_1849)
+                        linewidth = as_type<float>(v_266.memory[di]);
+                        Alloc param_32 = cmd_alloc;
+                        CmdRef param_33 = cmd_ref;
+                        uint param_34 = cmd_limit;
+                        bool _1876 = alloc_cmd(param_32, param_33, param_34, v_266, v_266BufferSize);
+                        cmd_alloc = param_32;
+                        cmd_ref = param_33;
+                        cmd_limit = param_34;
+                        if (!_1876)
                         {
                             break;
                         }
-                        Alloc param_32 = cmd_alloc;
-                        CmdRef param_33 = cmd_ref;
-                        Tile param_34 = tile_1;
-                        float param_35 = linewidth;
-                        write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
-                        cmd_ref = param_33;
-                        uint rgba = _1372.scene[dd_1];
-                        Alloc param_36 = cmd_alloc;
-                        CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = CmdColor{ rgba };
-                        Cmd_Color_write(param_36, param_37, param_38, v_260, v_260BufferSize);
+                        Alloc param_35 = cmd_alloc;
+                        CmdRef param_36 = cmd_ref;
+                        Tile param_37 = tile_1;
+                        float param_38 = linewidth;
+                        write_fill(param_35, param_36, param_37, param_38, v_266, v_266BufferSize);
+                        cmd_ref = param_36;
+                        uint rgba = _1399.scene[dd_1];
+                        Alloc param_39 = cmd_alloc;
+                        CmdRef param_40 = cmd_ref;
+                        CmdColor param_41 = CmdColor{ rgba };
+                        Cmd_Color_write(param_39, param_40, param_41, v_266, v_266BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
                     case 276u:
                     {
-                        Alloc param_39 = cmd_alloc;
-                        CmdRef param_40 = cmd_ref;
-                        uint param_41 = cmd_limit;
-                        bool _1890 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
-                        cmd_alloc = param_39;
-                        cmd_ref = param_40;
-                        cmd_limit = param_41;
-                        if (!_1890)
+                        Alloc param_42 = cmd_alloc;
+                        CmdRef param_43 = cmd_ref;
+                        uint param_44 = cmd_limit;
+                        bool _1917 = alloc_cmd(param_42, param_43, param_44, v_266, v_266BufferSize);
+                        cmd_alloc = param_42;
+                        cmd_ref = param_43;
+                        cmd_limit = param_44;
+                        if (!_1917)
                         {
                             break;
                         }
-                        linewidth = as_type<float>(v_260.memory[di]);
-                        Alloc param_42 = cmd_alloc;
-                        CmdRef param_43 = cmd_ref;
-                        Tile param_44 = tile_1;
-                        float param_45 = linewidth;
-                        write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
-                        cmd_ref = param_43;
-                        cmd_lin.index = _1372.scene[dd_1];
-                        cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
-                        cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
-                        cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
-                        Alloc param_46 = cmd_alloc;
-                        CmdRef param_47 = cmd_ref;
-                        CmdLinGrad param_48 = cmd_lin;
-                        Cmd_LinGrad_write(param_46, param_47, param_48, v_260, v_260BufferSize);
+                        linewidth = as_type<float>(v_266.memory[di]);
+                        Alloc param_45 = cmd_alloc;
+                        CmdRef param_46 = cmd_ref;
+                        Tile param_47 = tile_1;
+                        float param_48 = linewidth;
+                        write_fill(param_45, param_46, param_47, param_48, v_266, v_266BufferSize);
+                        cmd_ref = param_46;
+                        cmd_lin.index = _1399.scene[dd_1];
+                        cmd_lin.line_x = as_type<float>(v_266.memory[di + 1u]);
+                        cmd_lin.line_y = as_type<float>(v_266.memory[di + 2u]);
+                        cmd_lin.line_c = as_type<float>(v_266.memory[di + 3u]);
+                        Alloc param_49 = cmd_alloc;
+                        CmdRef param_50 = cmd_ref;
+                        CmdLinGrad param_51 = cmd_lin;
+                        Cmd_LinGrad_write(param_49, param_50, param_51, v_266, v_266BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
                     case 732u:
                     {
-                        Alloc param_49 = cmd_alloc;
-                        CmdRef param_50 = cmd_ref;
-                        uint param_51 = cmd_limit;
-                        bool _1954 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
-                        cmd_alloc = param_49;
-                        cmd_ref = param_50;
-                        cmd_limit = param_51;
-                        if (!_1954)
+                        Alloc param_52 = cmd_alloc;
+                        CmdRef param_53 = cmd_ref;
+                        uint param_54 = cmd_limit;
+                        bool _1981 = alloc_cmd(param_52, param_53, param_54, v_266, v_266BufferSize);
+                        cmd_alloc = param_52;
+                        cmd_ref = param_53;
+                        cmd_limit = param_54;
+                        if (!_1981)
                         {
                             break;
                         }
-                        linewidth = as_type<float>(v_260.memory[di]);
-                        Alloc param_52 = cmd_alloc;
-                        CmdRef param_53 = cmd_ref;
-                        Tile param_54 = tile_1;
-                        float param_55 = linewidth;
-                        write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
-                        cmd_ref = param_53;
-                        cmd_rad.index = _1372.scene[dd_1];
-                        cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
-                        cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
-                        cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
-                        cmd_rad.ra = as_type<float>(v_260.memory[di + 9u]);
-                        cmd_rad.roff = as_type<float>(v_260.memory[di + 10u]);
-                        Alloc param_56 = cmd_alloc;
-                        CmdRef param_57 = cmd_ref;
-                        CmdRadGrad param_58 = cmd_rad;
-                        Cmd_RadGrad_write(param_56, param_57, param_58, v_260, v_260BufferSize);
+                        linewidth = as_type<float>(v_266.memory[di]);
+                        Alloc param_55 = cmd_alloc;
+                        CmdRef param_56 = cmd_ref;
+                        Tile param_57 = tile_1;
+                        float param_58 = linewidth;
+                        write_fill(param_55, param_56, param_57, param_58, v_266, v_266BufferSize);
+                        cmd_ref = param_56;
+                        cmd_rad.index = _1399.scene[dd_1];
+                        cmd_rad.mat = as_type<float4>(uint4(v_266.memory[di + 1u], v_266.memory[di + 2u], v_266.memory[di + 3u], v_266.memory[di + 4u]));
+                        cmd_rad.xlat = as_type<float2>(uint2(v_266.memory[di + 5u], v_266.memory[di + 6u]));
+                        cmd_rad.c1 = as_type<float2>(uint2(v_266.memory[di + 7u], v_266.memory[di + 8u]));
+                        cmd_rad.ra = as_type<float>(v_266.memory[di + 9u]);
+                        cmd_rad.roff = as_type<float>(v_266.memory[di + 10u]);
+                        Alloc param_59 = cmd_alloc;
+                        CmdRef param_60 = cmd_ref;
+                        CmdRadGrad param_61 = cmd_rad;
+                        Cmd_RadGrad_write(param_59, param_60, param_61, v_266, v_266BufferSize);
                         cmd_ref.offset += 48u;
                         break;
                     }
                     case 72u:
                     {
-                        linewidth = as_type<float>(v_260.memory[di]);
-                        Alloc param_59 = cmd_alloc;
-                        CmdRef param_60 = cmd_ref;
-                        uint param_61 = cmd_limit;
-                        bool _2060 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
-                        cmd_alloc = param_59;
-                        cmd_ref = param_60;
-                        cmd_limit = param_61;
-                        if (!_2060)
+                        linewidth = as_type<float>(v_266.memory[di]);
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        uint param_64 = cmd_limit;
+                        bool _2087 = alloc_cmd(param_62, param_63, param_64, v_266, v_266BufferSize);
+                        cmd_alloc = param_62;
+                        cmd_ref = param_63;
+                        cmd_limit = param_64;
+                        if (!_2087)
                         {
                             break;
                         }
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        Tile param_64 = tile_1;
-                        float param_65 = linewidth;
-                        write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
-                        cmd_ref = param_63;
-                        uint index = _1372.scene[dd_1];
-                        uint raw1 = _1372.scene[dd_1 + 1u];
+                        Alloc param_65 = cmd_alloc;
+                        CmdRef param_66 = cmd_ref;
+                        Tile param_67 = tile_1;
+                        float param_68 = linewidth;
+                        write_fill(param_65, param_66, param_67, param_68, v_266, v_266BufferSize);
+                        cmd_ref = param_66;
+                        uint index = _1399.scene[dd_1];
+                        uint raw1 = _1399.scene[dd_1 + 1u];
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        Alloc param_66 = cmd_alloc;
-                        CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = CmdImage{ index, offset_1 };
-                        Cmd_Image_write(param_66, param_67, param_68, v_260, v_260BufferSize);
+                        Alloc param_69 = cmd_alloc;
+                        CmdRef param_70 = cmd_ref;
+                        CmdImage param_71 = CmdImage{ index, offset_1 };
+                        Cmd_Image_write(param_69, param_70, param_71, v_266, v_266BufferSize);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2113 = tile_1.tile.offset == 0u;
-                        bool _2119;
-                        if (_2113)
+                        bool _2140 = tile_1.tile.offset == 0u;
+                        bool _2146;
+                        if (_2140)
                         {
-                            _2119 = tile_1.backdrop == 0;
+                            _2146 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2119 = _2113;
+                            _2146 = _2140;
                         }
-                        if (_2119)
+                        if (_2146)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_69 = cmd_alloc;
-                            CmdRef param_70 = cmd_ref;
-                            uint param_71 = cmd_limit;
-                            bool _2131 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
-                            cmd_alloc = param_69;
-                            cmd_ref = param_70;
-                            cmd_limit = param_71;
-                            if (!_2131)
+                            Alloc param_72 = cmd_alloc;
+                            CmdRef param_73 = cmd_ref;
+                            uint param_74 = cmd_limit;
+                            bool _2158 = alloc_cmd(param_72, param_73, param_74, v_266, v_266BufferSize);
+                            cmd_alloc = param_72;
+                            cmd_ref = param_73;
+                            cmd_limit = param_74;
+                            if (!_2158)
                             {
                                 break;
                             }
-                            Alloc param_72 = cmd_alloc;
-                            CmdRef param_73 = cmd_ref;
-                            Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
+                            Alloc param_75 = cmd_alloc;
+                            CmdRef param_76 = cmd_ref;
+                            Cmd_BeginClip_write(param_75, param_76, v_266, v_266BufferSize);
                             cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                         }
                         clip_depth++;
                         break;
@@ -1161,29 +1179,30 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_74 = cmd_alloc;
-                        CmdRef param_75 = cmd_ref;
-                        uint param_76 = cmd_limit;
-                        bool _2159 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
-                        cmd_alloc = param_74;
-                        cmd_ref = param_75;
-                        cmd_limit = param_76;
-                        if (!_2159)
+                        Alloc param_77 = cmd_alloc;
+                        CmdRef param_78 = cmd_ref;
+                        uint param_79 = cmd_limit;
+                        bool _2191 = alloc_cmd(param_77, param_78, param_79, v_266, v_266BufferSize);
+                        cmd_alloc = param_77;
+                        cmd_ref = param_78;
+                        cmd_limit = param_79;
+                        if (!_2191)
                         {
                             break;
                         }
-                        Alloc param_77 = cmd_alloc;
-                        CmdRef param_78 = cmd_ref;
-                        Tile param_79 = tile_1;
-                        float param_80 = -1.0;
-                        write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
-                        cmd_ref = param_78;
-                        uint blend_1 = _1372.scene[dd_1];
-                        Alloc param_81 = cmd_alloc;
-                        CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = CmdEndClip{ blend_1 };
-                        Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
+                        Alloc param_80 = cmd_alloc;
+                        CmdRef param_81 = cmd_ref;
+                        Tile param_82 = tile_1;
+                        float param_83 = -1.0;
+                        write_fill(param_80, param_81, param_82, param_83, v_266, v_266BufferSize);
+                        cmd_ref = param_81;
+                        uint blend_1 = _1399.scene[dd_1];
+                        Alloc param_84 = cmd_alloc;
+                        CmdRef param_85 = cmd_ref;
+                        CmdEndClip param_86 = CmdEndClip{ blend_1 };
+                        Cmd_EndClip_write(param_84, param_85, param_86, v_266, v_266BufferSize);
                         cmd_ref.offset += 8u;
+                        render_blend_depth--;
                         break;
                     }
                 }
@@ -1216,21 +1235,32 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             break;
         }
     }
-    bool _2229 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2238;
-    if (_2229)
+    bool _2263 = (bin_tile_x + tile_x) < _1020.conf.width_in_tiles;
+    bool _2272;
+    if (_2263)
     {
-        _2238 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2272 = (bin_tile_y + tile_y) < _1020.conf.height_in_tiles;
     }
     else
     {
-        _2238 = _2229;
+        _2272 = _2263;
     }
-    if (_2238)
+    if (_2272)
     {
-        Alloc param_84 = cmd_alloc;
-        CmdRef param_85 = cmd_ref;
-        Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
+        Alloc param_87 = cmd_alloc;
+        CmdRef param_88 = cmd_ref;
+        Cmd_End_write(param_87, param_88, v_266, v_266BufferSize);
+        if (max_blend_depth > 4u)
+        {
+            uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u;
+            uint param_89 = scratch_size;
+            MallocResult _2293 = malloc(param_89, v_266, v_266BufferSize);
+            MallocResult scratch = _2293;
+            Alloc param_90 = scratch_alloc;
+            uint param_91 = scratch_alloc.offset;
+            Alloc param_92 = scratch.alloc;
+            alloc_write(param_90, param_91, param_92, v_266, v_266BufferSize);
+        }
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index 56a87e5..fe5eeee 100644
Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index 7399fe4..c48d59d 100644
Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 4839db2..0a6c022 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -999,6 +999,8 @@ void comp_main()
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef _1705 = { cmd_alloc.offset };
     CmdRef cmd_ref = _1705;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1011,7 +1013,9 @@ void comp_main()
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1032,8 +1036,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1800 = { stroke.tile_ref };
-                tile_seg_ref = _1800;
+                TileSegRef _1810 = { stroke.tile_ref };
+                tile_seg_ref = _1810;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1069,8 +1073,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1920 = { fill.tile_ref };
-                tile_seg_ref = _1920;
+                TileSegRef _1930 = { fill.tile_ref };
+                tile_seg_ref = _1930;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1159,10 +1163,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1185,10 +1189,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1202,9 +1206,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2407[8];
-                fillImage(_2407, param_36, param_37);
-                float4 img[8] = _2407;
+                float4 _2417[8];
+                fillImage(_2417, param_36, param_37);
+                float4 img[8] = _2417;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1215,13 +1219,26 @@ void comp_main()
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2522);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1229,32 +1246,41 @@ void comp_main()
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth >= 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2548;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2621;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1262,9 +1288,9 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w);
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index 4caeaf0..4d7068d 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -1056,6 +1056,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1068,7 +1070,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1214,10 +1218,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1240,10 +1244,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1269,13 +1273,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2522;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1283,31 +1300,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth >= 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1315,9 +1341,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index f0e2963..c388941 100644
Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 7b7c19f..7390167 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index 5d9b88d..ffada37 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -999,6 +999,8 @@ void comp_main()
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef _1705 = { cmd_alloc.offset };
     CmdRef cmd_ref = _1705;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -1011,7 +1013,9 @@ void comp_main()
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1032,8 +1036,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1800 = { stroke.tile_ref };
-                tile_seg_ref = _1800;
+                TileSegRef _1810 = { stroke.tile_ref };
+                tile_seg_ref = _1810;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -1069,8 +1073,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1920 = { fill.tile_ref };
-                tile_seg_ref = _1920;
+                TileSegRef _1930 = { fill.tile_ref };
+                tile_seg_ref = _1930;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1159,10 +1163,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1185,10 +1189,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1202,9 +1206,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2407[8];
-                fillImage(_2407, param_36, param_37);
-                float4 img[8] = _2407;
+                float4 _2417[8];
+                fillImage(_2417, param_36, param_37);
+                float4 img[8] = _2417;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1215,13 +1219,26 @@ void comp_main()
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2522);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1229,32 +1246,41 @@ void comp_main()
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth >= 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2548;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2621;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1262,8 +1288,8 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x;
+        uint param_48 = i_1;
+        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 8c608c3..a190ab6 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -1056,6 +1056,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     spvUnsafeArray<float4, 8> rgba;
@@ -1068,7 +1070,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
     while (mem_ok)
     {
         Alloc param_3 = cmd_alloc;
@@ -1214,10 +1218,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2254 = fromsRGB(param_29);
-                    fg_rgba.x = _2254.x;
-                    fg_rgba.y = _2254.y;
-                    fg_rgba.z = _2254.z;
+                    float3 _2264 = fromsRGB(param_29);
+                    fg_rgba.x = _2264.x;
+                    fg_rgba.y = _2264.y;
+                    fg_rgba.z = _2264.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1240,10 +1244,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2364 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2364.x;
-                    fg_rgba_1.y = _2364.y;
-                    fg_rgba_1.z = _2364.z;
+                    float3 _2374 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2374.x;
+                    fg_rgba_1.y = _2374.y;
+                    fg_rgba_1.z = _2374.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1269,13 +1273,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 9u:
             {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                 {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2470 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2470;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2479 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2479;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2522 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2522;
+                        rgba[k_13] = float4(0.0);
+                    }
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
@@ -1283,31 +1300,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
             }
             case 10u:
             {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                 clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth >= 4u)
                 {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
             case 11u:
             {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1315,8 +1341,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_47 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 6ff1791..17c7531 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
index b6c9398..9fd593c 100644
Binary files a/piet-gpu/shader/gen/path_coarse.dxil and b/piet-gpu/shader/gen/path_coarse.dxil differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 7ce4684..6130712 100644
Binary files a/piet-gpu/shader/gen/pathseg.dxil and b/piet-gpu/shader/gen/pathseg.dxil differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index ff544b8..4c2bd23 100644
Binary files a/piet-gpu/shader/gen/pathtag_reduce.dxil and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil
index 48584bd..77f12e6 100644
Binary files a/piet-gpu/shader/gen/pathtag_root.dxil and b/piet-gpu/shader/gen/pathtag_root.dxil differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index 0c1e376..f9f31e6 100644
Binary files a/piet-gpu/shader/gen/transform_leaf.dxil and b/piet-gpu/shader/gen/transform_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index fc3a311..978dd98 100644
Binary files a/piet-gpu/shader/gen/transform_reduce.dxil and b/piet-gpu/shader/gen/transform_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil
index a33ff7f..5b4f059 100644
Binary files a/piet-gpu/shader/gen/transform_root.dxil and b/piet-gpu/shader/gen/transform_root.dxil differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 99fd22e..86751d4 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -100,11 +100,14 @@ void main() {
     Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
 
+    uint blend_offset = memory[cmd_ref.offset >> 2];
+    cmd_ref.offset += 4;
+
     uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
                           gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
     mediump vec4 rgba[CHUNK];
-    uint blend_stack[MAX_BLEND_STACK][CHUNK];
+    uint blend_stack[BLEND_STACK_SPLIT][CHUNK];
     for (uint i = 0; i < CHUNK; i++) {
         rgba[i] = vec4(0.0);
     }
@@ -236,24 +239,38 @@ void main() {
             cmd_ref.offset += 4 + CmdImage_size;
             break;
         case Cmd_BeginClip:
-            for (uint k = 0; k < CHUNK; k++) {
-                // We reject any inputs that might overflow in render_ctx.rs.
-                // The following is a sanity check so we don't corrupt memory should there be malformed inputs.
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                blend_stack[d][k] = packsRGB(vec4(rgba[k]));
-                rgba[k] = vec4(0.0);
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
+            } else {
+                uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+                for (uint k = 0; k < CHUNK; k++) {
+                    memory[base_ix + k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
             }
             clip_depth++;
             cmd_ref.offset += 4;
             break;
         case Cmd_EndClip:
             CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
-            uint blend_mode = uint(end_clip.blend >> 8);
-            uint comp_mode = uint(end_clip.blend & 0xFF);
             clip_depth--;
+            uint base_ix;
+            if (clip_depth >= BLEND_STACK_SPLIT) {
+                base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+            }
             for (uint k = 0; k < CHUNK; k++) {
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
+                uint bg_rgba;
+                if (clip_depth < BLEND_STACK_SPLIT) {
+                    bg_rgba = blend_stack[clip_depth][k];
+                } else {
+                    bg_rgba = memory[base_ix + k];
+                }
+                mediump vec4 bg = unpacksRGB(bg_rgba);
                 mediump vec4 fg = rgba[k] * area[k];
                 rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
             }
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index ec17188..21206e5 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -27,6 +27,10 @@
 
 #define GRADIENT_WIDTH 512
 
+// We allocate this many blend stack entries in registers, and spill
+// to memory for the overflow.
+#define BLEND_STACK_SPLIT 4
+
 #ifdef ERR_MALLOC_FAILED
 struct Config {
     uint n_elements; // paths
@@ -91,7 +95,7 @@ struct Config {
 #define MODE_STROKE 1
 
 // Size of kernel4 clip state, in words.
-#define CLIP_STATE_SIZE 2
+#define CLIP_STATE_SIZE 1
 
 // fill_mode_from_flags extracts the fill mode from tag flags.
 uint fill_mode_from_flags(uint flags) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index c75f41f..1ebb5cf 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -34,8 +34,6 @@ const TILE_H: usize = 16;
 
 const PTCL_INITIAL_ALLOC: usize = 1024;
 
-const MAX_BLEND_STACK: usize = 128;
-
 #[allow(unused)]
 fn dump_scene(buf: &[u8]) {
     for i in 0..(buf.len() / 4) {
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 14f2561..5d4ffd3 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -5,7 +5,6 @@ use std::borrow::Cow;
 
 use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
-use crate::MAX_BLEND_STACK;
 use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
 use piet::{
     Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@@ -233,9 +232,6 @@ impl RenderContext for PietGpuRenderContext {
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(None);
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: None });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;
@@ -337,9 +333,6 @@ impl PietGpuRenderContext {
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
         self.new_encoder.begin_clip(Some(blend));
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
         self.clip_stack.push(ClipElement { blend: Some(blend) });
         if let Some(tos) = self.state_stack.last_mut() {
             tos.n_clip += 1;