First cut at split blend stack

Split the blend stack into register and memory segments. Do blending in registers up to that size, then spill to memory if needed. This version may regress performance on Pixel 4, as it uses common memory for the blend stack, rather than keeping that memory read-only in fine rasterization, and using a separate buffer for blend stack. This needs investigation. It's possible we'll want to have single common memory as a config option, as it pools allocations and decreases the probability of failure. Also a flaw in this version: there is no checking of memory overflow. For understanding code history: this commit largely reverts #77, but there were some intervening changes to blending, and this commit also implements the split so some of the stack is in registers. Closes #156
2025-01-10 04:31:30 +11:00 · 2022-05-16 11:01:03 -07:00 · 2022-05-16 11:01:03 -07:00 · e73049fe98
parent 18563101b2
commit e73049fe98
17 changed files with 504 additions and 383 deletions
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@ -151,6 +151,11 @@ void main() {
    uint part_start_ix = 0;
    uint ready_ix = 0;

+    cmd_ref.offset += 4;
+    // Accounting for allocation of blend memory
+    uint render_blend_depth = 0;
+    uint max_blend_depth = 0;
+
    uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
    uint drawtag_start = conf.drawtag_offset >> 2;
    uint drawdata_start = conf.drawdata_offset >> 2;
@ -414,6 +419,8 @@ void main() {
                        }
                        Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
+                        render_blend_depth++;
+                        max_blend_depth = max(max_blend_depth, render_blend_depth);
                    }
                    clip_depth++;
                    break;
@ -426,6 +433,7 @@ void main() {
                    uint blend = scene[dd];
                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
                    cmd_ref.offset += 4 + CmdEndClip_size;
+                    render_blend_depth--;
                    break;
                }
            } else {
@ -451,5 +459,8 @@ void main() {
    }
    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
        Cmd_End_write(cmd_alloc, cmd_ref);
+        if (max_blend_depth > BLEND_STACK_SPLIT) {
+            // TODO: allocate blend memory and write result
+        }
    }
 }
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@ -177,7 +177,7 @@ static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

 RWByteAddressBuffer _260 : register(u0, space0);
 ByteAddressBuffer _1005 : register(t1, space0);
-ByteAddressBuffer _1372 : register(t2, space0);
+ByteAddressBuffer _1378 : register(t2, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -681,6 +681,9 @@ void comp_main()
    uint wr_ix = 0u;
    uint part_start_ix = 0u;
    uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
    uint drawmonoid_start = _1005.Load(44) >> uint(2);
    uint drawtag_start = _1005.Load(100) >> uint(2);
    uint drawdata_start = _1005.Load(104) >> uint(2);
@ -688,11 +691,11 @@ void comp_main()
    bool mem_ok = _260.Load(4) == 0u;
    Alloc param_3;
    Alloc param_5;
-    uint _1304;
+    uint _1310;
    uint element_ix;
    Alloc param_14;
    uint tile_count;
-    uint _1605;
+    uint _1611;
    float linewidth;
    CmdLinGrad cmd_lin;
    CmdRadGrad cmd_rad;
@ -702,34 +705,34 @@ void comp_main()
        {
            sh_bitmaps[i][th_ix] = 0u;
        }
-        bool _1356;
+        bool _1362;
        for (;;)
        {
            if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
            {
                part_start_ix = ready_ix;
                uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                }
                else
                {
-                    _1162 = _1154;
+                    _1168 = _1160;
                }
-                if (_1162)
+                if (_1168)
                {
                    uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1179;
-                    _1179.offset = _1005.Load(20);
-                    param_3.offset = _1179.offset;
+                    Alloc _1185;
+                    _1185.offset = _1005.Load(20);
+                    param_3.offset = _1185.offset;
                    uint param_4 = in_ix;
                    count = read_mem(param_3, param_4);
-                    Alloc _1190;
-                    _1190.offset = _1005.Load(20);
-                    param_5.offset = _1190.offset;
+                    Alloc _1196;
+                    _1196.offset = _1005.Load(20);
+                    param_5.offset = _1196.offset;
                    uint param_6 = in_ix + 1u;
                    uint offset = read_mem(param_5, param_6);
                    uint param_7 = offset;
@ -775,16 +778,16 @@ void comp_main()
                }
                if (part_ix > 0u)
                {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                }
                else
                {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                }
-                ix -= _1304;
+                ix -= _1310;
                Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1323 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1323;
+                BinInstanceRef _1329 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1329;
                BinInstanceRef param_10 = inst_ref;
                uint param_11 = ix;
                Alloc param_12 = bin_alloc;
@ -794,16 +797,16 @@ void comp_main()
            }
            GroupMemoryBarrierWithGroupSync();
            wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
            {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
            }
            else
            {
-                _1356 = _1346;
+                _1362 = _1352;
            }
-            if (_1356)
+            if (_1362)
            {
                continue;
            }
@ -816,7 +819,7 @@ void comp_main()
        if ((th_ix + rd_ix) < wr_ix)
        {
            element_ix = sh_elements[th_ix];
-            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1378.Load((drawtag_start + element_ix) * 4 + 0);
        }
        switch (tag)
        {
@ -829,11 +832,11 @@ void comp_main()
            {
                uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
                uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
-                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
-                Alloc _1400;
-                _1400.offset = _1005.Load(16);
-                param_14.offset = _1400.offset;
-                PathRef param_15 = _1397;
+                PathRef _1403 = { _1005.Load(16) + (path_ix * 12u) };
+                Alloc _1406;
+                _1406.offset = _1005.Load(16);
+                param_14.offset = _1406.offset;
+                PathRef param_15 = _1403;
                Path path = Path_read(param_14, param_15);
                uint stride = path.bbox.z - path.bbox.x;
                sh_tile_stride[th_ix] = stride;
@ -889,16 +892,16 @@ void comp_main()
                }
            }
            uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1378.Load((drawtag_start + element_ix_1) * 4 + 0);
            if (el_ix > 0u)
            {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
            }
            else
            {
-                _1605 = 0u;
+                _1611 = 0u;
            }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + (seq_ix % width);
            uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@ -907,9 +910,9 @@ void comp_main()
            {
                uint param_21 = el_ix;
                bool param_22 = mem_ok;
-                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1663 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1657;
+                TileRef param_24 = _1663;
                Tile tile = Tile_read(param_23, param_24);
                bool is_clip = (tag_1 & 1u) != 0u;
                bool is_blend = false;
@ -918,27 +921,27 @@ void comp_main()
                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                    uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                    uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.Load(dd * 4 + 0);
+                    uint blend = _1378.Load(dd * 4 + 0);
                    is_blend = blend != 3u;
                }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1698 = tile.tile.offset != 0u;
+                bool _1707;
+                if (!_1698)
                {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1707 = (tile.backdrop == 0) == is_clip;
                }
                else
                {
-                    _1701 = _1692;
+                    _1707 = _1698;
                }
-                include_tile = _1701 || is_blend;
+                include_tile = _1707 || is_blend;
            }
            if (include_tile)
            {
                uint el_slice = el_ix / 32u;
                uint el_mask = 1u << (el_ix & 31u);
-                uint _1723;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
+                uint _1729;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1729);
            }
        }
        GroupMemoryBarrierWithGroupSync();
@ -962,14 +965,14 @@ void comp_main()
            uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
            uint element_ix_2 = sh_elements[element_ref_ix];
            bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1378.Load((drawtag_start + element_ix_2) * 4 + 0);
            if (clip_zero_depth == 0u)
            {
                uint param_25 = element_ref_ix;
                bool param_26 = mem_ok;
-                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1806 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1800;
+                TileRef param_28 = _1806;
                Tile tile_1 = Tile_read(param_27, param_28);
                uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@ -984,11 +987,11 @@ void comp_main()
                        Alloc param_29 = cmd_alloc;
                        CmdRef param_30 = cmd_ref;
                        uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1854 = alloc_cmd(param_29, param_30, param_31);
                        cmd_alloc = param_29;
                        cmd_ref = param_30;
                        cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1854)
                        {
                            break;
                        }
@ -998,11 +1001,11 @@ void comp_main()
                        float param_35 = linewidth;
                        write_fill(param_32, param_33, param_34, param_35);
                        cmd_ref = param_33;
-                        uint rgba = _1372.Load(dd_1 * 4 + 0);
-                        CmdColor _1871 = { rgba };
+                        uint rgba = _1378.Load(dd_1 * 4 + 0);
+                        CmdColor _1877 = { rgba };
                        Alloc param_36 = cmd_alloc;
                        CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1871;
+                        CmdColor param_38 = _1877;
                        Cmd_Color_write(param_36, param_37, param_38);
                        cmd_ref.offset += 8u;
                        break;
@ -1012,11 +1015,11 @@ void comp_main()
                        Alloc param_39 = cmd_alloc;
                        CmdRef param_40 = cmd_ref;
                        uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1895 = alloc_cmd(param_39, param_40, param_41);
                        cmd_alloc = param_39;
                        cmd_ref = param_40;
                        cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1895)
                        {
                            break;
                        }
@ -1027,7 +1030,7 @@ void comp_main()
                        float param_45 = linewidth;
                        write_fill(param_42, param_43, param_44, param_45);
                        cmd_ref = param_43;
-                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_lin.index = _1378.Load(dd_1 * 4 + 0);
                        cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
                        cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
                        cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
@ -1043,11 +1046,11 @@ void comp_main()
                        Alloc param_49 = cmd_alloc;
                        CmdRef param_50 = cmd_ref;
                        uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1959 = alloc_cmd(param_49, param_50, param_51);
                        cmd_alloc = param_49;
                        cmd_ref = param_50;
                        cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1959)
                        {
                            break;
                        }
@ -1058,7 +1061,7 @@ void comp_main()
                        float param_55 = linewidth;
                        write_fill(param_52, param_53, param_54, param_55);
                        cmd_ref = param_53;
-                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_rad.index = _1378.Load(dd_1 * 4 + 0);
                        cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
                        cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
                        cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
@ -1077,11 +1080,11 @@ void comp_main()
                        Alloc param_59 = cmd_alloc;
                        CmdRef param_60 = cmd_ref;
                        uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
+                        bool _2065 = alloc_cmd(param_59, param_60, param_61);
                        cmd_alloc = param_59;
                        cmd_ref = param_60;
                        cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2065)
                        {
                            break;
                        }
@ -1091,30 +1094,30 @@ void comp_main()
                        float param_65 = linewidth;
                        write_fill(param_62, param_63, param_64, param_65);
                        cmd_ref = param_63;
-                        uint index = _1372.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        uint index = _1378.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1378.Load((dd_1 + 1u) * 4 + 0);
                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2098 = { index, offset_1 };
+                        CmdImage _2104 = { index, offset_1 };
                        Alloc param_66 = cmd_alloc;
                        CmdRef param_67 = cmd_ref;
-                        CmdImage param_68 = _2098;
+                        CmdImage param_68 = _2104;
                        Cmd_Image_write(param_66, param_67, param_68);
                        cmd_ref.offset += 12u;
                        break;
                    }
                    case 5u:
                    {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2118 = tile_1.tile.offset == 0u;
+                        bool _2124;
+                        if (_2118)
                        {
-                            _2118 = tile_1.backdrop == 0;
+                            _2124 = tile_1.backdrop == 0;
                        }
                        else
                        {
-                            _2118 = _2112;
+                            _2124 = _2118;
                        }
-                        if (_2118)
+                        if (_2124)
                        {
                            clip_zero_depth = clip_depth + 1u;
                        }
@ -1123,11 +1126,11 @@ void comp_main()
                            Alloc param_69 = cmd_alloc;
                            CmdRef param_70 = cmd_ref;
                            uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
+                            bool _2136 = alloc_cmd(param_69, param_70, param_71);
                            cmd_alloc = param_69;
                            cmd_ref = param_70;
                            cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2136)
                            {
                                break;
                            }
@ -1135,6 +1138,8 @@ void comp_main()
                            CmdRef param_73 = cmd_ref;
                            Cmd_BeginClip_write(param_72, param_73);
                            cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                        }
                        clip_depth++;
                        break;
@ -1145,11 +1150,11 @@ void comp_main()
                        Alloc param_74 = cmd_alloc;
                        CmdRef param_75 = cmd_ref;
                        uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
+                        bool _2169 = alloc_cmd(param_74, param_75, param_76);
                        cmd_alloc = param_74;
                        cmd_ref = param_75;
                        cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2169)
                        {
                            break;
                        }
@ -1159,13 +1164,14 @@ void comp_main()
                        float param_80 = -1.0f;
                        write_fill(param_77, param_78, param_79, param_80);
                        cmd_ref = param_78;
-                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2181 = { blend_1 };
+                        uint blend_1 = _1378.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2192 = { blend_1 };
                        Alloc param_81 = cmd_alloc;
                        CmdRef param_82 = cmd_ref;
-                        CmdEndClip param_83 = _2181;
+                        CmdEndClip param_83 = _2192;
                        Cmd_EndClip_write(param_81, param_82, param_83);
                        cmd_ref.offset += 8u;
+                        render_blend_depth--;
                        break;
                    }
                }
@ -1198,21 +1204,24 @@ void comp_main()
            break;
        }
    }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
-    bool _2237;
-    if (_2228)
+    bool _2241 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2250;
+    if (_2241)
    {
-        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
+        _2250 = (bin_tile_y + tile_y) < _1005.Load(12);
    }
    else
    {
-        _2237 = _2228;
+        _2250 = _2241;
    }
-    if (_2237)
+    if (_2250)
    {
        Alloc param_84 = cmd_alloc;
        CmdRef param_85 = cmd_ref;
        Cmd_End_write(param_84, param_85);
+        if (max_blend_depth > 4u)
+        {
+        }
    }
 }

--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@ -677,7 +677,7 @@ void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memor
    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }

-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1378 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
    threadgroup uint sh_bitmaps[8][256];
    threadgroup Alloc sh_part_elements[256];
@ -713,6 +713,9 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
    uint wr_ix = 0u;
    uint part_start_ix = 0u;
    uint ready_ix = 0u;
+    cmd_ref.offset += 4u;
+    uint render_blend_depth = 0u;
+    uint max_blend_depth = 0u;
    uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
    uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
    uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
@ -720,11 +723,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
    bool mem_ok = v_260.mem_error == 0u;
    Alloc param_3;
    Alloc param_5;
-    uint _1304;
+    uint _1310;
    uint element_ix;
    Alloc param_14;
    uint tile_count;
-    uint _1605;
+    uint _1611;
    float linewidth;
    CmdLinGrad cmd_lin;
    CmdRadGrad cmd_rad;
@ -734,24 +737,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
        {
            sh_bitmaps[i][th_ix] = 0u;
        }
-        bool _1356;
+        bool _1362;
        for (;;)
        {
            if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
            {
                part_start_ix = ready_ix;
                uint count = 0u;
-                bool _1154 = th_ix < 256u;
-                bool _1162;
-                if (_1154)
+                bool _1160 = th_ix < 256u;
+                bool _1168;
+                if (_1160)
                {
-                    _1162 = (partition_ix + th_ix) < n_partitions;
+                    _1168 = (partition_ix + th_ix) < n_partitions;
                }
                else
                {
-                    _1162 = _1154;
+                    _1168 = _1160;
                }
-                if (_1162)
+                if (_1168)
                {
                    uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
                    param_3.offset = _1005.conf.bin_alloc.offset;
@ -803,13 +806,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                }
                if (part_ix > 0u)
                {
-                    _1304 = sh_part_count[part_ix - 1u];
+                    _1310 = sh_part_count[part_ix - 1u];
                }
                else
                {
-                    _1304 = part_start_ix;
+                    _1310 = part_start_ix;
                }
-                ix -= _1304;
+                ix -= _1310;
                Alloc bin_alloc = sh_part_elements[part_ix];
                BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                BinInstanceRef param_10 = inst_ref;
@ -821,16 +824,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
            }
            threadgroup_barrier(mem_flags::mem_threadgroup);
            wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1346 = (wr_ix - rd_ix) < 256u;
-            if (_1346)
+            bool _1352 = (wr_ix - rd_ix) < 256u;
+            if (_1352)
            {
-                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
            }
            else
            {
-                _1356 = _1346;
+                _1362 = _1352;
            }
-            if (_1356)
+            if (_1362)
            {
                continue;
            }
@ -843,7 +846,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
        if ((th_ix + rd_ix) < wr_ix)
        {
            element_ix = sh_elements[th_ix];
-            tag = _1372.scene[drawtag_start + element_ix];
+            tag = _1378.scene[drawtag_start + element_ix];
        }
        switch (tag)
        {
@ -913,16 +916,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                }
            }
            uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1372.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1378.scene[drawtag_start + element_ix_1];
            if (el_ix > 0u)
            {
-                _1605 = sh_tile_count[el_ix - 1u];
+                _1611 = sh_tile_count[el_ix - 1u];
            }
            else
            {
-                _1605 = 0u;
+                _1611 = 0u;
            }
-            uint seq_ix = ix_1 - _1605;
+            uint seq_ix = ix_1 - _1611;
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + (seq_ix % width);
            uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@ -941,26 +944,26 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                    uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                    uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1372.scene[dd];
+                    uint blend = _1378.scene[dd];
                    is_blend = blend != 3u;
                }
-                bool _1692 = tile.tile.offset != 0u;
-                bool _1701;
-                if (!_1692)
+                bool _1698 = tile.tile.offset != 0u;
+                bool _1707;
+                if (!_1698)
                {
-                    _1701 = (tile.backdrop == 0) == is_clip;
+                    _1707 = (tile.backdrop == 0) == is_clip;
                }
                else
                {
-                    _1701 = _1692;
+                    _1707 = _1698;
                }
-                include_tile = _1701 || is_blend;
+                include_tile = _1707 || is_blend;
            }
            if (include_tile)
            {
                uint el_slice = el_ix / 32u;
                uint el_mask = 1u << (el_ix & 31u);
-                uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1729 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
            }
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
@ -984,7 +987,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
            uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
            uint element_ix_2 = sh_elements[element_ref_ix];
            bitmap &= (bitmap - 1u);
-            uint drawtag = _1372.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1378.scene[drawtag_start + element_ix_2];
            if (clip_zero_depth == 0u)
            {
                uint param_25 = element_ref_ix;
@ -1005,11 +1008,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        Alloc param_29 = cmd_alloc;
                        CmdRef param_30 = cmd_ref;
                        uint param_31 = cmd_limit;
-                        bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
+                        bool _1854 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                        cmd_alloc = param_29;
                        cmd_ref = param_30;
                        cmd_limit = param_31;
-                        if (!_1848)
+                        if (!_1854)
                        {
                            break;
                        }
@ -1019,7 +1022,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        float param_35 = linewidth;
                        write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
                        cmd_ref = param_33;
-                        uint rgba = _1372.scene[dd_1];
+                        uint rgba = _1378.scene[dd_1];
                        Alloc param_36 = cmd_alloc;
                        CmdRef param_37 = cmd_ref;
                        CmdColor param_38 = CmdColor{ rgba };
@ -1032,11 +1035,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        Alloc param_39 = cmd_alloc;
                        CmdRef param_40 = cmd_ref;
                        uint param_41 = cmd_limit;
-                        bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
+                        bool _1895 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                        cmd_alloc = param_39;
                        cmd_ref = param_40;
                        cmd_limit = param_41;
-                        if (!_1889)
+                        if (!_1895)
                        {
                            break;
                        }
@ -1047,7 +1050,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        float param_45 = linewidth;
                        write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
                        cmd_ref = param_43;
-                        cmd_lin.index = _1372.scene[dd_1];
+                        cmd_lin.index = _1378.scene[dd_1];
                        cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
                        cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
                        cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
@ -1063,11 +1066,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        Alloc param_49 = cmd_alloc;
                        CmdRef param_50 = cmd_ref;
                        uint param_51 = cmd_limit;
-                        bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
+                        bool _1959 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                        cmd_alloc = param_49;
                        cmd_ref = param_50;
                        cmd_limit = param_51;
-                        if (!_1953)
+                        if (!_1959)
                        {
                            break;
                        }
@ -1078,7 +1081,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        float param_55 = linewidth;
                        write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
                        cmd_ref = param_53;
-                        cmd_rad.index = _1372.scene[dd_1];
+                        cmd_rad.index = _1378.scene[dd_1];
                        cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
                        cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
                        cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
@ -1097,11 +1100,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        Alloc param_59 = cmd_alloc;
                        CmdRef param_60 = cmd_ref;
                        uint param_61 = cmd_limit;
-                        bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        bool _2065 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                        cmd_alloc = param_59;
                        cmd_ref = param_60;
                        cmd_limit = param_61;
-                        if (!_2059)
+                        if (!_2065)
                        {
                            break;
                        }
@ -1111,8 +1114,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        float param_65 = linewidth;
                        write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
                        cmd_ref = param_63;
-                        uint index = _1372.scene[dd_1];
-                        uint raw1 = _1372.scene[dd_1 + 1u];
+                        uint index = _1378.scene[dd_1];
+                        uint raw1 = _1378.scene[dd_1 + 1u];
                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
                        Alloc param_66 = cmd_alloc;
                        CmdRef param_67 = cmd_ref;
@ -1123,17 +1126,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                    }
                    case 5u:
                    {
-                        bool _2112 = tile_1.tile.offset == 0u;
-                        bool _2118;
-                        if (_2112)
+                        bool _2118 = tile_1.tile.offset == 0u;
+                        bool _2124;
+                        if (_2118)
                        {
-                            _2118 = tile_1.backdrop == 0;
+                            _2124 = tile_1.backdrop == 0;
                        }
                        else
                        {
-                            _2118 = _2112;
+                            _2124 = _2118;
                        }
-                        if (_2118)
+                        if (_2124)
                        {
                            clip_zero_depth = clip_depth + 1u;
                        }
@ -1142,11 +1145,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                            Alloc param_69 = cmd_alloc;
                            CmdRef param_70 = cmd_ref;
                            uint param_71 = cmd_limit;
-                            bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            bool _2136 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                            cmd_alloc = param_69;
                            cmd_ref = param_70;
                            cmd_limit = param_71;
-                            if (!_2130)
+                            if (!_2136)
                            {
                                break;
                            }
@ -1154,6 +1157,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                            CmdRef param_73 = cmd_ref;
                            Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
                            cmd_ref.offset += 4u;
+                            render_blend_depth++;
+                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                        }
                        clip_depth++;
                        break;
@ -1164,11 +1169,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        Alloc param_74 = cmd_alloc;
                        CmdRef param_75 = cmd_ref;
                        uint param_76 = cmd_limit;
-                        bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        bool _2169 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                        cmd_alloc = param_74;
                        cmd_ref = param_75;
                        cmd_limit = param_76;
-                        if (!_2158)
+                        if (!_2169)
                        {
                            break;
                        }
@ -1178,12 +1183,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                        float param_80 = -1.0;
                        write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
                        cmd_ref = param_78;
-                        uint blend_1 = _1372.scene[dd_1];
+                        uint blend_1 = _1378.scene[dd_1];
                        Alloc param_81 = cmd_alloc;
                        CmdRef param_82 = cmd_ref;
                        CmdEndClip param_83 = CmdEndClip{ blend_1 };
                        Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
                        cmd_ref.offset += 8u;
+                        render_blend_depth--;
                        break;
                    }
                }
@ -1216,21 +1222,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
            break;
        }
    }
-    bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
-    bool _2237;
-    if (_2228)
+    bool _2241 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2250;
+    if (_2241)
    {
-        _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
+        _2250 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
    }
    else
    {
-        _2237 = _2228;
+        _2250 = _2241;
    }
-    if (_2237)
+    if (_2250)
    {
        Alloc param_84 = cmd_alloc;
        CmdRef param_85 = cmd_ref;
        Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
+        if (max_blend_depth > 4u)
+        {
+        }
    }
 }

--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@ -162,7 +162,7 @@ struct Config
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);

 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1725 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas[uv];
        float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
        rgba[i] = fg_rgba;
    }
    spvReturnValue = rgba;
@ -918,12 +918,6 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
            break;
        }
        case 13u:
-        {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
        {
            return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
        }
@ -992,16 +986,18 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)

 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1740;
+    _1740.offset = _1725.Load(24);
    Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1740.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1749 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1749;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
    float2 xy = float2(xy_uint);
    float4 rgba[8];
@ -1014,7 +1010,9 @@ void comp_main()
    float df[8];
    TileSegRef tile_seg_ref;
    float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -1035,8 +1033,8 @@ void comp_main()
                {
                    df[k] = 1000000000.0f;
                }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1854 = { stroke.tile_ref };
+                tile_seg_ref = _1854;
                do
                {
                    uint param_7 = tile_seg_ref.offset;
@ -1072,8 +1070,8 @@ void comp_main()
                {
                    area[k_3] = float(fill.backdrop);
                }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1974 = { fill.tile_ref };
+                tile_seg_ref = _1974;
                do
                {
                    uint param_15 = tile_seg_ref.offset;
@ -1162,10 +1160,10 @@ void comp_main()
                    int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                    float4 fg_rgba = gradients[int2(x, int(lin.index))];
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                    float4 fg_k_1 = fg_rgba * area[k_9];
                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                }
@ -1188,10 +1186,10 @@ void comp_main()
                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                    float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                }
@ -1205,9 +1203,9 @@ void comp_main()
                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                uint2 param_36 = xy_uint;
                CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2461[8];
+                fillImage(_2461, param_36, param_37);
+                float4 img[8] = _2461;
                for (uint k_11 = 0u; k_11 < 8u; k_11++)
                {
                    float4 fg_k_3 = img[k_11] * area[k_11];
@ -1218,13 +1216,26 @@ void comp_main()
            }
            case 9u:
            {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2566);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                }
                clip_depth++;
                cmd_ref.offset += 4u;
@ -1232,32 +1243,41 @@ void comp_main()
            }
            case 10u:
            {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                }
                cmd_ref.offset += 8u;
                break;
            }
            case 11u:
            {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2665;
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1265,9 +1285,9 @@ void comp_main()
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w);
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
    }
 }

--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@ -528,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas.read(uint2(uv));
        float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
        rgba[i] = fg_rgba;
    }
    return rgba;
@ -985,12 +985,6 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
            break;
        }
        case 13u:
-        {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
        {
            return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
        }
@ -1059,15 +1053,17 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me
    return CmdJump_read(param, param_1, v_297);
 }

-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x;
    Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1725.conf.ptcl_alloc.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
    CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
    float2 xy = float2(xy_uint);
    spvUnsafeArray<float4, 8> rgba;
@ -1080,7 +1076,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
    spvUnsafeArray<float, 8> df;
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -1226,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                    int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                    float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                    float4 fg_k_1 = fg_rgba * area[k_9];
                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                }
@ -1252,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                    float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                }
@ -1281,13 +1279,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
            }
            case 9u:
            {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2566;
+                        rgba[k_13] = float4(0.0);
+                    }
                }
                clip_depth++;
                cmd_ref.offset += 4u;
@ -1295,31 +1306,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
            }
            case 10u:
            {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                }
                cmd_ref.offset += 8u;
                break;
            }
            case 11u:
            {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1327,9 +1347,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_47 = i_1;
-        float3 param_48 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        float3 param_49 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
    }
 }

--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@ -162,7 +162,7 @@ struct Config
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);

 RWByteAddressBuffer _297 : register(u0, space0);
-ByteAddressBuffer _1749 : register(t1, space0);
+ByteAddressBuffer _1725 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas[uv];
        float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
        rgba[i] = fg_rgba;
    }
    spvReturnValue = rgba;
@ -918,12 +918,6 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode)
            break;
        }
        case 13u:
-        {
-            float rev_as = 1.0f - as;
-            float rev_ab = 1.0f - ab;
-            return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
        {
            return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab));
        }
@ -992,16 +986,18 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)

 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1764;
-    _1764.offset = _1749.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1740;
+    _1740.offset = _1725.Load(24);
    Alloc param;
-    param.offset = _1764.offset;
+    param.offset = _1740.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1773 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1773;
+    CmdRef _1749 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1749;
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    cmd_ref.offset += 4u;
    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
    float2 xy = float2(xy_uint);
    float4 rgba[8];
@ -1014,7 +1010,9 @@ void comp_main()
    float df[8];
    TileSegRef tile_seg_ref;
    float area[8];
-    uint blend_stack[128][8];
+    uint blend_stack[4][8];
+    uint base_ix_1;
+    uint bg_rgba;
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -1035,8 +1033,8 @@ void comp_main()
                {
                    df[k] = 1000000000.0f;
                }
-                TileSegRef _1867 = { stroke.tile_ref };
-                tile_seg_ref = _1867;
+                TileSegRef _1854 = { stroke.tile_ref };
+                tile_seg_ref = _1854;
                do
                {
                    uint param_7 = tile_seg_ref.offset;
@ -1072,8 +1070,8 @@ void comp_main()
                {
                    area[k_3] = float(fill.backdrop);
                }
-                TileSegRef _1987 = { fill.tile_ref };
-                tile_seg_ref = _1987;
+                TileSegRef _1974 = { fill.tile_ref };
+                tile_seg_ref = _1974;
                do
                {
                    uint param_15 = tile_seg_ref.offset;
@ -1162,10 +1160,10 @@ void comp_main()
                    int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                    float4 fg_rgba = gradients[int2(x, int(lin.index))];
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                    float4 fg_k_1 = fg_rgba * area[k_9];
                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                }
@ -1188,10 +1186,10 @@ void comp_main()
                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                    float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                }
@ -1205,9 +1203,9 @@ void comp_main()
                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                uint2 param_36 = xy_uint;
                CmdImage param_37 = fill_img;
-                float4 _2474[8];
-                fillImage(_2474, param_36, param_37);
-                float4 img[8] = _2474;
+                float4 _2461[8];
+                fillImage(_2461, param_36, param_37);
+                float4 img[8] = _2461;
                for (uint k_11 = 0u; k_11 < 8u; k_11++)
                {
                    float4 fg_k_3 = img[k_11] * area[k_11];
@ -1218,13 +1216,26 @@ void comp_main()
            }
            case 9u:
            {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = 0.0f.xxxx;
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = 0.0f.xxxx;
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        _297.Store((base_ix + k_13) * 4 + 8, _2566);
+                        rgba[k_13] = 0.0f.xxxx;
+                    }
                }
                clip_depth++;
                cmd_ref.offset += 4u;
@ -1232,32 +1243,41 @@ void comp_main()
            }
            case 10u:
            {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41);
                clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                }
                cmd_ref.offset += 8u;
                break;
            }
            case 11u:
            {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref };
-                cmd_ref = _2615;
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2665;
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1265,8 +1285,8 @@ void comp_main()
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_47 = i_1;
-        image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x;
+        uint param_48 = i_1;
+        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
    }
 }

--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@ -528,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas.read(uint2(uv));
        float3 param_1 = fg_rgba.xyz;
-        float3 _1721 = fromsRGB(param_1);
-        fg_rgba.x = _1721.x;
-        fg_rgba.y = _1721.y;
-        fg_rgba.z = _1721.z;
+        float3 _1697 = fromsRGB(param_1);
+        fg_rgba.x = _1697.x;
+        fg_rgba.y = _1697.y;
+        fg_rgba.z = _1697.z;
        rgba[i] = fg_rgba;
    }
    return rgba;
@ -985,12 +985,6 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
            break;
        }
        case 13u:
-        {
-            float rev_as = 1.0 - as;
-            float rev_ab = 1.0 - ab;
-            return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab));
-        }
-        case 14u:
        {
            return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab));
        }
@ -1059,15 +1053,17 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me
    return CmdJump_read(param, param_1, v_297);
 }

-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x;
    Alloc param;
-    param.offset = _1749.conf.ptcl_alloc.offset;
+    param.offset = _1725.conf.ptcl_alloc.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
    CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
+    uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)];
+    cmd_ref.offset += 4u;
    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
    float2 xy = float2(xy_uint);
    spvUnsafeArray<float4, 8> rgba;
@ -1080,7 +1076,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
    spvUnsafeArray<float, 8> df;
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
-    spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
+    spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
+    uint base_ix_1;
+    uint bg_rgba;
    while (mem_ok)
    {
        Alloc param_3 = cmd_alloc;
@ -1226,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                    int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                    float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2321 = fromsRGB(param_29);
-                    fg_rgba.x = _2321.x;
-                    fg_rgba.y = _2321.y;
-                    fg_rgba.z = _2321.z;
+                    float3 _2308 = fromsRGB(param_29);
+                    fg_rgba.x = _2308.x;
+                    fg_rgba.y = _2308.y;
+                    fg_rgba.z = _2308.z;
                    float4 fg_k_1 = fg_rgba * area[k_9];
                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                }
@ -1252,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                    float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2431 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2431.x;
-                    fg_rgba_1.y = _2431.y;
-                    fg_rgba_1.z = _2431.z;
+                    float3 _2418 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2418.x;
+                    fg_rgba_1.y = _2418.y;
+                    fg_rgba_1.z = _2418.z;
                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                }
@ -1281,13 +1279,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
            }
            case 9u:
            {
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                if (clip_depth < 4u)
                {
-                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_38 = float4(rgba[k_12]);
-                    uint _2537 = packsRGB(param_38);
-                    blend_stack[d_2][k_12] = _2537;
-                    rgba[k_12] = float4(0.0);
+                    for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                    {
+                        float4 param_38 = float4(rgba[k_12]);
+                        uint _2523 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2523;
+                        rgba[k_12] = float4(0.0);
+                    }
+                }
+                else
+                {
+                    uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                    for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                    {
+                        float4 param_39 = float4(rgba[k_13]);
+                        uint _2566 = packsRGB(param_39);
+                        v_297.memory[base_ix + k_13] = _2566;
+                        rgba[k_13] = float4(0.0);
+                    }
                }
                clip_depth++;
                cmd_ref.offset += 4u;
@ -1295,31 +1306,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
            }
            case 10u:
            {
-                Alloc param_39 = cmd_alloc;
-                CmdRef param_40 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297);
-                uint blend_mode = end_clip.blend >> uint(8);
-                uint comp_mode = end_clip.blend & 255u;
+                Alloc param_40 = cmd_alloc;
+                CmdRef param_41 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297);
                clip_depth--;
-                for (uint k_13 = 0u; k_13 < 8u; k_13++)
+                if (clip_depth < 4u)
                {
-                    uint d_3 = min(clip_depth, 127u);
-                    uint param_41 = blend_stack[d_3][k_13];
-                    float4 bg = unpacksRGB(param_41);
-                    float4 fg_1 = rgba[k_13] * area[k_13];
-                    float4 param_42 = bg;
-                    float4 param_43 = fg_1;
-                    uint param_44 = end_clip.blend;
-                    rgba[k_13] = mix_blend_compose(param_42, param_43, param_44);
+                    base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y)));
+                }
+                for (uint k_14 = 0u; k_14 < 8u; k_14++)
+                {
+                    if (clip_depth < 4u)
+                    {
+                        bg_rgba = blend_stack[clip_depth][k_14];
+                    }
+                    else
+                    {
+                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                    }
+                    uint param_42 = bg_rgba;
+                    float4 bg = unpacksRGB(param_42);
+                    float4 fg_1 = rgba[k_14] * area[k_14];
+                    float4 param_43 = bg;
+                    float4 param_44 = fg_1;
+                    uint param_45 = end_clip.blend;
+                    rgba[k_14] = mix_blend_compose(param_43, param_44, param_45);
                }
                cmd_ref.offset += 8u;
                break;
            }
            case 11u:
            {
-                Alloc param_45 = cmd_alloc;
-                CmdRef param_46 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref };
+                Alloc param_46 = cmd_alloc;
+                CmdRef param_47 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref };
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1327,8 +1347,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_47 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47))));
+        uint param_48 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
    }
 }

--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -87,11 +87,14 @@ void main() {
    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);

+    uint blend_offset = memory[cmd_ref.offset >> 2];
+    cmd_ref.offset += 4;
+
    uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
                          gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
    vec2 xy = vec2(xy_uint);
    mediump vec4 rgba[CHUNK];
-    uint blend_stack[MAX_BLEND_STACK][CHUNK];
+    uint blend_stack[BLEND_STACK_SPLIT][CHUNK];
    for (uint i = 0; i < CHUNK; i++) {
        rgba[i] = vec4(0.0);
    }
@ -223,24 +226,38 @@ void main() {
            cmd_ref.offset += 4 + CmdImage_size;
            break;
        case Cmd_BeginClip:
-            for (uint k = 0; k < CHUNK; k++) {
-                // We reject any inputs that might overflow in render_ctx.rs.
-                // The following is a sanity check so we don't corrupt memory should there be malformed inputs.
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                blend_stack[d][k] = packsRGB(vec4(rgba[k]));
-                rgba[k] = vec4(0.0);
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
+            } else {
+                uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+                for (uint k = 0; k < CHUNK; k++) {
+                    memory[base_ix + k] = packsRGB(vec4(rgba[k]));
+                    rgba[k] = vec4(0.0);
+                }
            }
            clip_depth++;
            cmd_ref.offset += 4;
            break;
        case Cmd_EndClip:
            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
-            uint blend_mode = uint(end_clip.blend >> 8);
-            uint comp_mode = uint(end_clip.blend & 0xFF);
            clip_depth--;
+            uint base_ix;
+            if (clip_depth < BLEND_STACK_SPLIT) {
+                base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
+                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
+            }
            for (uint k = 0; k < CHUNK; k++) {
-                uint d = min(clip_depth, MAX_BLEND_STACK - 1);
-                mediump vec4 bg = unpacksRGB(blend_stack[d][k]);
+                uint bg_rgba;
+                if (clip_depth < BLEND_STACK_SPLIT) {
+                    bg_rgba = blend_stack[clip_depth][k];
+                } else {
+                    bg_rgba = memory[base_ix + k];
+                }
+                mediump vec4 bg = unpacksRGB(bg_rgba);
                mediump vec4 fg = rgba[k] * area[k];
                rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
            }
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@ -27,6 +27,10 @@

 #define GRADIENT_WIDTH 512

+// We allocate this many blend stack entries in registers, and spill
+// to memory for the overflow.
+#define BLEND_STACK_SPLIT 4
+
 #ifdef ERR_MALLOC_FAILED
 struct Config {
    uint n_elements; // paths
@ -91,7 +95,7 @@ struct Config {
 #define MODE_STROKE 1

 // Size of kernel4 clip state, in words.
-#define CLIP_STATE_SIZE 2
+#define CLIP_STATE_SIZE 1

 // fill_mode_from_flags extracts the fill mode from tag flags.
 uint fill_mode_from_flags(uint flags) {
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -34,8 +34,6 @@ const TILE_H: usize = 16;

 const PTCL_INITIAL_ALLOC: usize = 1024;

-const MAX_BLEND_STACK: usize = 128;
-
 #[allow(unused)]
 fn dump_scene(buf: &[u8]) {
    for i in 0..(buf.len() / 4) {
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@ -2,7 +2,6 @@ use std::borrow::Cow;

 use crate::encoder::GlyphEncoder;
 use crate::stages::{Config, Transform};
-use crate::MAX_BLEND_STACK;
 use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
 use piet::{
    Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@ -230,9 +229,6 @@ impl RenderContext for PietGpuRenderContext {
        let path = shape.path_elements(TOLERANCE);
        self.encode_path(path, true);
        self.new_encoder.begin_clip(None);
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
        self.clip_stack.push(ClipElement { blend: None });
        if let Some(tos) = self.state_stack.last_mut() {
            tos.n_clip += 1;
@ -334,9 +330,6 @@ impl PietGpuRenderContext {
        let path = shape.path_elements(TOLERANCE);
        self.encode_path(path, true);
        self.new_encoder.begin_clip(Some(blend));
-        if self.clip_stack.len() >= MAX_BLEND_STACK {
-            panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK);
-        }
        self.clip_stack.push(ClipElement { blend: Some(blend) });
        if let Some(tos) = self.state_stack.last_mut() {
            tos.n_clip += 1;