diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil
index 0fb9622..50f5bad 100644
Binary files a/piet-gpu/shader/gen/backdrop.dxil and b/piet-gpu/shader/gen/backdrop.dxil differ
diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl
index aba3cff..2ed8898 100644
--- a/piet-gpu/shader/gen/backdrop.hlsl
+++ b/piet-gpu/shader/gen/backdrop.hlsl
@@ -21,6 +21,7 @@ struct Path
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -52,8 +53,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _67 : register(u0, space0);
-ByteAddressBuffer _166 : register(t1, space0);
+RWByteAddressBuffer _59 : register(u0, space0);
+ByteAddressBuffer _181 : register(t1, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -69,6 +70,13 @@ groupshared uint sh_row_width[256];
 groupshared Alloc sh_row_alloc[256];
 groupshared uint sh_row_count[256];
 
+bool check_deps(uint dep_stage)
+{
+    uint _65;
+    _59.InterlockedOr(4, 0u, _65);
+    return (_65 & dep_stage) == 0u;
+}
+
 bool touch_mem(Alloc alloc, uint offset)
 {
     return true;
@@ -82,7 +90,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _67.Load(offset * 4 + 8);
+    uint v = _59.Load(offset * 4 + 12);
     return v;
 }
 
@@ -100,8 +108,8 @@ Path Path_read(Alloc a, PathRef ref)
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _134 = { raw2 };
-    s.tiles = _134;
+    TileRef _146 = { raw2 };
+    s.tiles = _146;
     return s;
 }
 
@@ -120,47 +128,52 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _67.Store(offset * 4 + 8, val);
+    _59.Store(offset * 4 + 12, val);
 }
 
 void comp_main()
 {
+    uint param = 7u;
+    bool _154 = check_deps(param);
+    if (!_154)
+    {
+        return;
+    }
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
     uint row_count = 0u;
-    bool mem_ok = _67.Load(4) == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _166.Load(0))
+        if (element_ix < _181.Load(4))
         {
-            PathRef _180 = { _166.Load(16) + (element_ix * 12u) };
-            PathRef path_ref = _180;
-            Alloc _185;
-            _185.offset = _166.Load(16);
-            Alloc param;
-            param.offset = _185.offset;
-            PathRef param_1 = path_ref;
-            Path path = Path_read(param, param_1);
+            PathRef _195 = { _181.Load(20) + (element_ix * 12u) };
+            PathRef path_ref = _195;
+            Alloc _200;
+            _200.offset = _181.Load(20);
+            Alloc param_1;
+            param_1.offset = _200.offset;
+            PathRef param_2 = path_ref;
+            Path path = Path_read(param_1, param_2);
             sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
             row_count = path.bbox.w - path.bbox.y;
-            bool _210 = row_count == 1u;
-            bool _216;
-            if (_210)
+            bool _225 = row_count == 1u;
+            bool _231;
+            if (_225)
             {
-                _216 = path.bbox.y > 0u;
+                _231 = path.bbox.y > 0u;
             }
             else
             {
-                _216 = _210;
+                _231 = _225;
             }
-            if (_216)
+            if (_231)
             {
                 row_count = 0u;
             }
-            uint param_2 = path.tiles.offset;
-            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-            bool param_4 = mem_ok;
-            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            uint param_3 = path.tiles.offset;
+            uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_5 = true;
+            Alloc path_alloc = new_alloc(param_3, param_4, param_5);
             sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
@@ -168,17 +181,17 @@ void comp_main()
     for (uint i = 0u; i < 8u; i++)
     {
         GroupMemoryBarrierWithGroupSync();
-        bool _262 = gl_LocalInvocationID.y == 0u;
-        bool _269;
-        if (_262)
+        bool _276 = gl_LocalInvocationID.y == 0u;
+        bool _283;
+        if (_276)
         {
-            _269 = th_ix >= (1u << i);
+            _283 = th_ix >= (1u << i);
         }
         else
         {
-            _269 = _262;
+            _283 = _276;
         }
-        if (_269)
+        if (_283)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -190,7 +203,7 @@ void comp_main()
     }
     GroupMemoryBarrierWithGroupSync();
     uint total_rows = sh_row_count[255];
-    uint _348;
+    uint _360;
     for (uint row = th_ix; row < total_rows; row += 256u)
     {
         uint el_ix = 0u;
@@ -203,32 +216,32 @@ void comp_main()
             }
         }
         uint width = sh_row_width[el_ix];
-        if ((width > 0u) && mem_ok)
+        if (width > 0u)
         {
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _348 = sh_row_count[el_ix - 1u];
+                _360 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _348 = 0u;
+                _360 = 0u;
             }
-            uint seq_ix = row - _348;
+            uint seq_ix = row - _360;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_5 = tiles_alloc;
-            uint param_6 = tile_el_ix;
-            uint sum = read_mem(param_5, param_6);
+            Alloc param_6 = tiles_alloc;
+            uint param_7 = tile_el_ix;
+            uint sum = read_mem(param_6, param_7);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_7 = tiles_alloc;
-                uint param_8 = tile_el_ix;
-                sum += read_mem(param_7, param_8);
-                Alloc param_9 = tiles_alloc;
-                uint param_10 = tile_el_ix;
-                uint param_11 = sum;
-                write_mem(param_9, param_10, param_11);
+                Alloc param_8 = tiles_alloc;
+                uint param_9 = tile_el_ix;
+                sum += read_mem(param_8, param_9);
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                uint param_12 = sum;
+                write_mem(param_10, param_11, param_12);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl
index 1c0a0bb..3726dff 100644
--- a/piet-gpu/shader/gen/backdrop.msl
+++ b/piet-gpu/shader/gen/backdrop.msl
@@ -1,7 +1,9 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
 
 #include <metal_stdlib>
 #include <simd/simd.h>
+#include <metal_atomic>
 
 using namespace metal;
 
@@ -30,6 +32,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -40,6 +43,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -76,6 +80,13 @@ struct ConfigBuf
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_59)
+{
+    uint _65 = atomic_fetch_or_explicit((device atomic_uint*)&v_59.mem_error, 0u, memory_order_relaxed);
+    return (_65 & dep_stage) == 0u;
+}
+
 static inline __attribute__((always_inline))
 bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 {
@@ -83,7 +94,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_59)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -91,23 +102,23 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_67.memory[offset];
+    uint v = v_59.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_59)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_67);
+    uint raw0 = read_mem(param, param_1, v_59);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_67);
+    uint raw1 = read_mem(param_2, param_3, v_59);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_67);
+    uint raw2 = read_mem(param_4, param_5, v_59);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -123,7 +134,7 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_59)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -131,47 +142,52 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_67.memory[offset] = val;
+    v_59.memory[offset] = val;
 }
 
-kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_59 [[buffer(0)]], const device ConfigBuf& _181 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_row_width[256];
     threadgroup Alloc sh_row_alloc[256];
     threadgroup uint sh_row_count[256];
+    uint param = 7u;
+    bool _154 = check_deps(param, v_59);
+    if (!_154)
+    {
+        return;
+    }
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
     uint row_count = 0u;
-    bool mem_ok = v_67.mem_error == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _166.conf.n_elements)
+        if (element_ix < _181.conf.n_elements)
         {
-            PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) };
-            Alloc param;
-            param.offset = _166.conf.tile_alloc.offset;
-            PathRef param_1 = path_ref;
-            Path path = Path_read(param, param_1, v_67);
+            PathRef path_ref = PathRef{ _181.conf.tile_alloc.offset + (element_ix * 12u) };
+            Alloc param_1;
+            param_1.offset = _181.conf.tile_alloc.offset;
+            PathRef param_2 = path_ref;
+            Path path = Path_read(param_1, param_2, v_59);
             sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
             row_count = path.bbox.w - path.bbox.y;
-            bool _210 = row_count == 1u;
-            bool _216;
-            if (_210)
+            bool _225 = row_count == 1u;
+            bool _231;
+            if (_225)
             {
-                _216 = path.bbox.y > 0u;
+                _231 = path.bbox.y > 0u;
             }
             else
             {
-                _216 = _210;
+                _231 = _225;
             }
-            if (_216)
+            if (_231)
             {
                 row_count = 0u;
             }
-            uint param_2 = path.tiles.offset;
-            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-            bool param_4 = mem_ok;
-            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            uint param_3 = path.tiles.offset;
+            uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_5 = true;
+            Alloc path_alloc = new_alloc(param_3, param_4, param_5);
             sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
@@ -179,17 +195,17 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16
     for (uint i = 0u; i < 8u; i++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        bool _262 = gl_LocalInvocationID.y == 0u;
-        bool _269;
-        if (_262)
+        bool _276 = gl_LocalInvocationID.y == 0u;
+        bool _283;
+        if (_276)
         {
-            _269 = th_ix >= (1u << i);
+            _283 = th_ix >= (1u << i);
         }
         else
         {
-            _269 = _262;
+            _283 = _276;
         }
-        if (_269)
+        if (_283)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -201,7 +217,7 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint total_rows = sh_row_count[255];
-    uint _348;
+    uint _360;
     for (uint row = th_ix; row < total_rows; row += 256u)
     {
         uint el_ix = 0u;
@@ -214,32 +230,32 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16
             }
         }
         uint width = sh_row_width[el_ix];
-        if ((width > 0u) && mem_ok)
+        if (width > 0u)
         {
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _348 = sh_row_count[el_ix - 1u];
+                _360 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _348 = 0u;
+                _360 = 0u;
             }
-            uint seq_ix = row - _348;
+            uint seq_ix = row - _360;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_5 = tiles_alloc;
-            uint param_6 = tile_el_ix;
-            uint sum = read_mem(param_5, param_6, v_67);
+            Alloc param_6 = tiles_alloc;
+            uint param_7 = tile_el_ix;
+            uint sum = read_mem(param_6, param_7, v_59);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_7 = tiles_alloc;
-                uint param_8 = tile_el_ix;
-                sum += read_mem(param_7, param_8, v_67);
-                Alloc param_9 = tiles_alloc;
-                uint param_10 = tile_el_ix;
-                uint param_11 = sum;
-                write_mem(param_9, param_10, param_11, v_67);
+                Alloc param_8 = tiles_alloc;
+                uint param_9 = tile_el_ix;
+                sum += read_mem(param_8, param_9, v_59);
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                uint param_12 = sum;
+                write_mem(param_10, param_11, param_12, v_59);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv
index 2bd17d8..b8a74ea 100644
Binary files a/piet-gpu/shader/gen/backdrop.spv and b/piet-gpu/shader/gen/backdrop.spv differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil
index e24a6d3..06bacaf 100644
Binary files a/piet-gpu/shader/gen/backdrop_lg.dxil and b/piet-gpu/shader/gen/backdrop_lg.dxil differ
diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl
index c506403..e547762 100644
--- a/piet-gpu/shader/gen/backdrop_lg.hlsl
+++ b/piet-gpu/shader/gen/backdrop_lg.hlsl
@@ -21,6 +21,7 @@ struct Path
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -52,8 +53,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u);
 
-RWByteAddressBuffer _67 : register(u0, space0);
-ByteAddressBuffer _166 : register(t1, space0);
+RWByteAddressBuffer _59 : register(u0, space0);
+ByteAddressBuffer _181 : register(t1, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -69,6 +70,13 @@ groupshared uint sh_row_width[256];
 groupshared Alloc sh_row_alloc[256];
 groupshared uint sh_row_count[256];
 
+bool check_deps(uint dep_stage)
+{
+    uint _65;
+    _59.InterlockedOr(4, 0u, _65);
+    return (_65 & dep_stage) == 0u;
+}
+
 bool touch_mem(Alloc alloc, uint offset)
 {
     return true;
@@ -82,7 +90,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _67.Load(offset * 4 + 8);
+    uint v = _59.Load(offset * 4 + 12);
     return v;
 }
 
@@ -100,8 +108,8 @@ Path Path_read(Alloc a, PathRef ref)
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _134 = { raw2 };
-    s.tiles = _134;
+    TileRef _146 = { raw2 };
+    s.tiles = _146;
     return s;
 }
 
@@ -120,47 +128,52 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _67.Store(offset * 4 + 8, val);
+    _59.Store(offset * 4 + 12, val);
 }
 
 void comp_main()
 {
+    uint param = 7u;
+    bool _154 = check_deps(param);
+    if (!_154)
+    {
+        return;
+    }
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
     uint row_count = 0u;
-    bool mem_ok = _67.Load(4) == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _166.Load(0))
+        if (element_ix < _181.Load(4))
         {
-            PathRef _180 = { _166.Load(16) + (element_ix * 12u) };
-            PathRef path_ref = _180;
-            Alloc _185;
-            _185.offset = _166.Load(16);
-            Alloc param;
-            param.offset = _185.offset;
-            PathRef param_1 = path_ref;
-            Path path = Path_read(param, param_1);
+            PathRef _195 = { _181.Load(20) + (element_ix * 12u) };
+            PathRef path_ref = _195;
+            Alloc _200;
+            _200.offset = _181.Load(20);
+            Alloc param_1;
+            param_1.offset = _200.offset;
+            PathRef param_2 = path_ref;
+            Path path = Path_read(param_1, param_2);
             sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
             row_count = path.bbox.w - path.bbox.y;
-            bool _210 = row_count == 1u;
-            bool _216;
-            if (_210)
+            bool _225 = row_count == 1u;
+            bool _231;
+            if (_225)
             {
-                _216 = path.bbox.y > 0u;
+                _231 = path.bbox.y > 0u;
             }
             else
             {
-                _216 = _210;
+                _231 = _225;
             }
-            if (_216)
+            if (_231)
             {
                 row_count = 0u;
             }
-            uint param_2 = path.tiles.offset;
-            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-            bool param_4 = mem_ok;
-            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            uint param_3 = path.tiles.offset;
+            uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_5 = true;
+            Alloc path_alloc = new_alloc(param_3, param_4, param_5);
             sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
@@ -168,17 +181,17 @@ void comp_main()
     for (uint i = 0u; i < 8u; i++)
     {
         GroupMemoryBarrierWithGroupSync();
-        bool _262 = gl_LocalInvocationID.y == 0u;
-        bool _269;
-        if (_262)
+        bool _276 = gl_LocalInvocationID.y == 0u;
+        bool _283;
+        if (_276)
         {
-            _269 = th_ix >= (1u << i);
+            _283 = th_ix >= (1u << i);
         }
         else
         {
-            _269 = _262;
+            _283 = _276;
         }
-        if (_269)
+        if (_283)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -190,7 +203,7 @@ void comp_main()
     }
     GroupMemoryBarrierWithGroupSync();
     uint total_rows = sh_row_count[255];
-    uint _348;
+    uint _360;
     for (uint row = th_ix; row < total_rows; row += 1024u)
     {
         uint el_ix = 0u;
@@ -203,32 +216,32 @@ void comp_main()
             }
         }
         uint width = sh_row_width[el_ix];
-        if ((width > 0u) && mem_ok)
+        if (width > 0u)
         {
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _348 = sh_row_count[el_ix - 1u];
+                _360 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _348 = 0u;
+                _360 = 0u;
             }
-            uint seq_ix = row - _348;
+            uint seq_ix = row - _360;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_5 = tiles_alloc;
-            uint param_6 = tile_el_ix;
-            uint sum = read_mem(param_5, param_6);
+            Alloc param_6 = tiles_alloc;
+            uint param_7 = tile_el_ix;
+            uint sum = read_mem(param_6, param_7);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_7 = tiles_alloc;
-                uint param_8 = tile_el_ix;
-                sum += read_mem(param_7, param_8);
-                Alloc param_9 = tiles_alloc;
-                uint param_10 = tile_el_ix;
-                uint param_11 = sum;
-                write_mem(param_9, param_10, param_11);
+                Alloc param_8 = tiles_alloc;
+                uint param_9 = tile_el_ix;
+                sum += read_mem(param_8, param_9);
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                uint param_12 = sum;
+                write_mem(param_10, param_11, param_12);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl
index de43ebe..68f0905 100644
--- a/piet-gpu/shader/gen/backdrop_lg.msl
+++ b/piet-gpu/shader/gen/backdrop_lg.msl
@@ -1,7 +1,9 @@
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wunused-variable"
 
 #include <metal_stdlib>
 #include <simd/simd.h>
+#include <metal_atomic>
 
 using namespace metal;
 
@@ -30,6 +32,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -40,6 +43,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -76,6 +80,13 @@ struct ConfigBuf
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u);
 
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_59)
+{
+    uint _65 = atomic_fetch_or_explicit((device atomic_uint*)&v_59.mem_error, 0u, memory_order_relaxed);
+    return (_65 & dep_stage) == 0u;
+}
+
 static inline __attribute__((always_inline))
 bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 {
@@ -83,7 +94,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_59)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -91,23 +102,23 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_67.memory[offset];
+    uint v = v_59.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_59)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_67);
+    uint raw0 = read_mem(param, param_1, v_59);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_67);
+    uint raw1 = read_mem(param_2, param_3, v_59);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_67);
+    uint raw2 = read_mem(param_4, param_5, v_59);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -123,7 +134,7 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_59)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -131,47 +142,52 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_67.memory[offset] = val;
+    v_59.memory[offset] = val;
 }
 
-kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_59 [[buffer(0)]], const device ConfigBuf& _181 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_row_width[256];
     threadgroup Alloc sh_row_alloc[256];
     threadgroup uint sh_row_count[256];
+    uint param = 7u;
+    bool _154 = check_deps(param, v_59);
+    if (!_154)
+    {
+        return;
+    }
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
     uint row_count = 0u;
-    bool mem_ok = v_67.mem_error == 0u;
     if (gl_LocalInvocationID.y == 0u)
     {
-        if (element_ix < _166.conf.n_elements)
+        if (element_ix < _181.conf.n_elements)
         {
-            PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) };
-            Alloc param;
-            param.offset = _166.conf.tile_alloc.offset;
-            PathRef param_1 = path_ref;
-            Path path = Path_read(param, param_1, v_67);
+            PathRef path_ref = PathRef{ _181.conf.tile_alloc.offset + (element_ix * 12u) };
+            Alloc param_1;
+            param_1.offset = _181.conf.tile_alloc.offset;
+            PathRef param_2 = path_ref;
+            Path path = Path_read(param_1, param_2, v_59);
             sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
             row_count = path.bbox.w - path.bbox.y;
-            bool _210 = row_count == 1u;
-            bool _216;
-            if (_210)
+            bool _225 = row_count == 1u;
+            bool _231;
+            if (_225)
             {
-                _216 = path.bbox.y > 0u;
+                _231 = path.bbox.y > 0u;
             }
             else
             {
-                _216 = _210;
+                _231 = _225;
             }
-            if (_216)
+            if (_231)
             {
                 row_count = 0u;
             }
-            uint param_2 = path.tiles.offset;
-            uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-            bool param_4 = mem_ok;
-            Alloc path_alloc = new_alloc(param_2, param_3, param_4);
+            uint param_3 = path.tiles.offset;
+            uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_5 = true;
+            Alloc path_alloc = new_alloc(param_3, param_4, param_5);
             sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
@@ -179,17 +195,17 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16
     for (uint i = 0u; i < 8u; i++)
     {
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        bool _262 = gl_LocalInvocationID.y == 0u;
-        bool _269;
-        if (_262)
+        bool _276 = gl_LocalInvocationID.y == 0u;
+        bool _283;
+        if (_276)
         {
-            _269 = th_ix >= (1u << i);
+            _283 = th_ix >= (1u << i);
         }
         else
         {
-            _269 = _262;
+            _283 = _276;
         }
-        if (_269)
+        if (_283)
         {
             row_count += sh_row_count[th_ix - (1u << i)];
         }
@@ -201,7 +217,7 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint total_rows = sh_row_count[255];
-    uint _348;
+    uint _360;
     for (uint row = th_ix; row < total_rows; row += 1024u)
     {
         uint el_ix = 0u;
@@ -214,32 +230,32 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16
             }
         }
         uint width = sh_row_width[el_ix];
-        if ((width > 0u) && mem_ok)
+        if (width > 0u)
         {
             Alloc tiles_alloc = sh_row_alloc[el_ix];
             if (el_ix > 0u)
             {
-                _348 = sh_row_count[el_ix - 1u];
+                _360 = sh_row_count[el_ix - 1u];
             }
             else
             {
-                _348 = 0u;
+                _360 = 0u;
             }
-            uint seq_ix = row - _348;
+            uint seq_ix = row - _360;
             uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
-            Alloc param_5 = tiles_alloc;
-            uint param_6 = tile_el_ix;
-            uint sum = read_mem(param_5, param_6, v_67);
+            Alloc param_6 = tiles_alloc;
+            uint param_7 = tile_el_ix;
+            uint sum = read_mem(param_6, param_7, v_59);
             for (uint x = 1u; x < width; x++)
             {
                 tile_el_ix += 2u;
-                Alloc param_7 = tiles_alloc;
-                uint param_8 = tile_el_ix;
-                sum += read_mem(param_7, param_8, v_67);
-                Alloc param_9 = tiles_alloc;
-                uint param_10 = tile_el_ix;
-                uint param_11 = sum;
-                write_mem(param_9, param_10, param_11, v_67);
+                Alloc param_8 = tiles_alloc;
+                uint param_9 = tile_el_ix;
+                sum += read_mem(param_8, param_9, v_59);
+                Alloc param_10 = tiles_alloc;
+                uint param_11 = tile_el_ix;
+                uint param_12 = sum;
+                write_mem(param_10, param_11, param_12, v_59);
             }
         }
     }
diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv
index ff2b1d7..2819ec5 100644
Binary files a/piet-gpu/shader/gen/backdrop_lg.spv and b/piet-gpu/shader/gen/backdrop_lg.spv differ
diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil
index 6655b7f..82cfb03 100644
Binary files a/piet-gpu/shader/gen/bbox_clear.dxil and b/piet-gpu/shader/gen/bbox_clear.dxil differ
diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl
index 8a884d3..5d29894 100644
--- a/piet-gpu/shader/gen/bbox_clear.hlsl
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@@ -5,6 +5,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -48,13 +49,13 @@ struct SPIRV_Cross_Input
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x;
-    if (ix < _21.Load(76))
+    if (ix < _21.Load(80))
     {
-        uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix);
-        _45.Store(out_ix * 4 + 8, 65535u);
-        _45.Store((out_ix + 1u) * 4 + 8, 65535u);
-        _45.Store((out_ix + 2u) * 4 + 8, 0u);
-        _45.Store((out_ix + 3u) * 4 + 8, 0u);
+        uint out_ix = (_21.Load(44) >> uint(2)) + (6u * ix);
+        _45.Store(out_ix * 4 + 12, 65535u);
+        _45.Store((out_ix + 1u) * 4 + 12, 65535u);
+        _45.Store((out_ix + 2u) * 4 + 12, 0u);
+        _45.Store((out_ix + 3u) * 4 + 12, 0u);
     }
 }
 
diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl
index c278c68..289fc9a 100644
--- a/piet-gpu/shader/gen/bbox_clear.msl
+++ b/piet-gpu/shader/gen/bbox_clear.msl
@@ -10,6 +10,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -48,6 +49,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv
index 58a270e..f1ef371 100644
Binary files a/piet-gpu/shader/gen/bbox_clear.spv and b/piet-gpu/shader/gen/bbox_clear.spv differ
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
index 3050aa8..5c89a15 100644
Binary files a/piet-gpu/shader/gen/binning.dxil and b/piet-gpu/shader/gen/binning.dxil differ
diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl
index 986f42b..7096371 100644
--- a/piet-gpu/shader/gen/binning.hlsl
+++ b/piet-gpu/shader/gen/binning.hlsl
@@ -3,22 +3,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
-struct BinInstanceRef
-{
-    uint offset;
-};
-
-struct BinInstance
-{
-    uint element_ix;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
@@ -29,6 +13,7 @@ struct DrawMonoid
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -60,8 +45,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _81 : register(u0, space0);
-ByteAddressBuffer _156 : register(t1, space0);
+RWByteAddressBuffer _57 : register(u0, space0);
+ByteAddressBuffer _101 : register(t1, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -72,39 +57,38 @@ struct SPIRV_Cross_Input
 };
 
 groupshared uint bitmaps[8][256];
-groupshared bool sh_alloc_failed;
 groupshared uint count[8][256];
-groupshared Alloc sh_chunk_alloc[256];
+groupshared uint sh_chunk_offset[256];
 
 DrawMonoid load_draw_monoid(uint element_ix)
 {
-    uint base = (_156.Load(44) >> uint(2)) + (4u * element_ix);
-    uint path_ix = _81.Load(base * 4 + 8);
-    uint clip_ix = _81.Load((base + 1u) * 4 + 8);
-    uint scene_offset = _81.Load((base + 2u) * 4 + 8);
-    uint info_offset = _81.Load((base + 3u) * 4 + 8);
-    DrawMonoid _190 = { path_ix, clip_ix, scene_offset, info_offset };
-    return _190;
+    uint base = (_101.Load(48) >> uint(2)) + (4u * element_ix);
+    uint path_ix = _57.Load(base * 4 + 12);
+    uint clip_ix = _57.Load((base + 1u) * 4 + 12);
+    uint scene_offset = _57.Load((base + 2u) * 4 + 12);
+    uint info_offset = _57.Load((base + 3u) * 4 + 12);
+    DrawMonoid _136 = { path_ix, clip_ix, scene_offset, info_offset };
+    return _136;
 }
 
 float4 load_clip_bbox(uint clip_ix)
 {
-    uint base = (_156.Load(60) >> uint(2)) + (4u * clip_ix);
-    float x0 = asfloat(_81.Load(base * 4 + 8));
-    float y0 = asfloat(_81.Load((base + 1u) * 4 + 8));
-    float x1 = asfloat(_81.Load((base + 2u) * 4 + 8));
-    float y1 = asfloat(_81.Load((base + 3u) * 4 + 8));
+    uint base = (_101.Load(64) >> uint(2)) + (4u * clip_ix);
+    float x0 = asfloat(_57.Load(base * 4 + 12));
+    float y0 = asfloat(_57.Load((base + 1u) * 4 + 12));
+    float x1 = asfloat(_57.Load((base + 2u) * 4 + 12));
+    float y1 = asfloat(_57.Load((base + 3u) * 4 + 12));
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
 float4 load_path_bbox(uint path_ix)
 {
-    uint base = (_156.Load(40) >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(_81.Load(base * 4 + 8)) - 32768.0f;
-    float bbox_t = float(_81.Load((base + 1u) * 4 + 8)) - 32768.0f;
-    float bbox_r = float(_81.Load((base + 2u) * 4 + 8)) - 32768.0f;
-    float bbox_b = float(_81.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    uint base = (_101.Load(44) >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(_57.Load(base * 4 + 12)) - 32768.0f;
+    float bbox_t = float(_57.Load((base + 1u) * 4 + 12)) - 32768.0f;
+    float bbox_r = float(_57.Load((base + 2u) * 4 + 12)) - 32768.0f;
+    float bbox_b = float(_57.Load((base + 3u) * 4 + 12)) - 32768.0f;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
@@ -116,41 +100,25 @@ float4 bbox_intersect(float4 a, float4 b)
 
 void store_draw_bbox(uint draw_ix, float4 bbox)
 {
-    uint base = (_156.Load(64) >> uint(2)) + (4u * draw_ix);
-    _81.Store(base * 4 + 8, asuint(bbox.x));
-    _81.Store((base + 1u) * 4 + 8, asuint(bbox.y));
-    _81.Store((base + 2u) * 4 + 8, asuint(bbox.z));
-    _81.Store((base + 3u) * 4 + 8, asuint(bbox.w));
+    uint base = (_101.Load(68) >> uint(2)) + (4u * draw_ix);
+    _57.Store(base * 4 + 12, asuint(bbox.x));
+    _57.Store((base + 1u) * 4 + 12, asuint(bbox.y));
+    _57.Store((base + 2u) * 4 + 12, asuint(bbox.z));
+    _57.Store((base + 3u) * 4 + 12, asuint(bbox.w));
 }
 
-Alloc new_alloc(uint offset, uint size, bool mem_ok)
+uint malloc_stage(uint size, uint mem_size, uint stage)
 {
-    Alloc a;
-    a.offset = offset;
-    return a;
-}
-
-MallocResult malloc(uint size)
-{
-    uint _87;
-    _81.InterlockedAdd(0, size, _87);
-    uint offset = _87;
-    uint _94;
-    _81.GetDimensions(_94);
-    _94 = (_94 - 8) / 4;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int(_94) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _65;
+    _57.InterlockedAdd(0, size, _65);
+    uint offset = _65;
+    if ((offset + size) > mem_size)
     {
-        uint _116;
-        _81.InterlockedMax(4, 1u, _116);
-        return r;
+        uint _76;
+        _57.InterlockedOr(4, stage, _76);
+        offset = 0u;
     }
-    return r;
+    return offset;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -166,16 +134,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _81.Store(offset * 4 + 8, val);
-}
-
-void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = s.element_ix;
-    write_mem(param, param_1, param_2);
+    _57.Store(offset * 4 + 12, val);
 }
 
 void comp_main()
@@ -185,17 +144,12 @@ void comp_main()
     {
         bitmaps[i][gl_LocalInvocationID.x] = 0u;
     }
-    if (gl_LocalInvocationID.x == 0u)
-    {
-        sh_alloc_failed = false;
-    }
-    GroupMemoryBarrierWithGroupSync();
     uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    if (element_ix < _156.Load(0))
+    if (element_ix < _101.Load(4))
     {
         uint param = element_ix;
         DrawMonoid draw_monoid = load_draw_monoid(param);
@@ -212,11 +166,11 @@ void comp_main()
         float4 param_3 = path_bbox;
         float4 param_4 = clip_bbox;
         float4 bbox = bbox_intersect(param_3, param_4);
-        float4 _417 = bbox;
-        float4 _419 = bbox;
-        float2 _421 = max(_417.xy, _419.zw);
-        bbox.z = _421.x;
-        bbox.w = _421.y;
+        float4 _354 = bbox;
+        float4 _356 = bbox;
+        float2 _358 = max(_354.xy, _356.zw);
+        bbox.z = _358.x;
+        bbox.w = _358.y;
         uint param_5 = element_ix;
         float4 param_6 = bbox;
         store_draw_bbox(param_5, param_6);
@@ -225,8 +179,8 @@ void comp_main()
         x1 = int(ceil(bbox.z * 0.00390625f));
         y1 = int(ceil(bbox.w * 0.00390625f));
     }
-    uint width_in_bins = ((_156.Load(8) + 16u) - 1u) / 16u;
-    uint height_in_bins = ((_156.Load(12) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_101.Load(12) + 16u) - 1u) / 16u;
+    uint height_in_bins = ((_101.Load(16) + 16u) - 1u) / 16u;
     x0 = clamp(x0, 0, int(width_in_bins));
     x1 = clamp(x1, x0, int(width_in_bins));
     y0 = clamp(y0, 0, int(height_in_bins));
@@ -241,8 +195,8 @@ void comp_main()
     uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
     while (y < y1)
     {
-        uint _523;
-        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _523);
+        uint _460;
+        InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _460);
         x++;
         if (x == x1)
         {
@@ -257,51 +211,32 @@ void comp_main()
         element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x])));
         count[i_1][gl_LocalInvocationID.x] = element_count;
     }
-    uint param_7 = 0u;
-    uint param_8 = 0u;
-    bool param_9 = true;
-    Alloc chunk_alloc = new_alloc(param_7, param_8, param_9);
+    uint chunk_offset = 0u;
     if (element_count != 0u)
     {
-        uint param_10 = element_count * 4u;
-        MallocResult _573 = malloc(param_10);
-        MallocResult chunk = _573;
-        chunk_alloc = chunk.alloc;
-        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
-        if (chunk.failed)
-        {
-            sh_alloc_failed = true;
-        }
+        uint param_7 = element_count * 4u;
+        uint param_8 = _101.Load(0);
+        uint param_9 = 1u;
+        uint _510 = malloc_stage(param_7, param_8, param_9);
+        chunk_offset = _510;
+        sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
     }
-    uint out_ix = (_156.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc _603;
-    _603.offset = _156.Load(20);
-    Alloc param_11;
-    param_11.offset = _603.offset;
-    uint param_12 = out_ix;
-    uint param_13 = element_count;
-    write_mem(param_11, param_12, param_13);
-    Alloc _615;
-    _615.offset = _156.Load(20);
-    Alloc param_14;
-    param_14.offset = _615.offset;
-    uint param_15 = out_ix + 1u;
-    uint param_16 = chunk_alloc.offset;
-    write_mem(param_14, param_15, param_16);
+    uint out_ix = (_101.Load(24) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc _532;
+    _532.offset = _101.Load(24);
+    Alloc param_10;
+    param_10.offset = _532.offset;
+    uint param_11 = out_ix;
+    uint param_12 = element_count;
+    write_mem(param_10, param_11, param_12);
+    Alloc _544;
+    _544.offset = _101.Load(24);
+    Alloc param_13;
+    param_13.offset = _544.offset;
+    uint param_14 = out_ix + 1u;
+    uint param_15 = chunk_offset;
+    write_mem(param_13, param_14, param_15);
     GroupMemoryBarrierWithGroupSync();
-    bool _630;
-    if (!sh_alloc_failed)
-    {
-        _630 = _81.Load(4) != 0u;
-    }
-    else
-    {
-        _630 = sh_alloc_failed;
-    }
-    if (_630)
-    {
-        return;
-    }
     x = x0;
     y = y0;
     while (y < y1)
@@ -315,14 +250,11 @@ void comp_main()
             {
                 idx += count[my_slice - 1u][bin_ix];
             }
-            Alloc out_alloc = sh_chunk_alloc[bin_ix];
-            uint out_offset = out_alloc.offset + (idx * 4u);
-            BinInstanceRef _692 = { out_offset };
-            BinInstance _694 = { element_ix };
-            Alloc param_17 = out_alloc;
-            BinInstanceRef param_18 = _692;
-            BinInstance param_19 = _694;
-            BinInstance_write(param_17, param_18, param_19);
+            uint chunk_offset_1 = sh_chunk_offset[bin_ix];
+            if (chunk_offset_1 != 0u)
+            {
+                _57.Store(((chunk_offset_1 >> uint(2)) + idx) * 4 + 12, element_ix);
+            }
         }
         x++;
         if (x == x1)
diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl
index 2ee5168..d3ef95c 100644
--- a/piet-gpu/shader/gen/binning.msl
+++ b/piet-gpu/shader/gen/binning.msl
@@ -12,22 +12,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
-struct BinInstanceRef
-{
-    uint offset;
-};
-
-struct BinInstance
-{
-    uint element_ix;
-};
-
 struct DrawMonoid
 {
     uint path_ix;
@@ -40,6 +24,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -50,6 +35,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -87,36 +73,36 @@ struct ConfigBuf
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
+DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_57, const device ConfigBuf& v_101)
 {
-    uint base = (v_156.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix);
-    uint path_ix = v_81.memory[base];
-    uint clip_ix = v_81.memory[base + 1u];
-    uint scene_offset = v_81.memory[base + 2u];
-    uint info_offset = v_81.memory[base + 3u];
+    uint base = (v_101.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix);
+    uint path_ix = v_57.memory[base];
+    uint clip_ix = v_57.memory[base + 1u];
+    uint scene_offset = v_57.memory[base + 2u];
+    uint info_offset = v_57.memory[base + 3u];
     return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset };
 }
 
 static inline __attribute__((always_inline))
-float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
+float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_57, const device ConfigBuf& v_101)
 {
-    uint base = (v_156.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
-    float x0 = as_type<float>(v_81.memory[base]);
-    float y0 = as_type<float>(v_81.memory[base + 1u]);
-    float x1 = as_type<float>(v_81.memory[base + 2u]);
-    float y1 = as_type<float>(v_81.memory[base + 3u]);
+    uint base = (v_101.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
+    float x0 = as_type<float>(v_57.memory[base]);
+    float y0 = as_type<float>(v_57.memory[base + 1u]);
+    float x1 = as_type<float>(v_57.memory[base + 2u]);
+    float y1 = as_type<float>(v_57.memory[base + 3u]);
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
 static inline __attribute__((always_inline))
-float4 load_path_bbox(thread const uint& path_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
+float4 load_path_bbox(thread const uint& path_ix, device Memory& v_57, const device ConfigBuf& v_101)
 {
-    uint base = (v_156.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(v_81.memory[base]) - 32768.0;
-    float bbox_t = float(v_81.memory[base + 1u]) - 32768.0;
-    float bbox_r = float(v_81.memory[base + 2u]) - 32768.0;
-    float bbox_b = float(v_81.memory[base + 3u]) - 32768.0;
+    uint base = (v_101.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(v_57.memory[base]) - 32768.0;
+    float bbox_t = float(v_57.memory[base + 1u]) - 32768.0;
+    float bbox_r = float(v_57.memory[base + 2u]) - 32768.0;
+    float bbox_b = float(v_57.memory[base + 3u]) - 32768.0;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
@@ -128,40 +114,26 @@ float4 bbox_intersect(thread const float4& a, thread const float4& b)
 }
 
 static inline __attribute__((always_inline))
-void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
+void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_57, const device ConfigBuf& v_101)
 {
-    uint base = (v_156.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
-    v_81.memory[base] = as_type<uint>(bbox.x);
-    v_81.memory[base + 1u] = as_type<uint>(bbox.y);
-    v_81.memory[base + 2u] = as_type<uint>(bbox.z);
-    v_81.memory[base + 3u] = as_type<uint>(bbox.w);
+    uint base = (v_101.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
+    v_57.memory[base] = as_type<uint>(bbox.x);
+    v_57.memory[base + 1u] = as_type<uint>(bbox.y);
+    v_57.memory[base + 2u] = as_type<uint>(bbox.z);
+    v_57.memory[base + 3u] = as_type<uint>(bbox.w);
 }
 
 static inline __attribute__((always_inline))
-Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_57)
 {
-    Alloc a;
-    a.offset = offset;
-    return a;
-}
-
-static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_81, constant uint& v_81BufferSize)
-{
-    uint _87 = atomic_fetch_add_explicit((device atomic_uint*)&v_81.mem_offset, size, memory_order_relaxed);
-    uint offset = _87;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int((v_81BufferSize - 8) / 4) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _65 = atomic_fetch_add_explicit((device atomic_uint*)&v_57.mem_offset, size, memory_order_relaxed);
+    uint offset = _65;
+    if ((offset + size) > mem_size)
     {
-        uint _116 = atomic_fetch_max_explicit((device atomic_uint*)&v_81.mem_error, 1u, memory_order_relaxed);
-        return r;
+        uint _76 = atomic_fetch_or_explicit((device atomic_uint*)&v_57.mem_error, stage, memory_order_relaxed);
+        offset = 0u;
     }
-    return r;
+    return offset;
 }
 
 static inline __attribute__((always_inline))
@@ -171,7 +143,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_81, constant uint& v_81BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_57)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -179,73 +151,56 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_81.memory[offset] = val;
+    v_57.memory[offset] = val;
 }
 
-static inline __attribute__((always_inline))
-void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_81, constant uint& v_81BufferSize)
-{
-    uint ix = ref.offset >> uint(2);
-    Alloc param = a;
-    uint param_1 = ix + 0u;
-    uint param_2 = s.element_ix;
-    write_mem(param, param_1, param_2, v_81, v_81BufferSize);
-}
-
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_81 [[buffer(0)]], const device ConfigBuf& v_156 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_57 [[buffer(0)]], const device ConfigBuf& v_101 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint bitmaps[8][256];
-    threadgroup short sh_alloc_failed;
     threadgroup uint count[8][256];
-    threadgroup Alloc sh_chunk_alloc[256];
-    constant uint& v_81BufferSize = spvBufferSizeConstants[0];
+    threadgroup uint sh_chunk_offset[256];
     uint my_partition = gl_WorkGroupID.x;
     for (uint i = 0u; i < 8u; i++)
     {
         bitmaps[i][gl_LocalInvocationID.x] = 0u;
     }
-    if (gl_LocalInvocationID.x == 0u)
-    {
-        sh_alloc_failed = short(false);
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
     uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
     int x0 = 0;
     int y0 = 0;
     int x1 = 0;
     int y1 = 0;
-    if (element_ix < v_156.conf.n_elements)
+    if (element_ix < v_101.conf.n_elements)
     {
         uint param = element_ix;
-        DrawMonoid draw_monoid = load_draw_monoid(param, v_81, v_81BufferSize, v_156);
+        DrawMonoid draw_monoid = load_draw_monoid(param, v_57, v_101);
         uint path_ix = draw_monoid.path_ix;
         float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
         uint clip_ix = draw_monoid.clip_ix;
         if (clip_ix > 0u)
         {
             uint param_1 = clip_ix - 1u;
-            clip_bbox = load_clip_bbox(param_1, v_81, v_81BufferSize, v_156);
+            clip_bbox = load_clip_bbox(param_1, v_57, v_101);
         }
         uint param_2 = path_ix;
-        float4 path_bbox = load_path_bbox(param_2, v_81, v_81BufferSize, v_156);
+        float4 path_bbox = load_path_bbox(param_2, v_57, v_101);
         float4 param_3 = path_bbox;
         float4 param_4 = clip_bbox;
         float4 bbox = bbox_intersect(param_3, param_4);
-        float4 _417 = bbox;
-        float4 _419 = bbox;
-        float2 _421 = fast::max(_417.xy, _419.zw);
-        bbox.z = _421.x;
-        bbox.w = _421.y;
+        float4 _354 = bbox;
+        float4 _356 = bbox;
+        float2 _358 = fast::max(_354.xy, _356.zw);
+        bbox.z = _358.x;
+        bbox.w = _358.y;
         uint param_5 = element_ix;
         float4 param_6 = bbox;
-        store_draw_bbox(param_5, param_6, v_81, v_81BufferSize, v_156);
+        store_draw_bbox(param_5, param_6, v_57, v_101);
         x0 = int(floor(bbox.x * 0.00390625));
         y0 = int(floor(bbox.y * 0.00390625));
         x1 = int(ceil(bbox.z * 0.00390625));
         y1 = int(ceil(bbox.w * 0.00390625));
     }
-    uint width_in_bins = ((v_156.conf.width_in_tiles + 16u) - 1u) / 16u;
-    uint height_in_bins = ((v_156.conf.height_in_tiles + 16u) - 1u) / 16u;
+    uint width_in_bins = ((v_101.conf.width_in_tiles + 16u) - 1u) / 16u;
+    uint height_in_bins = ((v_101.conf.height_in_tiles + 16u) - 1u) / 16u;
     x0 = clamp(x0, 0, int(width_in_bins));
     x1 = clamp(x1, x0, int(width_in_bins));
     y0 = clamp(y0, 0, int(height_in_bins));
@@ -260,7 +215,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
     while (y < y1)
     {
-        uint _523 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
+        uint _460 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
         x++;
         if (x == x1)
         {
@@ -275,47 +230,28 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
         count[i_1][gl_LocalInvocationID.x] = element_count;
     }
-    uint param_7 = 0u;
-    uint param_8 = 0u;
-    bool param_9 = true;
-    Alloc chunk_alloc = new_alloc(param_7, param_8, param_9);
+    uint chunk_offset = 0u;
     if (element_count != 0u)
     {
-        uint param_10 = element_count * 4u;
-        MallocResult _573 = malloc(param_10, v_81, v_81BufferSize);
-        MallocResult chunk = _573;
-        chunk_alloc = chunk.alloc;
-        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
-        if (chunk.failed)
-        {
-            sh_alloc_failed = short(true);
-        }
+        uint param_7 = element_count * 4u;
+        uint param_8 = v_101.conf.mem_size;
+        uint param_9 = 1u;
+        uint _510 = malloc_stage(param_7, param_8, param_9, v_57);
+        chunk_offset = _510;
+        sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
     }
-    uint out_ix = (v_156.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
-    Alloc param_11;
-    param_11.offset = v_156.conf.bin_alloc.offset;
-    uint param_12 = out_ix;
-    uint param_13 = element_count;
-    write_mem(param_11, param_12, param_13, v_81, v_81BufferSize);
-    Alloc param_14;
-    param_14.offset = v_156.conf.bin_alloc.offset;
-    uint param_15 = out_ix + 1u;
-    uint param_16 = chunk_alloc.offset;
-    write_mem(param_14, param_15, param_16, v_81, v_81BufferSize);
+    uint out_ix = (v_101.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
+    Alloc param_10;
+    param_10.offset = v_101.conf.bin_alloc.offset;
+    uint param_11 = out_ix;
+    uint param_12 = element_count;
+    write_mem(param_10, param_11, param_12, v_57);
+    Alloc param_13;
+    param_13.offset = v_101.conf.bin_alloc.offset;
+    uint param_14 = out_ix + 1u;
+    uint param_15 = chunk_offset;
+    write_mem(param_13, param_14, param_15, v_57);
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    bool _630;
-    if (!bool(sh_alloc_failed))
-    {
-        _630 = v_81.mem_error != 0u;
-    }
-    else
-    {
-        _630 = bool(sh_alloc_failed);
-    }
-    if (_630)
-    {
-        return;
-    }
     x = x0;
     y = y0;
     while (y < y1)
@@ -329,12 +265,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             {
                 idx += count[my_slice - 1u][bin_ix];
             }
-            Alloc out_alloc = sh_chunk_alloc[bin_ix];
-            uint out_offset = out_alloc.offset + (idx * 4u);
-            Alloc param_17 = out_alloc;
-            BinInstanceRef param_18 = BinInstanceRef{ out_offset };
-            BinInstance param_19 = BinInstance{ element_ix };
-            BinInstance_write(param_17, param_18, param_19, v_81, v_81BufferSize);
+            uint chunk_offset_1 = sh_chunk_offset[bin_ix];
+            if (chunk_offset_1 != 0u)
+            {
+                v_57.memory[(chunk_offset_1 >> uint(2)) + idx] = element_ix;
+            }
         }
         x++;
         if (x == x1)
diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv
index 30eacd6..1a5c2e1 100644
Binary files a/piet-gpu/shader/gen/binning.spv and b/piet-gpu/shader/gen/binning.spv differ
diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil
index 29a158e..d5123cb 100644
Binary files a/piet-gpu/shader/gen/clip_leaf.dxil and b/piet-gpu/shader/gen/clip_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl
index ed45bf1..4eb9994 100644
--- a/piet-gpu/shader/gen/clip_leaf.hlsl
+++ b/piet-gpu/shader/gen/clip_leaf.hlsl
@@ -17,6 +17,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -48,7 +49,7 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const Bic _393 = { 0u, 0u };
+static const Bic _394 = { 0u, 0u };
 
 ByteAddressBuffer _80 : register(t1, space0);
 RWByteAddressBuffer _96 : register(u0, space0);
@@ -71,9 +72,9 @@ groupshared float4 sh_bbox[256];
 
 Bic load_bic(uint ix)
 {
-    uint base = (_80.Load(52) >> uint(2)) + (2u * ix);
-    Bic _286 = { _96.Load(base * 4 + 8), _96.Load((base + 1u) * 4 + 8) };
-    return _286;
+    uint base = (_80.Load(56) >> uint(2)) + (2u * ix);
+    Bic _287 = { _96.Load(base * 4 + 12), _96.Load((base + 1u) * 4 + 12) };
+    return _287;
 }
 
 Bic bic_combine(Bic x, Bic y)
@@ -85,15 +86,15 @@ Bic bic_combine(Bic x, Bic y)
 
 ClipEl load_clip_el(uint ix)
 {
-    uint base = (_80.Load(56) >> uint(2)) + (5u * ix);
-    uint parent_ix = _96.Load(base * 4 + 8);
-    float x0 = asfloat(_96.Load((base + 1u) * 4 + 8));
-    float y0 = asfloat(_96.Load((base + 2u) * 4 + 8));
-    float x1 = asfloat(_96.Load((base + 3u) * 4 + 8));
-    float y1 = asfloat(_96.Load((base + 4u) * 4 + 8));
+    uint base = (_80.Load(60) >> uint(2)) + (5u * ix);
+    uint parent_ix = _96.Load(base * 4 + 12);
+    float x0 = asfloat(_96.Load((base + 1u) * 4 + 12));
+    float y0 = asfloat(_96.Load((base + 2u) * 4 + 12));
+    float x1 = asfloat(_96.Load((base + 3u) * 4 + 12));
+    float y1 = asfloat(_96.Load((base + 4u) * 4 + 12));
     float4 bbox = float4(x0, y0, x1, y1);
-    ClipEl _335 = { parent_ix, bbox };
-    return _335;
+    ClipEl _336 = { parent_ix, bbox };
+    return _336;
 }
 
 float4 bbox_intersect(float4 a, float4 b)
@@ -103,9 +104,9 @@ float4 bbox_intersect(float4 a, float4 b)
 
 uint load_path_ix(uint ix)
 {
-    if (ix < _80.Load(80))
+    if (ix < _80.Load(84))
     {
-        return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8);
+        return _96.Load(((_80.Load(52) >> uint(2)) + ix) * 4 + 12);
     }
     else
     {
@@ -115,11 +116,11 @@ uint load_path_ix(uint ix)
 
 float4 load_path_bbox(uint path_ix)
 {
-    uint base = (_80.Load(40) >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(_96.Load(base * 4 + 8)) - 32768.0f;
-    float bbox_t = float(_96.Load((base + 1u) * 4 + 8)) - 32768.0f;
-    float bbox_r = float(_96.Load((base + 2u) * 4 + 8)) - 32768.0f;
-    float bbox_b = float(_96.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    uint base = (_80.Load(44) >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(_96.Load(base * 4 + 12)) - 32768.0f;
+    float bbox_t = float(_96.Load((base + 1u) * 4 + 12)) - 32768.0f;
+    float bbox_r = float(_96.Load((base + 2u) * 4 + 12)) - 32768.0f;
+    float bbox_b = float(_96.Load((base + 3u) * 4 + 12)) - 32768.0f;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
@@ -173,17 +174,17 @@ uint search_link(inout Bic bic)
 
 void store_clip_bbox(uint ix, float4 bbox)
 {
-    uint base = (_80.Load(60) >> uint(2)) + (4u * ix);
-    _96.Store(base * 4 + 8, asuint(bbox.x));
-    _96.Store((base + 1u) * 4 + 8, asuint(bbox.y));
-    _96.Store((base + 2u) * 4 + 8, asuint(bbox.z));
-    _96.Store((base + 3u) * 4 + 8, asuint(bbox.w));
+    uint base = (_80.Load(64) >> uint(2)) + (4u * ix);
+    _96.Store(base * 4 + 12, asuint(bbox.x));
+    _96.Store((base + 1u) * 4 + 12, asuint(bbox.y));
+    _96.Store((base + 2u) * 4 + 12, asuint(bbox.z));
+    _96.Store((base + 3u) * 4 + 12, asuint(bbox.w));
 }
 
 void comp_main()
 {
     uint th = gl_LocalInvocationID.x;
-    Bic bic = _393;
+    Bic bic = _394;
     if (th < gl_WorkGroupID.x)
     {
         uint param = th;
@@ -240,8 +241,8 @@ void comp_main()
     uint param_6 = gl_GlobalInvocationID.x;
     uint inp = load_path_ix(param_6);
     bool is_push = int(inp) >= 0;
-    Bic _559 = { 1u - uint(is_push), uint(is_push) };
-    bic = _559;
+    Bic _560 = { 1u - uint(is_push), uint(is_push) };
+    bic = _560;
     sh_bic[th] = bic;
     if (is_push)
     {
@@ -266,11 +267,11 @@ void comp_main()
         inbase = outbase;
     }
     GroupMemoryBarrierWithGroupSync();
-    bic = _393;
+    bic = _394;
     Bic param_10 = bic;
-    uint _618 = search_link(param_10);
+    uint _619 = search_link(param_10);
     bic = param_10;
-    uint link = _618;
+    uint link = _619;
     sh_link[th] = link;
     GroupMemoryBarrierWithGroupSync();
     uint grandparent;
@@ -324,22 +325,22 @@ void comp_main()
     sh_bbox[th] = bbox;
     GroupMemoryBarrierWithGroupSync();
     uint path_ix = inp;
-    bool _717 = !is_push;
-    bool _725;
-    if (_717)
+    bool _718 = !is_push;
+    bool _726;
+    if (_718)
     {
-        _725 = gl_GlobalInvocationID.x < _80.Load(80);
+        _726 = gl_GlobalInvocationID.x < _80.Load(84);
     }
     else
     {
-        _725 = _717;
+        _726 = _718;
     }
-    if (_725)
+    if (_726)
     {
         uint param_15 = parent;
         path_ix = load_path_ix(param_15);
-        uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp));
-        _96.Store(drawmonoid_out_base * 4 + 8, path_ix);
+        uint drawmonoid_out_base = (_80.Load(48) >> uint(2)) + (4u * (~inp));
+        _96.Store(drawmonoid_out_base * 4 + 12, path_ix);
         if (int(grandparent) >= 0)
         {
             bbox = sh_bbox[grandparent];
diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl
index 5f5e0a7..c9456e8 100644
--- a/piet-gpu/shader/gen/clip_leaf.msl
+++ b/piet-gpu/shader/gen/clip_leaf.msl
@@ -24,6 +24,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -62,6 +63,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -275,9 +277,9 @@ kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_8
     threadgroup_barrier(mem_flags::mem_threadgroup);
     bic = Bic{ 0u, 0u };
     Bic param_10 = bic;
-    uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic);
+    uint _619 = search_link(param_10, gl_LocalInvocationID, sh_bic);
     bic = param_10;
-    uint link = _618;
+    uint link = _619;
     sh_link[th] = link;
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint grandparent;
@@ -331,17 +333,17 @@ kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_8
     sh_bbox[th] = bbox;
     threadgroup_barrier(mem_flags::mem_threadgroup);
     uint path_ix = inp;
-    bool _717 = !is_push;
-    bool _725;
-    if (_717)
+    bool _718 = !is_push;
+    bool _726;
+    if (_718)
     {
-        _725 = gl_GlobalInvocationID.x < v_80.conf.n_clip;
+        _726 = gl_GlobalInvocationID.x < v_80.conf.n_clip;
     }
     else
     {
-        _725 = _717;
+        _726 = _718;
     }
-    if (_725)
+    if (_726)
     {
         uint param_15 = parent;
         path_ix = load_path_ix(param_15, v_80, v_96);
diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv
index beac64b..fe62632 100644
Binary files a/piet-gpu/shader/gen/clip_leaf.spv and b/piet-gpu/shader/gen/clip_leaf.spv differ
diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil
index 0dff71b..13ffb01 100644
Binary files a/piet-gpu/shader/gen/clip_reduce.dxil and b/piet-gpu/shader/gen/clip_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/clip_reduce.hlsl b/piet-gpu/shader/gen/clip_reduce.hlsl
index 1276b5f..e031f84 100644
--- a/piet-gpu/shader/gen/clip_reduce.hlsl
+++ b/piet-gpu/shader/gen/clip_reduce.hlsl
@@ -17,6 +17,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -48,7 +49,7 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const Bic _267 = { 0u, 0u };
+static const Bic _268 = { 0u, 0u };
 
 ByteAddressBuffer _64 : register(t1, space0);
 RWByteAddressBuffer _80 : register(u0, space0);
@@ -77,39 +78,39 @@ Bic bic_combine(Bic x, Bic y)
 
 void store_bic(uint ix, Bic bic)
 {
-    uint base = (_64.Load(52) >> uint(2)) + (2u * ix);
-    _80.Store(base * 4 + 8, bic.a);
-    _80.Store((base + 1u) * 4 + 8, bic.b);
+    uint base = (_64.Load(56) >> uint(2)) + (2u * ix);
+    _80.Store(base * 4 + 12, bic.a);
+    _80.Store((base + 1u) * 4 + 12, bic.b);
 }
 
 float4 load_path_bbox(uint path_ix)
 {
-    uint base = (_64.Load(40) >> uint(2)) + (6u * path_ix);
-    float bbox_l = float(_80.Load(base * 4 + 8)) - 32768.0f;
-    float bbox_t = float(_80.Load((base + 1u) * 4 + 8)) - 32768.0f;
-    float bbox_r = float(_80.Load((base + 2u) * 4 + 8)) - 32768.0f;
-    float bbox_b = float(_80.Load((base + 3u) * 4 + 8)) - 32768.0f;
+    uint base = (_64.Load(44) >> uint(2)) + (6u * path_ix);
+    float bbox_l = float(_80.Load(base * 4 + 12)) - 32768.0f;
+    float bbox_t = float(_80.Load((base + 1u) * 4 + 12)) - 32768.0f;
+    float bbox_r = float(_80.Load((base + 2u) * 4 + 12)) - 32768.0f;
+    float bbox_b = float(_80.Load((base + 3u) * 4 + 12)) - 32768.0f;
     float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
     return bbox;
 }
 
 void store_clip_el(uint ix, ClipEl el)
 {
-    uint base = (_64.Load(56) >> uint(2)) + (5u * ix);
-    _80.Store(base * 4 + 8, el.parent_ix);
-    _80.Store((base + 1u) * 4 + 8, asuint(el.bbox.x));
-    _80.Store((base + 2u) * 4 + 8, asuint(el.bbox.y));
-    _80.Store((base + 3u) * 4 + 8, asuint(el.bbox.z));
-    _80.Store((base + 4u) * 4 + 8, asuint(el.bbox.w));
+    uint base = (_64.Load(60) >> uint(2)) + (5u * ix);
+    _80.Store(base * 4 + 12, el.parent_ix);
+    _80.Store((base + 1u) * 4 + 12, asuint(el.bbox.x));
+    _80.Store((base + 2u) * 4 + 12, asuint(el.bbox.y));
+    _80.Store((base + 3u) * 4 + 12, asuint(el.bbox.z));
+    _80.Store((base + 4u) * 4 + 12, asuint(el.bbox.w));
 }
 
 void comp_main()
 {
     uint th = gl_LocalInvocationID.x;
-    uint inp = _80.Load(((_64.Load(48) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 8);
+    uint inp = _80.Load(((_64.Load(52) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 12);
     bool is_push = int(inp) >= 0;
-    Bic _207 = { 1u - uint(is_push), uint(is_push) };
-    Bic bic = _207;
+    Bic _208 = { 1u - uint(is_push), uint(is_push) };
+    Bic bic = _208;
     sh_bic[gl_LocalInvocationID.x] = bic;
     for (uint i = 0u; i < 8u; i++)
     {
@@ -132,21 +133,21 @@ void comp_main()
     }
     GroupMemoryBarrierWithGroupSync();
     uint size = sh_bic[0].b;
-    bic = _267;
+    bic = _268;
     if ((th + 1u) < 256u)
     {
         bic = sh_bic[th + 1u];
     }
-    bool _283;
+    bool _284;
     if (is_push)
     {
-        _283 = bic.a == 0u;
+        _284 = bic.a == 0u;
     }
     else
     {
-        _283 = is_push;
+        _284 = is_push;
     }
-    if (_283)
+    if (_284)
     {
         uint local_ix = (size - bic.b) - 1u;
         sh_parent[local_ix] = th;
@@ -163,8 +164,8 @@ void comp_main()
     if (th < size)
     {
         uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u);
-        ClipEl _331 = { parent_ix, bbox };
-        ClipEl el = _331;
+        ClipEl _332 = { parent_ix, bbox };
+        ClipEl el = _332;
         uint param_5 = gl_GlobalInvocationID.x;
         ClipEl param_6 = el;
         store_clip_el(param_5, param_6);
diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl
index 26214f1..dd34e64 100644
--- a/piet-gpu/shader/gen/clip_reduce.msl
+++ b/piet-gpu/shader/gen/clip_reduce.msl
@@ -24,6 +24,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -62,6 +63,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -142,16 +144,16 @@ kernel void main0(device Memory& v_80 [[buffer(0)]], const device ConfigBuf& v_6
     {
         bic = sh_bic[th + 1u];
     }
-    bool _283;
+    bool _284;
     if (is_push)
     {
-        _283 = bic.a == 0u;
+        _284 = bic.a == 0u;
     }
     else
     {
-        _283 = is_push;
+        _284 = is_push;
     }
-    if (_283)
+    if (_284)
     {
         uint local_ix = (size - bic.b) - 1u;
         sh_parent[local_ix] = th;
diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv
index ce0b9bb..40121e7 100644
Binary files a/piet-gpu/shader/gen/clip_reduce.spv and b/piet-gpu/shader/gen/clip_reduce.spv differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index f71cc04..58e2da8 100644
Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index a7f769f..673e879 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -3,12 +3,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -144,6 +138,7 @@ struct CmdRef
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -175,9 +170,9 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _266 : register(u0, space0);
-ByteAddressBuffer _1020 : register(t1, space0);
-ByteAddressBuffer _1399 : register(t2, space0);
+RWByteAddressBuffer _267 : register(u0, space0);
+ByteAddressBuffer _891 : register(t1, space0);
+ByteAddressBuffer _1390 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -187,6 +182,7 @@ struct SPIRV_Cross_Input
     uint3 gl_LocalInvocationID : SV_GroupThreadID;
 };
 
+static bool mem_ok;
 groupshared uint sh_bitmaps[8][256];
 groupshared Alloc sh_part_elements[256];
 groupshared uint sh_part_count[256];
@@ -198,10 +194,17 @@ groupshared uint sh_tile_y0[256];
 groupshared uint sh_tile_base[256];
 groupshared uint sh_tile_count[256];
 
+bool check_deps(uint dep_stage)
+{
+    uint _273;
+    _267.InterlockedOr(4, 0u, _273);
+    return (_273 & dep_stage) == 0u;
+}
+
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _343 = { a.offset + offset };
-    return _343;
+    Alloc _331 = { a.offset + offset };
+    return _331;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -217,11 +220,11 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _266.Load(offset * 4 + 8);
+    uint v = _267.Load(offset * 4 + 12);
     return v;
 }
 
-Alloc new_alloc(uint offset, uint size, bool mem_ok)
+Alloc new_alloc(uint offset, uint size, bool mem_ok_1)
 {
     Alloc a;
     a.offset = offset;
@@ -230,8 +233,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok)
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _361 = { ref.offset + (index * 4u) };
-    return _361;
+    BinInstanceRef _340 = { ref.offset + (index * 4u) };
+    return _340;
 }
 
 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@@ -259,8 +262,8 @@ Path Path_read(Alloc a, PathRef ref)
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _424 = { raw2 };
-    s.tiles = _424;
+    TileRef _404 = { raw2 };
+    s.tiles = _404;
     return s;
 }
 
@@ -268,14 +271,11 @@ void write_tile_alloc(uint el_ix, Alloc a)
 {
 }
 
-Alloc read_tile_alloc(uint el_ix, bool mem_ok)
+Alloc read_tile_alloc(uint el_ix, bool mem_ok_1)
 {
-    uint _907;
-    _266.GetDimensions(_907);
-    _907 = (_907 - 8) / 4;
     uint param = 0u;
-    uint param_1 = uint(int(_907) * 4);
-    bool param_2 = mem_ok;
+    uint param_1 = _891.Load(0);
+    bool param_2 = mem_ok_1;
     return new_alloc(param, param_1, param_2);
 }
 
@@ -288,34 +288,25 @@ Tile Tile_read(Alloc a, TileRef ref)
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
     uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _449 = { raw0 };
+    TileSegRef _429 = { raw0 };
     Tile s;
-    s.tile = _449;
+    s.tile = _429;
     s.backdrop = int(raw1);
     return s;
 }
 
-MallocResult malloc(uint size)
+uint malloc_stage(uint size, uint mem_size, uint stage)
 {
-    uint _272;
-    _266.InterlockedAdd(0, size, _272);
-    uint offset = _272;
-    uint _279;
-    _266.GetDimensions(_279);
-    _279 = (_279 - 8) / 4;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int(_279) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _282;
+    _267.InterlockedAdd(0, size, _282);
+    uint offset = _282;
+    if ((offset + size) > mem_size)
     {
-        uint _301;
-        _266.InterlockedMax(4, 1u, _301);
-        return r;
+        uint _292;
+        _267.InterlockedOr(4, stage, _292);
+        offset = 0u;
     }
-    return r;
+    return offset;
 }
 
 void write_mem(Alloc alloc, uint offset, uint val)
@@ -326,7 +317,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _266.Store(offset * 4 + 8, val);
+    _267.Store(offset * 4 + 12, val);
 }
 
 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@@ -344,37 +335,44 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 11u;
     write_mem(param, param_1, param_2);
-    CmdJumpRef _900 = { ref.offset + 4u };
+    CmdJumpRef _880 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdJumpRef param_4 = _900;
+    CmdJumpRef param_4 = _880;
     CmdJump param_5 = s;
     CmdJump_write(param_3, param_4, param_5);
 }
 
-bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit)
+void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit)
 {
     if (cmd_ref.offset < cmd_limit)
     {
-        return true;
+        return;
     }
     uint param = 1024u;
-    MallocResult _928 = malloc(param);
-    MallocResult new_cmd = _928;
-    if (new_cmd.failed)
+    uint param_1 = _891.Load(0);
+    uint param_2 = 8u;
+    uint _915 = malloc_stage(param, param_1, param_2);
+    uint new_cmd = _915;
+    if (new_cmd == 0u)
     {
-        return false;
+        mem_ok = false;
     }
-    CmdJump _938 = { new_cmd.alloc.offset };
-    CmdJump jump = _938;
-    Alloc param_1 = cmd_alloc;
-    CmdRef param_2 = cmd_ref;
-    CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3);
-    cmd_alloc = new_cmd.alloc;
-    CmdRef _950 = { cmd_alloc.offset };
-    cmd_ref = _950;
-    cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
-    return true;
+    if (mem_ok)
+    {
+        CmdJump _926 = { new_cmd };
+        CmdJump jump = _926;
+        Alloc param_3 = cmd_alloc;
+        CmdRef param_4 = cmd_ref;
+        CmdJump param_5 = jump;
+        Cmd_Jump_write(param_3, param_4, param_5);
+    }
+    uint param_6 = new_cmd;
+    uint param_7 = 1024u;
+    bool param_8 = true;
+    cmd_alloc = new_alloc(param_6, param_7, param_8);
+    CmdRef _940 = { new_cmd };
+    cmd_ref = _940;
+    cmd_limit = (new_cmd + 1024u) - 144u;
 }
 
 void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s)
@@ -396,9 +394,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
     write_mem(param, param_1, param_2);
-    CmdFillRef _757 = { ref.offset + 4u };
+    CmdFillRef _737 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdFillRef param_4 = _757;
+    CmdFillRef param_4 = _737;
     CmdFill param_5 = s;
     CmdFill_write(param_3, param_4, param_5);
 }
@@ -430,9 +428,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
     write_mem(param, param_1, param_2);
-    CmdStrokeRef _775 = { ref.offset + 4u };
+    CmdStrokeRef _755 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdStrokeRef param_4 = _775;
+    CmdStrokeRef param_4 = _755;
     CmdStroke param_5 = s;
     CmdStroke_write(param_3, param_4, param_5);
 }
@@ -443,30 +441,39 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
     {
         if (tile.tile.offset != 0u)
         {
-            CmdFill _973 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _973;
-            Alloc param = alloc;
-            CmdRef param_1 = cmd_ref;
-            CmdFill param_2 = cmd_fill;
-            Cmd_Fill_write(param, param_1, param_2);
+            CmdFill _960 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _960;
+            if (mem_ok)
+            {
+                Alloc param = alloc;
+                CmdRef param_1 = cmd_ref;
+                CmdFill param_2 = cmd_fill;
+                Cmd_Fill_write(param, param_1, param_2);
+            }
             cmd_ref.offset += 12u;
         }
         else
         {
-            Alloc param_3 = alloc;
-            CmdRef param_4 = cmd_ref;
-            Cmd_Solid_write(param_3, param_4);
+            if (mem_ok)
+            {
+                Alloc param_3 = alloc;
+                CmdRef param_4 = cmd_ref;
+                Cmd_Solid_write(param_3, param_4);
+            }
             cmd_ref.offset += 4u;
         }
     }
     else
     {
-        CmdStroke _1003 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _1003;
-        Alloc param_5 = alloc;
-        CmdRef param_6 = cmd_ref;
-        CmdStroke param_7 = cmd_stroke;
-        Cmd_Stroke_write(param_5, param_6, param_7);
+        CmdStroke _996 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _996;
+        if (mem_ok)
+        {
+            Alloc param_5 = alloc;
+            CmdRef param_6 = cmd_ref;
+            CmdStroke param_7 = cmd_stroke;
+            Cmd_Stroke_write(param_5, param_6, param_7);
+        }
         cmd_ref.offset += 12u;
     }
 }
@@ -486,9 +493,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
     write_mem(param, param_1, param_2);
-    CmdColorRef _801 = { ref.offset + 4u };
+    CmdColorRef _781 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdColorRef param_4 = _801;
+    CmdColorRef param_4 = _781;
     CmdColor param_5 = s;
     CmdColor_write(param_3, param_4, param_5);
 }
@@ -520,9 +527,9 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
     write_mem(param, param_1, param_2);
-    CmdLinGradRef _819 = { ref.offset + 4u };
+    CmdLinGradRef _799 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdLinGradRef param_4 = _819;
+    CmdLinGradRef param_4 = _799;
     CmdLinGrad param_5 = s;
     CmdLinGrad_write(param_3, param_4, param_5);
 }
@@ -582,9 +589,9 @@ void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
     write_mem(param, param_1, param_2);
-    CmdRadGradRef _837 = { ref.offset + 4u };
+    CmdRadGradRef _817 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdRadGradRef param_4 = _837;
+    CmdRadGradRef param_4 = _817;
     CmdRadGrad param_5 = s;
     CmdRadGrad_write(param_3, param_4, param_5);
 }
@@ -608,9 +615,9 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
     write_mem(param, param_1, param_2);
-    CmdImageRef _855 = { ref.offset + 4u };
+    CmdImageRef _835 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdImageRef param_4 = _855;
+    CmdImageRef param_4 = _835;
     CmdImage param_5 = s;
     CmdImage_write(param_3, param_4, param_5);
 }
@@ -638,9 +645,9 @@ void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
     write_mem(param, param_1, param_2);
-    CmdEndClipRef _881 = { ref.offset + 4u };
+    CmdEndClipRef _861 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdEndClipRef param_4 = _881;
+    CmdEndClipRef param_4 = _861;
     CmdEndClip param_5 = s;
     CmdEndClip_write(param_3, param_4, param_5);
 }
@@ -653,35 +660,34 @@ void Cmd_End_write(Alloc a, CmdRef ref)
     write_mem(param, param_1, param_2);
 }
 
-void alloc_write(Alloc a, uint offset, Alloc alloc)
-{
-    Alloc param = a;
-    uint param_1 = offset >> uint(2);
-    uint param_2 = alloc.offset;
-    write_mem(param, param_1, param_2);
-}
-
 void comp_main()
 {
-    uint width_in_bins = ((_1020.Load(8) + 16u) - 1u) / 16u;
+    mem_ok = true;
+    uint param = 7u;
+    bool _1012 = check_deps(param);
+    if (!_1012)
+    {
+        return;
+    }
+    uint width_in_bins = ((_891.Load(12) + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1020.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_891.Load(4) + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _1085;
-    _1085.offset = _1020.Load(24);
-    Alloc param;
-    param.offset = _1085.offset;
-    uint param_1 = this_tile_ix * 1024u;
-    uint param_2 = 1024u;
-    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1094 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1094;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _891.Load(12)) + bin_tile_x) + tile_x;
+    Alloc _1082;
+    _1082.offset = _891.Load(28);
+    Alloc param_1;
+    param_1.offset = _1082.offset;
+    uint param_2 = this_tile_ix * 1024u;
+    uint param_3 = 1024u;
+    Alloc cmd_alloc = slice_mem(param_1, param_2, param_3);
+    CmdRef _1091 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1091;
     uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
@@ -689,25 +695,24 @@ void comp_main()
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    Alloc param_3 = cmd_alloc;
-    uint param_4 = 0u;
-    uint param_5 = 8u;
-    Alloc scratch_alloc = slice_mem(param_3, param_4, param_5);
+    Alloc param_4 = cmd_alloc;
+    uint param_5 = 0u;
+    uint param_6 = 8u;
+    Alloc scratch_alloc = slice_mem(param_4, param_5, param_6);
     cmd_ref.offset += 4u;
     uint render_blend_depth = 0u;
     uint max_blend_depth = 0u;
-    uint drawmonoid_start = _1020.Load(44) >> uint(2);
-    uint drawtag_start = _1020.Load(100) >> uint(2);
-    uint drawdata_start = _1020.Load(104) >> uint(2);
-    uint drawinfo_start = _1020.Load(68) >> uint(2);
-    bool mem_ok = _266.Load(4) == 0u;
-    Alloc param_6;
-    Alloc param_8;
-    uint _1331;
+    uint drawmonoid_start = _891.Load(48) >> uint(2);
+    uint drawtag_start = _891.Load(104) >> uint(2);
+    uint drawdata_start = _891.Load(108) >> uint(2);
+    uint drawinfo_start = _891.Load(72) >> uint(2);
+    Alloc param_7;
+    Alloc param_9;
+    uint _1322;
     uint element_ix;
-    Alloc param_17;
+    Alloc param_18;
     uint tile_count;
-    uint _1632;
+    uint _1622;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -717,40 +722,40 @@ void comp_main()
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1383;
+        bool _1374;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1181 = th_ix < 256u;
-                bool _1189;
-                if (_1181)
+                bool _1174 = th_ix < 256u;
+                bool _1182;
+                if (_1174)
                 {
-                    _1189 = (partition_ix + th_ix) < n_partitions;
+                    _1182 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1189 = _1181;
+                    _1182 = _1174;
                 }
-                if (_1189)
+                if (_1182)
                 {
-                    uint in_ix = (_1020.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1206;
-                    _1206.offset = _1020.Load(20);
-                    param_6.offset = _1206.offset;
-                    uint param_7 = in_ix;
-                    count = read_mem(param_6, param_7);
-                    Alloc _1217;
-                    _1217.offset = _1020.Load(20);
-                    param_8.offset = _1217.offset;
-                    uint param_9 = in_ix + 1u;
-                    uint offset = read_mem(param_8, param_9);
-                    uint param_10 = offset;
-                    uint param_11 = count * 4u;
-                    bool param_12 = mem_ok;
-                    sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12);
+                    uint in_ix = (_891.Load(24) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1200;
+                    _1200.offset = _891.Load(24);
+                    param_7.offset = _1200.offset;
+                    uint param_8 = in_ix;
+                    count = read_mem(param_7, param_8);
+                    Alloc _1211;
+                    _1211.offset = _891.Load(24);
+                    param_9.offset = _1211.offset;
+                    uint param_10 = in_ix + 1u;
+                    uint offset = read_mem(param_9, param_10);
+                    uint param_11 = offset;
+                    uint param_12 = count * 4u;
+                    bool param_13 = true;
+                    sh_part_elements[th_ix] = new_alloc(param_11, param_12, param_13);
                 }
                 for (uint i_1 = 0u; i_1 < 8u; i_1++)
                 {
@@ -777,7 +782,7 @@ void comp_main()
                 partition_ix += 256u;
             }
             uint ix = rd_ix + th_ix;
-            if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok)
+            if ((ix >= wr_ix) && (ix < ready_ix))
             {
                 uint part_ix = 0u;
                 for (uint i_2 = 0u; i_2 < 8u; i_2++)
@@ -790,35 +795,35 @@ void comp_main()
                 }
                 if (part_ix > 0u)
                 {
-                    _1331 = sh_part_count[part_ix - 1u];
+                    _1322 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1331 = part_start_ix;
+                    _1322 = part_start_ix;
                 }
-                ix -= _1331;
+                ix -= _1322;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1350 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1350;
-                BinInstanceRef param_13 = inst_ref;
-                uint param_14 = ix;
-                Alloc param_15 = bin_alloc;
-                BinInstanceRef param_16 = BinInstance_index(param_13, param_14);
-                BinInstance inst = BinInstance_read(param_15, param_16);
+                BinInstanceRef _1341 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1341;
+                BinInstanceRef param_14 = inst_ref;
+                uint param_15 = ix;
+                Alloc param_16 = bin_alloc;
+                BinInstanceRef param_17 = BinInstance_index(param_14, param_15);
+                BinInstance inst = BinInstance_read(param_16, param_17);
                 sh_elements[th_ix] = inst.element_ix;
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1373 = (wr_ix - rd_ix) < 256u;
-            if (_1373)
+            bool _1364 = (wr_ix - rd_ix) < 256u;
+            if (_1364)
             {
-                _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1374 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1383 = _1373;
+                _1374 = _1364;
             }
-            if (_1383)
+            if (_1374)
             {
                 continue;
             }
@@ -831,7 +836,7 @@ void comp_main()
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1399.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1390.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
@@ -843,13 +848,13 @@ void comp_main()
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = _266.Load(drawmonoid_base * 4 + 8);
-                PathRef _1424 = { _1020.Load(16) + (path_ix * 12u) };
-                Alloc _1427;
-                _1427.offset = _1020.Load(16);
-                param_17.offset = _1427.offset;
-                PathRef param_18 = _1424;
-                Path path = Path_read(param_17, param_18);
+                uint path_ix = _267.Load(drawmonoid_base * 4 + 12);
+                PathRef _1415 = { _891.Load(20) + (path_ix * 12u) };
+                Alloc _1418;
+                _1418.offset = _891.Load(20);
+                param_18.offset = _1418.offset;
+                PathRef param_19 = _1415;
+                Path path = Path_read(param_18, param_19);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -864,13 +869,13 @@ void comp_main()
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_19 = path.tiles.offset;
-                uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_21 = mem_ok;
-                Alloc path_alloc = new_alloc(param_19, param_20, param_21);
-                uint param_22 = th_ix;
-                Alloc param_23 = path_alloc;
-                write_tile_alloc(param_22, param_23);
+                uint param_20 = path.tiles.offset;
+                uint param_21 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_22 = true;
+                Alloc path_alloc = new_alloc(param_20, param_21, param_22);
+                uint param_23 = th_ix;
+                Alloc param_24 = path_alloc;
+                write_tile_alloc(param_23, param_24);
                 break;
             }
             default:
@@ -904,62 +909,59 @@ void comp_main()
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1399.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1390.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1632 = sh_tile_count[el_ix - 1u];
+                _1622 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1632 = 0u;
+                _1622 = 0u;
             }
-            uint seq_ix = ix_1 - _1632;
+            uint seq_ix = ix_1 - _1622;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
-            if (mem_ok)
+            uint param_25 = el_ix;
+            bool param_26 = true;
+            TileRef _1670 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+            Alloc param_27 = read_tile_alloc(param_25, param_26);
+            TileRef param_28 = _1670;
+            Tile tile = Tile_read(param_27, param_28);
+            bool is_clip = (tag_1 & 1u) != 0u;
+            bool is_blend = false;
+            if (is_clip)
             {
-                uint param_24 = el_ix;
-                bool param_25 = mem_ok;
-                TileRef _1684 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Alloc param_26 = read_tile_alloc(param_24, param_25);
-                TileRef param_27 = _1684;
-                Tile tile = Tile_read(param_26, param_27);
-                bool is_clip = (tag_1 & 1u) != 0u;
-                bool is_blend = false;
-                if (is_clip)
-                {
-                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = _266.Load((drawmonoid_base_1 + 2u) * 4 + 8);
-                    uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1399.Load(dd * 4 + 0);
-                    is_blend = blend != 32771u;
-                }
-                bool _1720 = tile.tile.offset != 0u;
-                bool _1729;
-                if (!_1720)
-                {
-                    _1729 = (tile.backdrop == 0) == is_clip;
-                }
-                else
-                {
-                    _1729 = _1720;
-                }
-                include_tile = _1729 || is_blend;
+                uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+                uint scene_offset = _267.Load((drawmonoid_base_1 + 2u) * 4 + 12);
+                uint dd = drawdata_start + (scene_offset >> uint(2));
+                uint blend = _1390.Load(dd * 4 + 0);
+                is_blend = blend != 32771u;
             }
+            bool _1706 = tile.tile.offset != 0u;
+            bool _1715;
+            if (!_1706)
+            {
+                _1715 = (tile.backdrop == 0) == is_clip;
+            }
+            else
+            {
+                _1715 = _1706;
+            }
+            include_tile = _1715 || is_blend;
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1751;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1751);
+                uint _1737;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1737);
             }
         }
         GroupMemoryBarrierWithGroupSync();
         uint slice_ix = 0u;
         uint bitmap = sh_bitmaps[0][th_ix];
-        while (mem_ok)
+        while (true)
         {
             if (bitmap == 0u)
             {
@@ -977,178 +979,173 @@ void comp_main()
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1399.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1390.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
-                uint param_28 = element_ref_ix;
-                bool param_29 = mem_ok;
-                TileRef _1828 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                Alloc param_30 = read_tile_alloc(param_28, param_29);
-                TileRef param_31 = _1828;
-                Tile tile_1 = Tile_read(param_30, param_31);
+                uint param_29 = element_ref_ix;
+                bool param_30 = true;
+                TileRef _1812 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Alloc param_31 = read_tile_alloc(param_29, param_30);
+                TileRef param_32 = _1812;
+                Tile tile_1 = Tile_read(param_31, param_32);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = _266.Load((drawmonoid_base_2 + 2u) * 4 + 8);
-                uint info_offset = _266.Load((drawmonoid_base_2 + 3u) * 4 + 8);
+                uint scene_offset_1 = _267.Load((drawmonoid_base_2 + 2u) * 4 + 12);
+                uint info_offset = _267.Load((drawmonoid_base_2 + 3u) * 4 + 12);
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = asfloat(_266.Load(di * 4 + 8));
-                        Alloc param_32 = cmd_alloc;
-                        CmdRef param_33 = cmd_ref;
-                        uint param_34 = cmd_limit;
-                        bool _1876 = alloc_cmd(param_32, param_33, param_34);
-                        cmd_alloc = param_32;
-                        cmd_ref = param_33;
-                        cmd_limit = param_34;
-                        if (!_1876)
+                        linewidth = asfloat(_267.Load(di * 4 + 12));
+                        Alloc param_33 = cmd_alloc;
+                        CmdRef param_34 = cmd_ref;
+                        uint param_35 = cmd_limit;
+                        alloc_cmd(param_33, param_34, param_35);
+                        cmd_alloc = param_33;
+                        cmd_ref = param_34;
+                        cmd_limit = param_35;
+                        Alloc param_36 = cmd_alloc;
+                        CmdRef param_37 = cmd_ref;
+                        Tile param_38 = tile_1;
+                        float param_39 = linewidth;
+                        write_fill(param_36, param_37, param_38, param_39);
+                        cmd_ref = param_37;
+                        uint rgba = _1390.Load(dd_1 * 4 + 0);
+                        if (mem_ok)
                         {
-                            break;
+                            CmdColor _1882 = { rgba };
+                            Alloc param_40 = cmd_alloc;
+                            CmdRef param_41 = cmd_ref;
+                            CmdColor param_42 = _1882;
+                            Cmd_Color_write(param_40, param_41, param_42);
                         }
-                        Alloc param_35 = cmd_alloc;
-                        CmdRef param_36 = cmd_ref;
-                        Tile param_37 = tile_1;
-                        float param_38 = linewidth;
-                        write_fill(param_35, param_36, param_37, param_38);
-                        cmd_ref = param_36;
-                        uint rgba = _1399.Load(dd_1 * 4 + 0);
-                        CmdColor _1899 = { rgba };
-                        Alloc param_39 = cmd_alloc;
-                        CmdRef param_40 = cmd_ref;
-                        CmdColor param_41 = _1899;
-                        Cmd_Color_write(param_39, param_40, param_41);
                         cmd_ref.offset += 8u;
                         break;
                     }
                     case 276u:
                     {
-                        Alloc param_42 = cmd_alloc;
-                        CmdRef param_43 = cmd_ref;
-                        uint param_44 = cmd_limit;
-                        bool _1917 = alloc_cmd(param_42, param_43, param_44);
-                        cmd_alloc = param_42;
-                        cmd_ref = param_43;
-                        cmd_limit = param_44;
-                        if (!_1917)
+                        Alloc param_43 = cmd_alloc;
+                        CmdRef param_44 = cmd_ref;
+                        uint param_45 = cmd_limit;
+                        alloc_cmd(param_43, param_44, param_45);
+                        cmd_alloc = param_43;
+                        cmd_ref = param_44;
+                        cmd_limit = param_45;
+                        linewidth = asfloat(_267.Load(di * 4 + 12));
+                        Alloc param_46 = cmd_alloc;
+                        CmdRef param_47 = cmd_ref;
+                        Tile param_48 = tile_1;
+                        float param_49 = linewidth;
+                        write_fill(param_46, param_47, param_48, param_49);
+                        cmd_ref = param_47;
+                        cmd_lin.index = _1390.Load(dd_1 * 4 + 0);
+                        cmd_lin.line_x = asfloat(_267.Load((di + 1u) * 4 + 12));
+                        cmd_lin.line_y = asfloat(_267.Load((di + 2u) * 4 + 12));
+                        cmd_lin.line_c = asfloat(_267.Load((di + 3u) * 4 + 12));
+                        if (mem_ok)
                         {
-                            break;
+                            Alloc param_50 = cmd_alloc;
+                            CmdRef param_51 = cmd_ref;
+                            CmdLinGrad param_52 = cmd_lin;
+                            Cmd_LinGrad_write(param_50, param_51, param_52);
                         }
-                        linewidth = asfloat(_266.Load(di * 4 + 8));
-                        Alloc param_45 = cmd_alloc;
-                        CmdRef param_46 = cmd_ref;
-                        Tile param_47 = tile_1;
-                        float param_48 = linewidth;
-                        write_fill(param_45, param_46, param_47, param_48);
-                        cmd_ref = param_46;
-                        cmd_lin.index = _1399.Load(dd_1 * 4 + 0);
-                        cmd_lin.line_x = asfloat(_266.Load((di + 1u) * 4 + 8));
-                        cmd_lin.line_y = asfloat(_266.Load((di + 2u) * 4 + 8));
-                        cmd_lin.line_c = asfloat(_266.Load((di + 3u) * 4 + 8));
-                        Alloc param_49 = cmd_alloc;
-                        CmdRef param_50 = cmd_ref;
-                        CmdLinGrad param_51 = cmd_lin;
-                        Cmd_LinGrad_write(param_49, param_50, param_51);
                         cmd_ref.offset += 20u;
                         break;
                     }
                     case 732u:
                     {
-                        Alloc param_52 = cmd_alloc;
-                        CmdRef param_53 = cmd_ref;
-                        uint param_54 = cmd_limit;
-                        bool _1981 = alloc_cmd(param_52, param_53, param_54);
-                        cmd_alloc = param_52;
-                        cmd_ref = param_53;
-                        cmd_limit = param_54;
-                        if (!_1981)
+                        Alloc param_53 = cmd_alloc;
+                        CmdRef param_54 = cmd_ref;
+                        uint param_55 = cmd_limit;
+                        alloc_cmd(param_53, param_54, param_55);
+                        cmd_alloc = param_53;
+                        cmd_ref = param_54;
+                        cmd_limit = param_55;
+                        linewidth = asfloat(_267.Load(di * 4 + 12));
+                        Alloc param_56 = cmd_alloc;
+                        CmdRef param_57 = cmd_ref;
+                        Tile param_58 = tile_1;
+                        float param_59 = linewidth;
+                        write_fill(param_56, param_57, param_58, param_59);
+                        cmd_ref = param_57;
+                        cmd_rad.index = _1390.Load(dd_1 * 4 + 0);
+                        cmd_rad.mat = asfloat(uint4(_267.Load((di + 1u) * 4 + 12), _267.Load((di + 2u) * 4 + 12), _267.Load((di + 3u) * 4 + 12), _267.Load((di + 4u) * 4 + 12)));
+                        cmd_rad.xlat = asfloat(uint2(_267.Load((di + 5u) * 4 + 12), _267.Load((di + 6u) * 4 + 12)));
+                        cmd_rad.c1 = asfloat(uint2(_267.Load((di + 7u) * 4 + 12), _267.Load((di + 8u) * 4 + 12)));
+                        cmd_rad.ra = asfloat(_267.Load((di + 9u) * 4 + 12));
+                        cmd_rad.roff = asfloat(_267.Load((di + 10u) * 4 + 12));
+                        if (mem_ok)
                         {
-                            break;
+                            Alloc param_60 = cmd_alloc;
+                            CmdRef param_61 = cmd_ref;
+                            CmdRadGrad param_62 = cmd_rad;
+                            Cmd_RadGrad_write(param_60, param_61, param_62);
                         }
-                        linewidth = asfloat(_266.Load(di * 4 + 8));
-                        Alloc param_55 = cmd_alloc;
-                        CmdRef param_56 = cmd_ref;
-                        Tile param_57 = tile_1;
-                        float param_58 = linewidth;
-                        write_fill(param_55, param_56, param_57, param_58);
-                        cmd_ref = param_56;
-                        cmd_rad.index = _1399.Load(dd_1 * 4 + 0);
-                        cmd_rad.mat = asfloat(uint4(_266.Load((di + 1u) * 4 + 8), _266.Load((di + 2u) * 4 + 8), _266.Load((di + 3u) * 4 + 8), _266.Load((di + 4u) * 4 + 8)));
-                        cmd_rad.xlat = asfloat(uint2(_266.Load((di + 5u) * 4 + 8), _266.Load((di + 6u) * 4 + 8)));
-                        cmd_rad.c1 = asfloat(uint2(_266.Load((di + 7u) * 4 + 8), _266.Load((di + 8u) * 4 + 8)));
-                        cmd_rad.ra = asfloat(_266.Load((di + 9u) * 4 + 8));
-                        cmd_rad.roff = asfloat(_266.Load((di + 10u) * 4 + 8));
-                        Alloc param_59 = cmd_alloc;
-                        CmdRef param_60 = cmd_ref;
-                        CmdRadGrad param_61 = cmd_rad;
-                        Cmd_RadGrad_write(param_59, param_60, param_61);
                         cmd_ref.offset += 48u;
                         break;
                     }
                     case 72u:
                     {
-                        linewidth = asfloat(_266.Load(di * 4 + 8));
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        uint param_64 = cmd_limit;
-                        bool _2087 = alloc_cmd(param_62, param_63, param_64);
-                        cmd_alloc = param_62;
-                        cmd_ref = param_63;
-                        cmd_limit = param_64;
-                        if (!_2087)
-                        {
-                            break;
-                        }
-                        Alloc param_65 = cmd_alloc;
-                        CmdRef param_66 = cmd_ref;
-                        Tile param_67 = tile_1;
-                        float param_68 = linewidth;
-                        write_fill(param_65, param_66, param_67, param_68);
-                        cmd_ref = param_66;
-                        uint index = _1399.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1399.Load((dd_1 + 1u) * 4 + 0);
+                        Alloc param_63 = cmd_alloc;
+                        CmdRef param_64 = cmd_ref;
+                        uint param_65 = cmd_limit;
+                        alloc_cmd(param_63, param_64, param_65);
+                        cmd_alloc = param_63;
+                        cmd_ref = param_64;
+                        cmd_limit = param_65;
+                        linewidth = asfloat(_267.Load(di * 4 + 12));
+                        Alloc param_66 = cmd_alloc;
+                        CmdRef param_67 = cmd_ref;
+                        Tile param_68 = tile_1;
+                        float param_69 = linewidth;
+                        write_fill(param_66, param_67, param_68, param_69);
+                        cmd_ref = param_67;
+                        uint index = _1390.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1390.Load((dd_1 + 1u) * 4 + 0);
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _2126 = { index, offset_1 };
-                        Alloc param_69 = cmd_alloc;
-                        CmdRef param_70 = cmd_ref;
-                        CmdImage param_71 = _2126;
-                        Cmd_Image_write(param_69, param_70, param_71);
+                        if (mem_ok)
+                        {
+                            CmdImage _2106 = { index, offset_1 };
+                            Alloc param_70 = cmd_alloc;
+                            CmdRef param_71 = cmd_ref;
+                            CmdImage param_72 = _2106;
+                            Cmd_Image_write(param_70, param_71, param_72);
+                        }
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2140 = tile_1.tile.offset == 0u;
-                        bool _2146;
-                        if (_2140)
+                        bool _2120 = tile_1.tile.offset == 0u;
+                        bool _2126;
+                        if (_2120)
                         {
-                            _2146 = tile_1.backdrop == 0;
+                            _2126 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2146 = _2140;
+                            _2126 = _2120;
                         }
-                        if (_2146)
+                        if (_2126)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_72 = cmd_alloc;
-                            CmdRef param_73 = cmd_ref;
-                            uint param_74 = cmd_limit;
-                            bool _2158 = alloc_cmd(param_72, param_73, param_74);
-                            cmd_alloc = param_72;
-                            cmd_ref = param_73;
-                            cmd_limit = param_74;
-                            if (!_2158)
+                            Alloc param_73 = cmd_alloc;
+                            CmdRef param_74 = cmd_ref;
+                            uint param_75 = cmd_limit;
+                            alloc_cmd(param_73, param_74, param_75);
+                            cmd_alloc = param_73;
+                            cmd_ref = param_74;
+                            cmd_limit = param_75;
+                            if (mem_ok)
                             {
-                                break;
+                                Alloc param_76 = cmd_alloc;
+                                CmdRef param_77 = cmd_ref;
+                                Cmd_BeginClip_write(param_76, param_77);
                             }
-                            Alloc param_75 = cmd_alloc;
-                            CmdRef param_76 = cmd_ref;
-                            Cmd_BeginClip_write(param_75, param_76);
                             cmd_ref.offset += 4u;
                             render_blend_depth++;
                             max_blend_depth = max(max_blend_depth, render_blend_depth);
@@ -1159,29 +1156,21 @@ void comp_main()
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_77 = cmd_alloc;
-                        CmdRef param_78 = cmd_ref;
-                        uint param_79 = cmd_limit;
-                        bool _2191 = alloc_cmd(param_77, param_78, param_79);
-                        cmd_alloc = param_77;
-                        cmd_ref = param_78;
-                        cmd_limit = param_79;
-                        if (!_2191)
+                        Alloc param_78 = cmd_alloc;
+                        CmdRef param_79 = cmd_ref;
+                        Tile param_80 = tile_1;
+                        float param_81 = -1.0f;
+                        write_fill(param_78, param_79, param_80, param_81);
+                        cmd_ref = param_79;
+                        uint blend_1 = _1390.Load(dd_1 * 4 + 0);
+                        if (mem_ok)
                         {
-                            break;
+                            CmdEndClip _2182 = { blend_1 };
+                            Alloc param_82 = cmd_alloc;
+                            CmdRef param_83 = cmd_ref;
+                            CmdEndClip param_84 = _2182;
+                            Cmd_EndClip_write(param_82, param_83, param_84);
                         }
-                        Alloc param_80 = cmd_alloc;
-                        CmdRef param_81 = cmd_ref;
-                        Tile param_82 = tile_1;
-                        float param_83 = -1.0f;
-                        write_fill(param_80, param_81, param_82, param_83);
-                        cmd_ref = param_81;
-                        uint blend_1 = _1399.Load(dd_1 * 4 + 0);
-                        CmdEndClip _2214 = { blend_1 };
-                        Alloc param_84 = cmd_alloc;
-                        CmdRef param_85 = cmd_ref;
-                        CmdEndClip param_86 = _2214;
-                        Cmd_EndClip_write(param_84, param_85, param_86);
                         cmd_ref.offset += 8u;
                         render_blend_depth--;
                         break;
@@ -1216,31 +1205,34 @@ void comp_main()
             break;
         }
     }
-    bool _2263 = (bin_tile_x + tile_x) < _1020.Load(8);
-    bool _2272;
-    if (_2263)
+    bool _2231 = (bin_tile_x + tile_x) < _891.Load(12);
+    bool _2240;
+    if (_2231)
     {
-        _2272 = (bin_tile_y + tile_y) < _1020.Load(12);
+        _2240 = (bin_tile_y + tile_y) < _891.Load(16);
     }
     else
     {
-        _2272 = _2263;
+        _2240 = _2231;
     }
-    if (_2272)
+    if (_2240)
     {
-        Alloc param_87 = cmd_alloc;
-        CmdRef param_88 = cmd_ref;
-        Cmd_End_write(param_87, param_88);
+        if (mem_ok)
+        {
+            Alloc param_85 = cmd_alloc;
+            CmdRef param_86 = cmd_ref;
+            Cmd_End_write(param_85, param_86);
+        }
         if (max_blend_depth > 4u)
         {
             uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u;
-            uint param_89 = scratch_size;
-            MallocResult _2293 = malloc(param_89);
-            MallocResult scratch = _2293;
-            Alloc param_90 = scratch_alloc;
-            uint param_91 = scratch_alloc.offset;
-            Alloc param_92 = scratch.alloc;
-            alloc_write(param_90, param_91, param_92);
+            uint _2264;
+            _267.InterlockedAdd(8, scratch_size, _2264);
+            uint scratch = _2264;
+            Alloc param_87 = scratch_alloc;
+            uint param_88 = scratch_alloc.offset >> uint(2);
+            uint param_89 = scratch;
+            write_mem(param_87, param_88, param_89);
         }
     }
 }
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index d84add1..5df99b9 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -19,12 +19,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
 struct BinInstanceRef
 {
     uint offset;
@@ -162,6 +156,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -172,6 +167,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -213,6 +209,13 @@ struct SceneBuf
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_267)
+{
+    uint _273 = atomic_fetch_or_explicit((device atomic_uint*)&v_267.mem_error, 0u, memory_order_relaxed);
+    return (_273 & dep_stage) == 0u;
+}
+
 static inline __attribute__((always_inline))
 Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
 {
@@ -226,7 +229,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_266, constant uint& v_266BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_267)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -234,7 +237,7 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_266.memory[offset];
+    uint v = v_267.memory[offset];
     return v;
 }
 
@@ -253,30 +256,30 @@ BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize);
+    uint raw0 = read_mem(param, param_1, v_267);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize);
+    uint raw0 = read_mem(param, param_1, v_267);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_267);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_266, v_266BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_267);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -289,24 +292,24 @@ void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a)
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_266, constant uint& v_266BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, const device ConfigBuf& v_891)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_266BufferSize - 8) / 4) * 4);
+    uint param_1 = v_891.conf.mem_size;
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize);
+    uint raw0 = read_mem(param, param_1, v_267);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_267);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -314,26 +317,20 @@ Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory&
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_266, constant uint& v_266BufferSize)
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_267)
 {
-    uint _272 = atomic_fetch_add_explicit((device atomic_uint*)&v_266.mem_offset, size, memory_order_relaxed);
-    uint offset = _272;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int((v_266BufferSize - 8) / 4) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _282 = atomic_fetch_add_explicit((device atomic_uint*)&v_267.mem_offset, size, memory_order_relaxed);
+    uint offset = _282;
+    if ((offset + size) > mem_size)
     {
-        uint _301 = atomic_fetch_max_explicit((device atomic_uint*)&v_266.mem_error, 1u, memory_order_relaxed);
-        return r;
+        uint _292 = atomic_fetch_or_explicit((device atomic_uint*)&v_267.mem_error, stage, memory_order_relaxed);
+        offset = 0u;
     }
-    return r;
+    return offset;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_266, constant uint& v_266BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_267)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -341,352 +338,359 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_266.memory[offset] = val;
+    v_267.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 11u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_266, constant uint& v_266BufferSize)
+void alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, thread bool& mem_ok, device Memory& v_267, const device ConfigBuf& v_891)
 {
     if (cmd_ref.offset < cmd_limit)
     {
-        return true;
+        return;
     }
     uint param = 1024u;
-    MallocResult _928 = malloc(param, v_266, v_266BufferSize);
-    MallocResult new_cmd = _928;
-    if (new_cmd.failed)
+    uint param_1 = v_891.conf.mem_size;
+    uint param_2 = 8u;
+    uint _915 = malloc_stage(param, param_1, param_2, v_267);
+    uint new_cmd = _915;
+    if (new_cmd == 0u)
     {
-        return false;
+        mem_ok = false;
     }
-    CmdJump jump = CmdJump{ new_cmd.alloc.offset };
-    Alloc param_1 = cmd_alloc;
-    CmdRef param_2 = cmd_ref;
-    CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_266, v_266BufferSize);
-    cmd_alloc = new_cmd.alloc;
-    cmd_ref = CmdRef{ cmd_alloc.offset };
-    cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
-    return true;
+    if (mem_ok)
+    {
+        CmdJump jump = CmdJump{ new_cmd };
+        Alloc param_3 = cmd_alloc;
+        CmdRef param_4 = cmd_ref;
+        CmdJump param_5 = jump;
+        Cmd_Jump_write(param_3, param_4, param_5, v_267);
+    }
+    uint param_6 = new_cmd;
+    uint param_7 = 1024u;
+    bool param_8 = true;
+    cmd_alloc = new_alloc(param_6, param_7, param_8);
+    cmd_ref = CmdRef{ new_cmd };
+    cmd_limit = (new_cmd + 1024u) - 144u;
 }
 
 static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
+    write_mem(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
+    write_mem(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_266, constant uint& v_266BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, thread bool& mem_ok, device Memory& v_267)
 {
     if (linewidth < 0.0)
     {
         if (tile.tile.offset != 0u)
         {
             CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop };
-            Alloc param = alloc;
-            CmdRef param_1 = cmd_ref;
-            CmdFill param_2 = cmd_fill;
-            Cmd_Fill_write(param, param_1, param_2, v_266, v_266BufferSize);
+            if (mem_ok)
+            {
+                Alloc param = alloc;
+                CmdRef param_1 = cmd_ref;
+                CmdFill param_2 = cmd_fill;
+                Cmd_Fill_write(param, param_1, param_2, v_267);
+            }
             cmd_ref.offset += 12u;
         }
         else
         {
-            Alloc param_3 = alloc;
-            CmdRef param_4 = cmd_ref;
-            Cmd_Solid_write(param_3, param_4, v_266, v_266BufferSize);
+            if (mem_ok)
+            {
+                Alloc param_3 = alloc;
+                CmdRef param_4 = cmd_ref;
+                Cmd_Solid_write(param_3, param_4, v_267);
+            }
             cmd_ref.offset += 4u;
         }
     }
     else
     {
         CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth };
-        Alloc param_5 = alloc;
-        CmdRef param_6 = cmd_ref;
-        CmdStroke param_7 = cmd_stroke;
-        Cmd_Stroke_write(param_5, param_6, param_7, v_266, v_266BufferSize);
+        if (mem_ok)
+        {
+            Alloc param_5 = alloc;
+            CmdRef param_6 = cmd_ref;
+            CmdStroke param_7 = cmd_stroke;
+            Cmd_Stroke_write(param_5, param_6, param_7, v_267);
+        }
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
+    write_mem(param_3, param_4, param_5, v_267);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_266, v_266BufferSize);
+    write_mem(param_6, param_7, param_8, v_267);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_266, v_266BufferSize);
+    write_mem(param_9, param_10, param_11, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.mat.x);
-    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
+    write_mem(param_3, param_4, param_5, v_267);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.mat.y);
-    write_mem(param_6, param_7, param_8, v_266, v_266BufferSize);
+    write_mem(param_6, param_7, param_8, v_267);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.mat.z);
-    write_mem(param_9, param_10, param_11, v_266, v_266BufferSize);
+    write_mem(param_9, param_10, param_11, v_267);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.mat.w);
-    write_mem(param_12, param_13, param_14, v_266, v_266BufferSize);
+    write_mem(param_12, param_13, param_14, v_267);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = as_type<uint>(s.xlat.x);
-    write_mem(param_15, param_16, param_17, v_266, v_266BufferSize);
+    write_mem(param_15, param_16, param_17, v_267);
     Alloc param_18 = a;
     uint param_19 = ix + 6u;
     uint param_20 = as_type<uint>(s.xlat.y);
-    write_mem(param_18, param_19, param_20, v_266, v_266BufferSize);
+    write_mem(param_18, param_19, param_20, v_267);
     Alloc param_21 = a;
     uint param_22 = ix + 7u;
     uint param_23 = as_type<uint>(s.c1.x);
-    write_mem(param_21, param_22, param_23, v_266, v_266BufferSize);
+    write_mem(param_21, param_22, param_23, v_267);
     Alloc param_24 = a;
     uint param_25 = ix + 8u;
     uint param_26 = as_type<uint>(s.c1.y);
-    write_mem(param_24, param_25, param_26, v_266, v_266BufferSize);
+    write_mem(param_24, param_25, param_26, v_267);
     Alloc param_27 = a;
     uint param_28 = ix + 9u;
     uint param_29 = as_type<uint>(s.ra);
-    write_mem(param_27, param_28, param_29, v_266, v_266BufferSize);
+    write_mem(param_27, param_28, param_29, v_267);
     Alloc param_30 = a;
     uint param_31 = ix + 10u;
     uint param_32 = as_type<uint>(s.roff);
-    write_mem(param_30, param_31, param_32, v_266, v_266BufferSize);
+    write_mem(param_30, param_31, param_32, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u };
     CmdRadGrad param_5 = s;
-    CmdRadGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdRadGrad_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_266, v_266BufferSize);
+    write_mem(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
     CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdImage_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
 }
 
 static inline __attribute__((always_inline))
-void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize)
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_267)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.blend;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
     Alloc param_3 = a;
     CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
     CmdEndClip param_5 = s;
-    CmdEndClip_write(param_3, param_4, param_5, v_266, v_266BufferSize);
+    CmdEndClip_write(param_3, param_4, param_5, v_267);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
+    write_mem(param, param_1, param_2, v_267);
 }
 
-static inline __attribute__((always_inline))
-void alloc_write(thread const Alloc& a, thread const uint& offset, thread const Alloc& alloc, device Memory& v_266, constant uint& v_266BufferSize)
-{
-    Alloc param = a;
-    uint param_1 = offset >> uint(2);
-    uint param_2 = alloc.offset;
-    write_mem(param, param_1, param_2, v_266, v_266BufferSize);
-}
-
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_266 [[buffer(0)]], const device ConfigBuf& _1020 [[buffer(1)]], const device SceneBuf& _1399 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_267 [[buffer(0)]], const device ConfigBuf& v_891 [[buffer(1)]], const device SceneBuf& _1390 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -698,22 +702,28 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_266BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_1020.conf.width_in_tiles + 16u) - 1u) / 16u;
+    bool mem_ok = true;
+    uint param = 7u;
+    bool _1012 = check_deps(param, v_267);
+    if (!_1012)
+    {
+        return;
+    }
+    uint width_in_bins = ((v_891.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_1020.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((v_891.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.conf.width_in_tiles) + bin_tile_x) + tile_x;
-    Alloc param;
-    param.offset = _1020.conf.ptcl_alloc.offset;
-    uint param_1 = this_tile_ix * 1024u;
-    uint param_2 = 1024u;
-    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * v_891.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    Alloc param_1;
+    param_1.offset = v_891.conf.ptcl_alloc.offset;
+    uint param_2 = this_tile_ix * 1024u;
+    uint param_3 = 1024u;
+    Alloc cmd_alloc = slice_mem(param_1, param_2, param_3);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
     uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
     uint clip_depth = 0u;
@@ -722,25 +732,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    Alloc param_3 = cmd_alloc;
-    uint param_4 = 0u;
-    uint param_5 = 8u;
-    Alloc scratch_alloc = slice_mem(param_3, param_4, param_5);
+    Alloc param_4 = cmd_alloc;
+    uint param_5 = 0u;
+    uint param_6 = 8u;
+    Alloc scratch_alloc = slice_mem(param_4, param_5, param_6);
     cmd_ref.offset += 4u;
     uint render_blend_depth = 0u;
     uint max_blend_depth = 0u;
-    uint drawmonoid_start = _1020.conf.drawmonoid_alloc.offset >> uint(2);
-    uint drawtag_start = _1020.conf.drawtag_offset >> uint(2);
-    uint drawdata_start = _1020.conf.drawdata_offset >> uint(2);
-    uint drawinfo_start = _1020.conf.drawinfo_alloc.offset >> uint(2);
-    bool mem_ok = v_266.mem_error == 0u;
-    Alloc param_6;
-    Alloc param_8;
-    uint _1331;
+    uint drawmonoid_start = v_891.conf.drawmonoid_alloc.offset >> uint(2);
+    uint drawtag_start = v_891.conf.drawtag_offset >> uint(2);
+    uint drawdata_start = v_891.conf.drawdata_offset >> uint(2);
+    uint drawinfo_start = v_891.conf.drawinfo_alloc.offset >> uint(2);
+    Alloc param_7;
+    Alloc param_9;
+    uint _1322;
     uint element_ix;
-    Alloc param_17;
+    Alloc param_18;
     uint tile_count;
-    uint _1632;
+    uint _1622;
     float linewidth;
     CmdLinGrad cmd_lin;
     CmdRadGrad cmd_rad;
@@ -750,36 +759,36 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1383;
+        bool _1374;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1181 = th_ix < 256u;
-                bool _1189;
-                if (_1181)
+                bool _1174 = th_ix < 256u;
+                bool _1182;
+                if (_1174)
                 {
-                    _1189 = (partition_ix + th_ix) < n_partitions;
+                    _1182 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1189 = _1181;
+                    _1182 = _1174;
                 }
-                if (_1189)
+                if (_1182)
                 {
-                    uint in_ix = (_1020.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_6.offset = _1020.conf.bin_alloc.offset;
-                    uint param_7 = in_ix;
-                    count = read_mem(param_6, param_7, v_266, v_266BufferSize);
-                    param_8.offset = _1020.conf.bin_alloc.offset;
-                    uint param_9 = in_ix + 1u;
-                    uint offset = read_mem(param_8, param_9, v_266, v_266BufferSize);
-                    uint param_10 = offset;
-                    uint param_11 = count * 4u;
-                    bool param_12 = mem_ok;
-                    sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12);
+                    uint in_ix = (v_891.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_7.offset = v_891.conf.bin_alloc.offset;
+                    uint param_8 = in_ix;
+                    count = read_mem(param_7, param_8, v_267);
+                    param_9.offset = v_891.conf.bin_alloc.offset;
+                    uint param_10 = in_ix + 1u;
+                    uint offset = read_mem(param_9, param_10, v_267);
+                    uint param_11 = offset;
+                    uint param_12 = count * 4u;
+                    bool param_13 = true;
+                    sh_part_elements[th_ix] = new_alloc(param_11, param_12, param_13);
                 }
                 for (uint i_1 = 0u; i_1 < 8u; i_1++)
                 {
@@ -806,7 +815,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 partition_ix += 256u;
             }
             uint ix = rd_ix + th_ix;
-            if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok)
+            if ((ix >= wr_ix) && (ix < ready_ix))
             {
                 uint part_ix = 0u;
                 for (uint i_2 = 0u; i_2 < 8u; i_2++)
@@ -819,34 +828,34 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
                 if (part_ix > 0u)
                 {
-                    _1331 = sh_part_count[part_ix - 1u];
+                    _1322 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1331 = part_start_ix;
+                    _1322 = part_start_ix;
                 }
-                ix -= _1331;
+                ix -= _1322;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
-                BinInstanceRef param_13 = inst_ref;
-                uint param_14 = ix;
-                Alloc param_15 = bin_alloc;
-                BinInstanceRef param_16 = BinInstance_index(param_13, param_14);
-                BinInstance inst = BinInstance_read(param_15, param_16, v_266, v_266BufferSize);
+                BinInstanceRef param_14 = inst_ref;
+                uint param_15 = ix;
+                Alloc param_16 = bin_alloc;
+                BinInstanceRef param_17 = BinInstance_index(param_14, param_15);
+                BinInstance inst = BinInstance_read(param_16, param_17, v_267);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1373 = (wr_ix - rd_ix) < 256u;
-            if (_1373)
+            bool _1364 = (wr_ix - rd_ix) < 256u;
+            if (_1364)
             {
-                _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1374 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1383 = _1373;
+                _1374 = _1364;
             }
-            if (_1383)
+            if (_1374)
             {
                 continue;
             }
@@ -859,7 +868,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1399.scene[drawtag_start + element_ix];
+            tag = _1390.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
@@ -871,10 +880,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = v_266.memory[drawmonoid_base];
-                param_17.offset = _1020.conf.tile_alloc.offset;
-                PathRef param_18 = PathRef{ _1020.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_17, param_18, v_266, v_266BufferSize);
+                uint path_ix = v_267.memory[drawmonoid_base];
+                param_18.offset = v_891.conf.tile_alloc.offset;
+                PathRef param_19 = PathRef{ v_891.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_18, param_19, v_267);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -889,13 +898,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 tile_count = uint(x1 - x0) * uint(y1 - y0);
                 uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                 sh_tile_base[th_ix] = base;
-                uint param_19 = path.tiles.offset;
-                uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-                bool param_21 = mem_ok;
-                Alloc path_alloc = new_alloc(param_19, param_20, param_21);
-                uint param_22 = th_ix;
-                Alloc param_23 = path_alloc;
-                write_tile_alloc(param_22, param_23);
+                uint param_20 = path.tiles.offset;
+                uint param_21 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+                bool param_22 = true;
+                Alloc path_alloc = new_alloc(param_20, param_21, param_22);
+                uint param_23 = th_ix;
+                Alloc param_24 = path_alloc;
+                write_tile_alloc(param_23, param_24);
                 break;
             }
             default:
@@ -929,60 +938,57 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1399.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1390.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1632 = sh_tile_count[el_ix - 1u];
+                _1622 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1632 = 0u;
+                _1622 = 0u;
             }
-            uint seq_ix = ix_1 - _1632;
+            uint seq_ix = ix_1 - _1622;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
             bool include_tile = false;
-            if (mem_ok)
+            uint param_25 = el_ix;
+            bool param_26 = true;
+            Alloc param_27 = read_tile_alloc(param_25, param_26, v_891);
+            TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+            Tile tile = Tile_read(param_27, param_28, v_267);
+            bool is_clip = (tag_1 & 1u) != 0u;
+            bool is_blend = false;
+            if (is_clip)
             {
-                uint param_24 = el_ix;
-                bool param_25 = mem_ok;
-                Alloc param_26 = read_tile_alloc(param_24, param_25, v_266, v_266BufferSize);
-                TileRef param_27 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_26, param_27, v_266, v_266BufferSize);
-                bool is_clip = (tag_1 & 1u) != 0u;
-                bool is_blend = false;
-                if (is_clip)
-                {
-                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = v_266.memory[drawmonoid_base_1 + 2u];
-                    uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1399.scene[dd];
-                    is_blend = blend != 32771u;
-                }
-                bool _1720 = tile.tile.offset != 0u;
-                bool _1729;
-                if (!_1720)
-                {
-                    _1729 = (tile.backdrop == 0) == is_clip;
-                }
-                else
-                {
-                    _1729 = _1720;
-                }
-                include_tile = _1729 || is_blend;
+                uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
+                uint scene_offset = v_267.memory[drawmonoid_base_1 + 2u];
+                uint dd = drawdata_start + (scene_offset >> uint(2));
+                uint blend = _1390.scene[dd];
+                is_blend = blend != 32771u;
             }
+            bool _1706 = tile.tile.offset != 0u;
+            bool _1715;
+            if (!_1706)
+            {
+                _1715 = (tile.backdrop == 0) == is_clip;
+            }
+            else
+            {
+                _1715 = _1706;
+            }
+            include_tile = _1715 || is_blend;
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1751 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1737 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
         uint slice_ix = 0u;
         uint bitmap = sh_bitmaps[0][th_ix];
-        while (mem_ok)
+        while (true)
         {
             if (bitmap == 0u)
             {
@@ -1000,175 +1006,170 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1399.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1390.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
-                uint param_28 = element_ref_ix;
-                bool param_29 = mem_ok;
-                Alloc param_30 = read_tile_alloc(param_28, param_29, v_266, v_266BufferSize);
-                TileRef param_31 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                Tile tile_1 = Tile_read(param_30, param_31, v_266, v_266BufferSize);
+                uint param_29 = element_ref_ix;
+                bool param_30 = true;
+                Alloc param_31 = read_tile_alloc(param_29, param_30, v_891);
+                TileRef param_32 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                Tile tile_1 = Tile_read(param_31, param_32, v_267);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = v_266.memory[drawmonoid_base_2 + 2u];
-                uint info_offset = v_266.memory[drawmonoid_base_2 + 3u];
+                uint scene_offset_1 = v_267.memory[drawmonoid_base_2 + 2u];
+                uint info_offset = v_267.memory[drawmonoid_base_2 + 3u];
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = as_type<float>(v_266.memory[di]);
-                        Alloc param_32 = cmd_alloc;
-                        CmdRef param_33 = cmd_ref;
-                        uint param_34 = cmd_limit;
-                        bool _1876 = alloc_cmd(param_32, param_33, param_34, v_266, v_266BufferSize);
-                        cmd_alloc = param_32;
-                        cmd_ref = param_33;
-                        cmd_limit = param_34;
-                        if (!_1876)
+                        linewidth = as_type<float>(v_267.memory[di]);
+                        Alloc param_33 = cmd_alloc;
+                        CmdRef param_34 = cmd_ref;
+                        uint param_35 = cmd_limit;
+                        alloc_cmd(param_33, param_34, param_35, mem_ok, v_267, v_891);
+                        cmd_alloc = param_33;
+                        cmd_ref = param_34;
+                        cmd_limit = param_35;
+                        Alloc param_36 = cmd_alloc;
+                        CmdRef param_37 = cmd_ref;
+                        Tile param_38 = tile_1;
+                        float param_39 = linewidth;
+                        write_fill(param_36, param_37, param_38, param_39, mem_ok, v_267);
+                        cmd_ref = param_37;
+                        uint rgba = _1390.scene[dd_1];
+                        if (mem_ok)
                         {
-                            break;
+                            Alloc param_40 = cmd_alloc;
+                            CmdRef param_41 = cmd_ref;
+                            CmdColor param_42 = CmdColor{ rgba };
+                            Cmd_Color_write(param_40, param_41, param_42, v_267);
                         }
-                        Alloc param_35 = cmd_alloc;
-                        CmdRef param_36 = cmd_ref;
-                        Tile param_37 = tile_1;
-                        float param_38 = linewidth;
-                        write_fill(param_35, param_36, param_37, param_38, v_266, v_266BufferSize);
-                        cmd_ref = param_36;
-                        uint rgba = _1399.scene[dd_1];
-                        Alloc param_39 = cmd_alloc;
-                        CmdRef param_40 = cmd_ref;
-                        CmdColor param_41 = CmdColor{ rgba };
-                        Cmd_Color_write(param_39, param_40, param_41, v_266, v_266BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
                     case 276u:
                     {
-                        Alloc param_42 = cmd_alloc;
-                        CmdRef param_43 = cmd_ref;
-                        uint param_44 = cmd_limit;
-                        bool _1917 = alloc_cmd(param_42, param_43, param_44, v_266, v_266BufferSize);
-                        cmd_alloc = param_42;
-                        cmd_ref = param_43;
-                        cmd_limit = param_44;
-                        if (!_1917)
+                        Alloc param_43 = cmd_alloc;
+                        CmdRef param_44 = cmd_ref;
+                        uint param_45 = cmd_limit;
+                        alloc_cmd(param_43, param_44, param_45, mem_ok, v_267, v_891);
+                        cmd_alloc = param_43;
+                        cmd_ref = param_44;
+                        cmd_limit = param_45;
+                        linewidth = as_type<float>(v_267.memory[di]);
+                        Alloc param_46 = cmd_alloc;
+                        CmdRef param_47 = cmd_ref;
+                        Tile param_48 = tile_1;
+                        float param_49 = linewidth;
+                        write_fill(param_46, param_47, param_48, param_49, mem_ok, v_267);
+                        cmd_ref = param_47;
+                        cmd_lin.index = _1390.scene[dd_1];
+                        cmd_lin.line_x = as_type<float>(v_267.memory[di + 1u]);
+                        cmd_lin.line_y = as_type<float>(v_267.memory[di + 2u]);
+                        cmd_lin.line_c = as_type<float>(v_267.memory[di + 3u]);
+                        if (mem_ok)
                         {
-                            break;
+                            Alloc param_50 = cmd_alloc;
+                            CmdRef param_51 = cmd_ref;
+                            CmdLinGrad param_52 = cmd_lin;
+                            Cmd_LinGrad_write(param_50, param_51, param_52, v_267);
                         }
-                        linewidth = as_type<float>(v_266.memory[di]);
-                        Alloc param_45 = cmd_alloc;
-                        CmdRef param_46 = cmd_ref;
-                        Tile param_47 = tile_1;
-                        float param_48 = linewidth;
-                        write_fill(param_45, param_46, param_47, param_48, v_266, v_266BufferSize);
-                        cmd_ref = param_46;
-                        cmd_lin.index = _1399.scene[dd_1];
-                        cmd_lin.line_x = as_type<float>(v_266.memory[di + 1u]);
-                        cmd_lin.line_y = as_type<float>(v_266.memory[di + 2u]);
-                        cmd_lin.line_c = as_type<float>(v_266.memory[di + 3u]);
-                        Alloc param_49 = cmd_alloc;
-                        CmdRef param_50 = cmd_ref;
-                        CmdLinGrad param_51 = cmd_lin;
-                        Cmd_LinGrad_write(param_49, param_50, param_51, v_266, v_266BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
                     case 732u:
                     {
-                        Alloc param_52 = cmd_alloc;
-                        CmdRef param_53 = cmd_ref;
-                        uint param_54 = cmd_limit;
-                        bool _1981 = alloc_cmd(param_52, param_53, param_54, v_266, v_266BufferSize);
-                        cmd_alloc = param_52;
-                        cmd_ref = param_53;
-                        cmd_limit = param_54;
-                        if (!_1981)
+                        Alloc param_53 = cmd_alloc;
+                        CmdRef param_54 = cmd_ref;
+                        uint param_55 = cmd_limit;
+                        alloc_cmd(param_53, param_54, param_55, mem_ok, v_267, v_891);
+                        cmd_alloc = param_53;
+                        cmd_ref = param_54;
+                        cmd_limit = param_55;
+                        linewidth = as_type<float>(v_267.memory[di]);
+                        Alloc param_56 = cmd_alloc;
+                        CmdRef param_57 = cmd_ref;
+                        Tile param_58 = tile_1;
+                        float param_59 = linewidth;
+                        write_fill(param_56, param_57, param_58, param_59, mem_ok, v_267);
+                        cmd_ref = param_57;
+                        cmd_rad.index = _1390.scene[dd_1];
+                        cmd_rad.mat = as_type<float4>(uint4(v_267.memory[di + 1u], v_267.memory[di + 2u], v_267.memory[di + 3u], v_267.memory[di + 4u]));
+                        cmd_rad.xlat = as_type<float2>(uint2(v_267.memory[di + 5u], v_267.memory[di + 6u]));
+                        cmd_rad.c1 = as_type<float2>(uint2(v_267.memory[di + 7u], v_267.memory[di + 8u]));
+                        cmd_rad.ra = as_type<float>(v_267.memory[di + 9u]);
+                        cmd_rad.roff = as_type<float>(v_267.memory[di + 10u]);
+                        if (mem_ok)
                         {
-                            break;
+                            Alloc param_60 = cmd_alloc;
+                            CmdRef param_61 = cmd_ref;
+                            CmdRadGrad param_62 = cmd_rad;
+                            Cmd_RadGrad_write(param_60, param_61, param_62, v_267);
                         }
-                        linewidth = as_type<float>(v_266.memory[di]);
-                        Alloc param_55 = cmd_alloc;
-                        CmdRef param_56 = cmd_ref;
-                        Tile param_57 = tile_1;
-                        float param_58 = linewidth;
-                        write_fill(param_55, param_56, param_57, param_58, v_266, v_266BufferSize);
-                        cmd_ref = param_56;
-                        cmd_rad.index = _1399.scene[dd_1];
-                        cmd_rad.mat = as_type<float4>(uint4(v_266.memory[di + 1u], v_266.memory[di + 2u], v_266.memory[di + 3u], v_266.memory[di + 4u]));
-                        cmd_rad.xlat = as_type<float2>(uint2(v_266.memory[di + 5u], v_266.memory[di + 6u]));
-                        cmd_rad.c1 = as_type<float2>(uint2(v_266.memory[di + 7u], v_266.memory[di + 8u]));
-                        cmd_rad.ra = as_type<float>(v_266.memory[di + 9u]);
-                        cmd_rad.roff = as_type<float>(v_266.memory[di + 10u]);
-                        Alloc param_59 = cmd_alloc;
-                        CmdRef param_60 = cmd_ref;
-                        CmdRadGrad param_61 = cmd_rad;
-                        Cmd_RadGrad_write(param_59, param_60, param_61, v_266, v_266BufferSize);
                         cmd_ref.offset += 48u;
                         break;
                     }
                     case 72u:
                     {
-                        linewidth = as_type<float>(v_266.memory[di]);
-                        Alloc param_62 = cmd_alloc;
-                        CmdRef param_63 = cmd_ref;
-                        uint param_64 = cmd_limit;
-                        bool _2087 = alloc_cmd(param_62, param_63, param_64, v_266, v_266BufferSize);
-                        cmd_alloc = param_62;
-                        cmd_ref = param_63;
-                        cmd_limit = param_64;
-                        if (!_2087)
-                        {
-                            break;
-                        }
-                        Alloc param_65 = cmd_alloc;
-                        CmdRef param_66 = cmd_ref;
-                        Tile param_67 = tile_1;
-                        float param_68 = linewidth;
-                        write_fill(param_65, param_66, param_67, param_68, v_266, v_266BufferSize);
-                        cmd_ref = param_66;
-                        uint index = _1399.scene[dd_1];
-                        uint raw1 = _1399.scene[dd_1 + 1u];
+                        Alloc param_63 = cmd_alloc;
+                        CmdRef param_64 = cmd_ref;
+                        uint param_65 = cmd_limit;
+                        alloc_cmd(param_63, param_64, param_65, mem_ok, v_267, v_891);
+                        cmd_alloc = param_63;
+                        cmd_ref = param_64;
+                        cmd_limit = param_65;
+                        linewidth = as_type<float>(v_267.memory[di]);
+                        Alloc param_66 = cmd_alloc;
+                        CmdRef param_67 = cmd_ref;
+                        Tile param_68 = tile_1;
+                        float param_69 = linewidth;
+                        write_fill(param_66, param_67, param_68, param_69, mem_ok, v_267);
+                        cmd_ref = param_67;
+                        uint index = _1390.scene[dd_1];
+                        uint raw1 = _1390.scene[dd_1 + 1u];
                         int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        Alloc param_69 = cmd_alloc;
-                        CmdRef param_70 = cmd_ref;
-                        CmdImage param_71 = CmdImage{ index, offset_1 };
-                        Cmd_Image_write(param_69, param_70, param_71, v_266, v_266BufferSize);
+                        if (mem_ok)
+                        {
+                            Alloc param_70 = cmd_alloc;
+                            CmdRef param_71 = cmd_ref;
+                            CmdImage param_72 = CmdImage{ index, offset_1 };
+                            Cmd_Image_write(param_70, param_71, param_72, v_267);
+                        }
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _2140 = tile_1.tile.offset == 0u;
-                        bool _2146;
-                        if (_2140)
+                        bool _2120 = tile_1.tile.offset == 0u;
+                        bool _2126;
+                        if (_2120)
                         {
-                            _2146 = tile_1.backdrop == 0;
+                            _2126 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _2146 = _2140;
+                            _2126 = _2120;
                         }
-                        if (_2146)
+                        if (_2126)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_72 = cmd_alloc;
-                            CmdRef param_73 = cmd_ref;
-                            uint param_74 = cmd_limit;
-                            bool _2158 = alloc_cmd(param_72, param_73, param_74, v_266, v_266BufferSize);
-                            cmd_alloc = param_72;
-                            cmd_ref = param_73;
-                            cmd_limit = param_74;
-                            if (!_2158)
+                            Alloc param_73 = cmd_alloc;
+                            CmdRef param_74 = cmd_ref;
+                            uint param_75 = cmd_limit;
+                            alloc_cmd(param_73, param_74, param_75, mem_ok, v_267, v_891);
+                            cmd_alloc = param_73;
+                            cmd_ref = param_74;
+                            cmd_limit = param_75;
+                            if (mem_ok)
                             {
-                                break;
+                                Alloc param_76 = cmd_alloc;
+                                CmdRef param_77 = cmd_ref;
+                                Cmd_BeginClip_write(param_76, param_77, v_267);
                             }
-                            Alloc param_75 = cmd_alloc;
-                            CmdRef param_76 = cmd_ref;
-                            Cmd_BeginClip_write(param_75, param_76, v_266, v_266BufferSize);
                             cmd_ref.offset += 4u;
                             render_blend_depth++;
                             max_blend_depth = max(max_blend_depth, render_blend_depth);
@@ -1179,28 +1180,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_77 = cmd_alloc;
-                        CmdRef param_78 = cmd_ref;
-                        uint param_79 = cmd_limit;
-                        bool _2191 = alloc_cmd(param_77, param_78, param_79, v_266, v_266BufferSize);
-                        cmd_alloc = param_77;
-                        cmd_ref = param_78;
-                        cmd_limit = param_79;
-                        if (!_2191)
+                        Alloc param_78 = cmd_alloc;
+                        CmdRef param_79 = cmd_ref;
+                        Tile param_80 = tile_1;
+                        float param_81 = -1.0;
+                        write_fill(param_78, param_79, param_80, param_81, mem_ok, v_267);
+                        cmd_ref = param_79;
+                        uint blend_1 = _1390.scene[dd_1];
+                        if (mem_ok)
                         {
-                            break;
+                            Alloc param_82 = cmd_alloc;
+                            CmdRef param_83 = cmd_ref;
+                            CmdEndClip param_84 = CmdEndClip{ blend_1 };
+                            Cmd_EndClip_write(param_82, param_83, param_84, v_267);
                         }
-                        Alloc param_80 = cmd_alloc;
-                        CmdRef param_81 = cmd_ref;
-                        Tile param_82 = tile_1;
-                        float param_83 = -1.0;
-                        write_fill(param_80, param_81, param_82, param_83, v_266, v_266BufferSize);
-                        cmd_ref = param_81;
-                        uint blend_1 = _1399.scene[dd_1];
-                        Alloc param_84 = cmd_alloc;
-                        CmdRef param_85 = cmd_ref;
-                        CmdEndClip param_86 = CmdEndClip{ blend_1 };
-                        Cmd_EndClip_write(param_84, param_85, param_86, v_266, v_266BufferSize);
                         cmd_ref.offset += 8u;
                         render_blend_depth--;
                         break;
@@ -1235,31 +1228,33 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             break;
         }
     }
-    bool _2263 = (bin_tile_x + tile_x) < _1020.conf.width_in_tiles;
-    bool _2272;
-    if (_2263)
+    bool _2231 = (bin_tile_x + tile_x) < v_891.conf.width_in_tiles;
+    bool _2240;
+    if (_2231)
     {
-        _2272 = (bin_tile_y + tile_y) < _1020.conf.height_in_tiles;
+        _2240 = (bin_tile_y + tile_y) < v_891.conf.height_in_tiles;
     }
     else
     {
-        _2272 = _2263;
+        _2240 = _2231;
     }
-    if (_2272)
+    if (_2240)
     {
-        Alloc param_87 = cmd_alloc;
-        CmdRef param_88 = cmd_ref;
-        Cmd_End_write(param_87, param_88, v_266, v_266BufferSize);
+        if (mem_ok)
+        {
+            Alloc param_85 = cmd_alloc;
+            CmdRef param_86 = cmd_ref;
+            Cmd_End_write(param_85, param_86, v_267);
+        }
         if (max_blend_depth > 4u)
         {
             uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u;
-            uint param_89 = scratch_size;
-            MallocResult _2293 = malloc(param_89, v_266, v_266BufferSize);
-            MallocResult scratch = _2293;
-            Alloc param_90 = scratch_alloc;
-            uint param_91 = scratch_alloc.offset;
-            Alloc param_92 = scratch.alloc;
-            alloc_write(param_90, param_91, param_92, v_266, v_266BufferSize);
+            uint _2264 = atomic_fetch_add_explicit((device atomic_uint*)&v_267.blend_offset, scratch_size, memory_order_relaxed);
+            uint scratch = _2264;
+            Alloc param_87 = scratch_alloc;
+            uint param_88 = scratch_alloc.offset >> uint(2);
+            uint param_89 = scratch;
+            write_mem(param_87, param_88, param_89, v_267);
         }
     }
 }
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index fe5eeee..2417cf8 100644
Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index 200f169..97b006a 100644
Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index 734d21e..789c9b3 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -13,6 +13,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -88,7 +89,7 @@ DrawMonoid draw_monoid_identity()
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _93.Load(100) >> uint(2);
+    uint drawtag_base = _93.Load(104) >> uint(2);
     uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0);
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
@@ -137,11 +138,11 @@ void comp_main()
         DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
         row = combine_draw_monoid(param_6, param_7);
     }
-    uint drawdata_base = _93.Load(104) >> uint(2);
-    uint drawinfo_base = _93.Load(68) >> uint(2);
+    uint drawdata_base = _93.Load(108) >> uint(2);
+    uint drawinfo_base = _93.Load(72) >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u);
-    uint clip_out_base = _93.Load(48) >> uint(2);
+    uint out_base = (_93.Load(48) >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _93.Load(52) >> uint(2);
     float4 mat;
     float2 translate;
     float2 p0;
@@ -155,31 +156,31 @@ void comp_main()
             DrawMonoid param_9 = local[i_2 - 1u];
             m = combine_draw_monoid(param_8, param_9);
         }
-        _285.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
-        _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
-        _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
-        _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
+        _285.Store((out_base + (i_2 * 4u)) * 4 + 12, m.path_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 12, m.clip_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 12, m.scene_offset);
+        _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 12, m.info_offset);
         uint dd = drawdata_base + (m.scene_offset >> uint(2));
         uint di = drawinfo_base + (m.info_offset >> uint(2));
         tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0);
         if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_285.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_93.Load(44) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_285.Load(bbox_offset * 4 + 12)) - 32768.0f;
+            float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 12)) - 32768.0f;
+            float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 12)) - 32768.0f;
+            float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 12)) - 32768.0f;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 12));
             uint fill_mode = uint(linewidth >= 0.0f);
             if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u))
             {
-                uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_93.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_285.Load(t * 4 + 8), _285.Load((t + 1u) * 4 + 8), _285.Load((t + 2u) * 4 + 8), _285.Load((t + 3u) * 4 + 8)));
+                uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 12);
+                uint t = (_93.Load(40) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_285.Load(t * 4 + 12), _285.Load((t + 1u) * 4 + 12), _285.Load((t + 2u) * 4 + 12), _285.Load((t + 3u) * 4 + 12)));
                 if ((tag_word == 276u) || (tag_word == 732u))
                 {
-                    translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 8), _285.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 12), _285.Load((t + 5u) * 4 + 12)));
                 }
             }
             if (linewidth >= 0.0f)
@@ -191,12 +192,12 @@ void comp_main()
                 case 68u:
                 case 72u:
                 {
-                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store(di * 4 + 12, asuint(linewidth));
                     break;
                 }
                 case 276u:
                 {
-                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store(di * 4 + 12, asuint(linewidth));
                     p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
                     p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
                     p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
@@ -206,9 +207,9 @@ void comp_main()
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
                     float line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    _285.Store((di + 1u) * 4 + 8, asuint(line_x));
-                    _285.Store((di + 2u) * 4 + 8, asuint(line_y));
-                    _285.Store((di + 3u) * 4 + 8, asuint(line_c));
+                    _285.Store((di + 1u) * 4 + 12, asuint(line_x));
+                    _285.Store((di + 2u) * 4 + 12, asuint(line_y));
+                    _285.Store((di + 3u) * 4 + 12, asuint(line_c));
                     break;
                 }
                 case 732u:
@@ -227,17 +228,17 @@ void comp_main()
                     float2 c1 = center1 * rainv;
                     float ra = rr * rainv;
                     float roff = rr - 1.0f;
-                    _285.Store(di * 4 + 8, asuint(linewidth));
-                    _285.Store((di + 1u) * 4 + 8, asuint(inv_mat.x));
-                    _285.Store((di + 2u) * 4 + 8, asuint(inv_mat.y));
-                    _285.Store((di + 3u) * 4 + 8, asuint(inv_mat.z));
-                    _285.Store((di + 4u) * 4 + 8, asuint(inv_mat.w));
-                    _285.Store((di + 5u) * 4 + 8, asuint(inv_tr.x));
-                    _285.Store((di + 6u) * 4 + 8, asuint(inv_tr.y));
-                    _285.Store((di + 7u) * 4 + 8, asuint(c1.x));
-                    _285.Store((di + 8u) * 4 + 8, asuint(c1.y));
-                    _285.Store((di + 9u) * 4 + 8, asuint(ra));
-                    _285.Store((di + 10u) * 4 + 8, asuint(roff));
+                    _285.Store(di * 4 + 12, asuint(linewidth));
+                    _285.Store((di + 1u) * 4 + 12, asuint(inv_mat.x));
+                    _285.Store((di + 2u) * 4 + 12, asuint(inv_mat.y));
+                    _285.Store((di + 3u) * 4 + 12, asuint(inv_mat.z));
+                    _285.Store((di + 4u) * 4 + 12, asuint(inv_mat.w));
+                    _285.Store((di + 5u) * 4 + 12, asuint(inv_tr.x));
+                    _285.Store((di + 6u) * 4 + 12, asuint(inv_tr.y));
+                    _285.Store((di + 7u) * 4 + 12, asuint(c1.x));
+                    _285.Store((di + 8u) * 4 + 12, asuint(c1.y));
+                    _285.Store((di + 9u) * 4 + 12, asuint(ra));
+                    _285.Store((di + 10u) * 4 + 12, asuint(roff));
                     break;
                 }
                 case 5u:
@@ -253,7 +254,7 @@ void comp_main()
             {
                 path_ix = m.path_ix;
             }
-            _285.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
+            _285.Store((clip_out_base + m.clip_ix) * 4 + 12, path_ix);
         }
     }
 }
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index c11e21b..2ec1911 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -59,6 +59,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -115,6 +116,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index 58dde43..f9feedc 100644
Binary files a/piet-gpu/shader/gen/draw_leaf.spv and b/piet-gpu/shader/gen/draw_leaf.spv differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
index be69aad..5516a9a 100644
Binary files a/piet-gpu/shader/gen/draw_reduce.dxil and b/piet-gpu/shader/gen/draw_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
index 8311155..1a8f2b1 100644
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -13,6 +13,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -81,7 +82,7 @@ DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _87.Load(100) >> uint(2);
+    uint drawtag_base = _87.Load(104) >> uint(2);
     uint tag_word = _97.Load((drawtag_base + ix) * 4 + 0);
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
index 759267c..b2510e3 100644
--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -20,6 +20,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -76,6 +77,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
index d6c6fb7..2992677 100644
Binary files a/piet-gpu/shader/gen/draw_reduce.spv and b/piet-gpu/shader/gen/draw_reduce.spv differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index c48d59d..33873b3 100644
Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index 0a6c022..2e1f937 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -130,6 +130,7 @@ struct TileSeg
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -163,9 +164,10 @@ static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
 ByteAddressBuffer _1681 : register(t1, space0);
-RWTexture2D<unorm float4> image_atlas : register(u3, space0);
-RWTexture2D<unorm float4> gradients : register(u4, space0);
-RWTexture2D<unorm float4> image : register(u2, space0);
+RWByteAddressBuffer _2506 : register(u2, space0);
+RWTexture2D<unorm float4> image_atlas : register(u4, space0);
+RWTexture2D<unorm float4> gradients : register(u5, space0);
+RWTexture2D<unorm float4> image : register(u3, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -206,7 +208,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _297.Load(offset * 4 + 8);
+    uint v = _297.Load(offset * 4 + 12);
     return v;
 }
 
@@ -989,9 +991,9 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(12)) + gl_WorkGroupID.x;
     Alloc _1696;
-    _1696.offset = _1681.Load(24);
+    _1696.offset = _1681.Load(28);
     Alloc param;
     param.offset = _1696.offset;
     uint param_1 = tile_ix * 1024u;
@@ -999,7 +1001,7 @@ void comp_main()
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef _1705 = { cmd_alloc.offset };
     CmdRef cmd_ref = _1705;
-    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 12);
     cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
@@ -1009,14 +1011,13 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _297.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
     uint blend_stack[4][8];
     uint base_ix_1;
     uint bg_rgba;
-    while (mem_ok)
+    while (true)
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
@@ -1036,13 +1037,13 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1810 = { stroke.tile_ref };
-                tile_seg_ref = _1810;
+                TileSegRef _1805 = { stroke.tile_ref };
+                tile_seg_ref = _1805;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
                     uint param_8 = 24u;
-                    bool param_9 = mem_ok;
+                    bool param_9 = true;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
                     TileSeg seg = TileSeg_read(param_10, param_11);
@@ -1073,13 +1074,13 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1930 = { fill.tile_ref };
-                tile_seg_ref = _1930;
+                TileSegRef _1924 = { fill.tile_ref };
+                tile_seg_ref = _1924;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
                     uint param_16 = 24u;
-                    bool param_17 = mem_ok;
+                    bool param_17 = true;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
                     TileSeg seg_1 = TileSeg_read(param_18, param_19);
@@ -1163,10 +1164,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2264 = fromsRGB(param_29);
-                    fg_rgba.x = _2264.x;
-                    fg_rgba.y = _2264.y;
-                    fg_rgba.z = _2264.z;
+                    float3 _2257 = fromsRGB(param_29);
+                    fg_rgba.x = _2257.x;
+                    fg_rgba.y = _2257.y;
+                    fg_rgba.z = _2257.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1189,10 +1190,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2374 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2374.x;
-                    fg_rgba_1.y = _2374.y;
-                    fg_rgba_1.z = _2374.z;
+                    float3 _2367 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2367.x;
+                    fg_rgba_1.y = _2367.y;
+                    fg_rgba_1.z = _2367.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1206,9 +1207,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2417[8];
-                fillImage(_2417, param_36, param_37);
-                float4 img[8] = _2417;
+                float4 _2410[8];
+                fillImage(_2410, param_36, param_37);
+                float4 img[8] = _2410;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1224,8 +1225,8 @@ void comp_main()
                     for (uint k_12 = 0u; k_12 < 8u; k_12++)
                     {
                         float4 param_38 = float4(rgba[k_12]);
-                        uint _2479 = packsRGB(param_38);
-                        blend_stack[clip_depth][k_12] = _2479;
+                        uint _2472 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2472;
                         rgba[k_12] = 0.0f.xxxx;
                     }
                 }
@@ -1235,8 +1236,8 @@ void comp_main()
                     for (uint k_13 = 0u; k_13 < 8u; k_13++)
                     {
                         float4 param_39 = float4(rgba[k_13]);
-                        uint _2522 = packsRGB(param_39);
-                        _297.Store((base_ix + k_13) * 4 + 8, _2522);
+                        uint _2519 = packsRGB(param_39);
+                        _2506.Store((base_ix + k_13) * 4 + 0, _2519);
                         rgba[k_13] = 0.0f.xxxx;
                     }
                 }
@@ -1262,7 +1263,7 @@ void comp_main()
                     }
                     else
                     {
-                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                        bg_rgba = _2506.Load((base_ix_1 + k_14) * 4 + 0);
                     }
                     uint param_42 = bg_rgba;
                     float4 bg = unpacksRGB(param_42);
@@ -1279,8 +1280,8 @@ void comp_main()
             {
                 Alloc param_46 = cmd_alloc;
                 CmdRef param_47 = cmd_ref;
-                CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2621;
+                CmdRef _2618 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2618;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index f60ea81..1cf8cb3 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -178,6 +178,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -188,6 +189,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -222,6 +224,11 @@ struct ConfigBuf
     Config conf;
 };
 
+struct BlendBuf
+{
+    uint blend_mem[1];
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u);
 
 static inline __attribute__((always_inline))
@@ -1047,7 +1054,7 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], device BlendBuf& _2506 [[buffer(2)]], texture2d<float, access::write> image [[texture(3)]], texture2d<float> image_atlas [[texture(4)]], texture2d<float> gradients [[texture(5)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
@@ -1066,14 +1073,13 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_297.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
     spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
     uint base_ix_1;
     uint bg_rgba;
-    while (mem_ok)
+    while (true)
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
@@ -1098,7 +1104,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                 {
                     uint param_7 = tile_seg_ref.offset;
                     uint param_8 = 24u;
-                    bool param_9 = mem_ok;
+                    bool param_9 = true;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
                     TileSeg seg = TileSeg_read(param_10, param_11, v_297);
@@ -1134,7 +1140,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                 {
                     uint param_15 = tile_seg_ref.offset;
                     uint param_16 = 24u;
-                    bool param_17 = mem_ok;
+                    bool param_17 = true;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
                     TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
@@ -1218,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2264 = fromsRGB(param_29);
-                    fg_rgba.x = _2264.x;
-                    fg_rgba.y = _2264.y;
-                    fg_rgba.z = _2264.z;
+                    float3 _2257 = fromsRGB(param_29);
+                    fg_rgba.x = _2257.x;
+                    fg_rgba.y = _2257.y;
+                    fg_rgba.z = _2257.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1244,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2374 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2374.x;
-                    fg_rgba_1.y = _2374.y;
-                    fg_rgba_1.z = _2374.z;
+                    float3 _2367 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2367.x;
+                    fg_rgba_1.y = _2367.y;
+                    fg_rgba_1.z = _2367.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1278,8 +1284,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     for (uint k_12 = 0u; k_12 < 8u; k_12++)
                     {
                         float4 param_38 = float4(rgba[k_12]);
-                        uint _2479 = packsRGB(param_38);
-                        blend_stack[clip_depth][k_12] = _2479;
+                        uint _2472 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2472;
                         rgba[k_12] = float4(0.0);
                     }
                 }
@@ -1289,8 +1295,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     for (uint k_13 = 0u; k_13 < 8u; k_13++)
                     {
                         float4 param_39 = float4(rgba[k_13]);
-                        uint _2522 = packsRGB(param_39);
-                        v_297.memory[base_ix + k_13] = _2522;
+                        uint _2519 = packsRGB(param_39);
+                        _2506.blend_mem[base_ix + k_13] = _2519;
                         rgba[k_13] = float4(0.0);
                     }
                 }
@@ -1316,7 +1322,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     }
                     else
                     {
-                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                        bg_rgba = _2506.blend_mem[base_ix_1 + k_14];
                     }
                     uint param_42 = bg_rgba;
                     float4 bg = unpacksRGB(param_42);
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index c388941..f9198c3 100644
Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 7390167..bacd925 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index ffada37..392d1f3 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -130,6 +130,7 @@ struct TileSeg
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -163,9 +164,10 @@ static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
 RWByteAddressBuffer _297 : register(u0, space0);
 ByteAddressBuffer _1681 : register(t1, space0);
-RWTexture2D<unorm float4> image_atlas : register(u3, space0);
-RWTexture2D<unorm float4> gradients : register(u4, space0);
-RWTexture2D<unorm float> image : register(u2, space0);
+RWByteAddressBuffer _2506 : register(u2, space0);
+RWTexture2D<unorm float4> image_atlas : register(u4, space0);
+RWTexture2D<unorm float4> gradients : register(u5, space0);
+RWTexture2D<unorm float> image : register(u3, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -206,7 +208,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _297.Load(offset * 4 + 8);
+    uint v = _297.Load(offset * 4 + 12);
     return v;
 }
 
@@ -989,9 +991,9 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1681.Load(12)) + gl_WorkGroupID.x;
     Alloc _1696;
-    _1696.offset = _1681.Load(24);
+    _1696.offset = _1681.Load(28);
     Alloc param;
     param.offset = _1696.offset;
     uint param_1 = tile_ix * 1024u;
@@ -999,7 +1001,7 @@ void comp_main()
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef _1705 = { cmd_alloc.offset };
     CmdRef cmd_ref = _1705;
-    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8);
+    uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 12);
     cmd_ref.offset += 4u;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
@@ -1009,14 +1011,13 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _297.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
     uint blend_stack[4][8];
     uint base_ix_1;
     uint bg_rgba;
-    while (mem_ok)
+    while (true)
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
@@ -1036,13 +1037,13 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1810 = { stroke.tile_ref };
-                tile_seg_ref = _1810;
+                TileSegRef _1805 = { stroke.tile_ref };
+                tile_seg_ref = _1805;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
                     uint param_8 = 24u;
-                    bool param_9 = mem_ok;
+                    bool param_9 = true;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
                     TileSeg seg = TileSeg_read(param_10, param_11);
@@ -1073,13 +1074,13 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1930 = { fill.tile_ref };
-                tile_seg_ref = _1930;
+                TileSegRef _1924 = { fill.tile_ref };
+                tile_seg_ref = _1924;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
                     uint param_16 = 24u;
-                    bool param_17 = mem_ok;
+                    bool param_17 = true;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
                     TileSeg seg_1 = TileSeg_read(param_18, param_19);
@@ -1163,10 +1164,10 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2264 = fromsRGB(param_29);
-                    fg_rgba.x = _2264.x;
-                    fg_rgba.y = _2264.y;
-                    fg_rgba.z = _2264.z;
+                    float3 _2257 = fromsRGB(param_29);
+                    fg_rgba.x = _2257.x;
+                    fg_rgba.y = _2257.y;
+                    fg_rgba.z = _2257.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
@@ -1189,10 +1190,10 @@ void comp_main()
                     int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2374 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2374.x;
-                    fg_rgba_1.y = _2374.y;
-                    fg_rgba_1.z = _2374.z;
+                    float3 _2367 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2367.x;
+                    fg_rgba_1.y = _2367.y;
+                    fg_rgba_1.z = _2367.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
@@ -1206,9 +1207,9 @@ void comp_main()
                 CmdImage fill_img = Cmd_Image_read(param_34, param_35);
                 uint2 param_36 = xy_uint;
                 CmdImage param_37 = fill_img;
-                float4 _2417[8];
-                fillImage(_2417, param_36, param_37);
-                float4 img[8] = _2417;
+                float4 _2410[8];
+                fillImage(_2410, param_36, param_37);
+                float4 img[8] = _2410;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
                 {
                     float4 fg_k_3 = img[k_11] * area[k_11];
@@ -1224,8 +1225,8 @@ void comp_main()
                     for (uint k_12 = 0u; k_12 < 8u; k_12++)
                     {
                         float4 param_38 = float4(rgba[k_12]);
-                        uint _2479 = packsRGB(param_38);
-                        blend_stack[clip_depth][k_12] = _2479;
+                        uint _2472 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2472;
                         rgba[k_12] = 0.0f.xxxx;
                     }
                 }
@@ -1235,8 +1236,8 @@ void comp_main()
                     for (uint k_13 = 0u; k_13 < 8u; k_13++)
                     {
                         float4 param_39 = float4(rgba[k_13]);
-                        uint _2522 = packsRGB(param_39);
-                        _297.Store((base_ix + k_13) * 4 + 8, _2522);
+                        uint _2519 = packsRGB(param_39);
+                        _2506.Store((base_ix + k_13) * 4 + 0, _2519);
                         rgba[k_13] = 0.0f.xxxx;
                     }
                 }
@@ -1262,7 +1263,7 @@ void comp_main()
                     }
                     else
                     {
-                        bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8);
+                        bg_rgba = _2506.Load((base_ix_1 + k_14) * 4 + 0);
                     }
                     uint param_42 = bg_rgba;
                     float4 bg = unpacksRGB(param_42);
@@ -1279,8 +1280,8 @@ void comp_main()
             {
                 Alloc param_46 = cmd_alloc;
                 CmdRef param_47 = cmd_ref;
-                CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2621;
+                CmdRef _2618 = { Cmd_Jump_read(param_46, param_47).new_ref };
+                cmd_ref = _2618;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index e174713..45e7a0e 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -178,6 +178,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -188,6 +189,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -222,6 +224,11 @@ struct ConfigBuf
     Config conf;
 };
 
+struct BlendBuf
+{
+    uint blend_mem[1];
+};
+
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u);
 
 static inline __attribute__((always_inline))
@@ -1047,7 +1054,7 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me
     return CmdJump_read(param, param_1, v_297);
 }
 
-kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], device BlendBuf& _2506 [[buffer(2)]], texture2d<float, access::write> image [[texture(3)]], texture2d<float> image_atlas [[texture(4)]], texture2d<float> gradients [[texture(5)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
@@ -1066,14 +1073,13 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_297.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
     spvUnsafeArray<spvUnsafeArray<uint, 8>, 4> blend_stack;
     uint base_ix_1;
     uint bg_rgba;
-    while (mem_ok)
+    while (true)
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
@@ -1098,7 +1104,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                 {
                     uint param_7 = tile_seg_ref.offset;
                     uint param_8 = 24u;
-                    bool param_9 = mem_ok;
+                    bool param_9 = true;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
                     TileSeg seg = TileSeg_read(param_10, param_11, v_297);
@@ -1134,7 +1140,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                 {
                     uint param_15 = tile_seg_ref.offset;
                     uint param_16 = 24u;
-                    bool param_17 = mem_ok;
+                    bool param_17 = true;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
                     TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297);
@@ -1218,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2264 = fromsRGB(param_29);
-                    fg_rgba.x = _2264.x;
-                    fg_rgba.y = _2264.y;
-                    fg_rgba.z = _2264.z;
+                    float3 _2257 = fromsRGB(param_29);
+                    fg_rgba.x = _2257.x;
+                    fg_rgba.y = _2257.y;
+                    fg_rgba.z = _2257.z;
                     float4 fg_k_1 = fg_rgba * area[k_9];
                     rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
@@ -1244,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
                     float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
                     float3 param_33 = fg_rgba_1.xyz;
-                    float3 _2374 = fromsRGB(param_33);
-                    fg_rgba_1.x = _2374.x;
-                    fg_rgba_1.y = _2374.y;
-                    fg_rgba_1.z = _2374.z;
+                    float3 _2367 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2367.x;
+                    fg_rgba_1.y = _2367.y;
+                    fg_rgba_1.z = _2367.z;
                     float4 fg_k_2 = fg_rgba_1 * area[k_10];
                     rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
@@ -1278,8 +1284,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     for (uint k_12 = 0u; k_12 < 8u; k_12++)
                     {
                         float4 param_38 = float4(rgba[k_12]);
-                        uint _2479 = packsRGB(param_38);
-                        blend_stack[clip_depth][k_12] = _2479;
+                        uint _2472 = packsRGB(param_38);
+                        blend_stack[clip_depth][k_12] = _2472;
                         rgba[k_12] = float4(0.0);
                     }
                 }
@@ -1289,8 +1295,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     for (uint k_13 = 0u; k_13 < 8u; k_13++)
                     {
                         float4 param_39 = float4(rgba[k_13]);
-                        uint _2522 = packsRGB(param_39);
-                        v_297.memory[base_ix + k_13] = _2522;
+                        uint _2519 = packsRGB(param_39);
+                        _2506.blend_mem[base_ix + k_13] = _2519;
                         rgba[k_13] = float4(0.0);
                     }
                 }
@@ -1316,7 +1322,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1
                     }
                     else
                     {
-                        bg_rgba = v_297.memory[base_ix_1 + k_14];
+                        bg_rgba = _2506.blend_mem[base_ix_1 + k_14];
                     }
                     uint param_42 = bg_rgba;
                     float4 bg = unpacksRGB(param_42);
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 17c7531..df86875 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ
diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil
index 9fd593c..2842f0d 100644
Binary files a/piet-gpu/shader/gen/path_coarse.dxil and b/piet-gpu/shader/gen/path_coarse.dxil differ
diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl
index 93ee8f0..106fdfc 100644
--- a/piet-gpu/shader/gen/path_coarse.hlsl
+++ b/piet-gpu/shader/gen/path_coarse.hlsl
@@ -3,12 +3,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
 struct PathCubicRef
 {
     uint offset;
@@ -74,6 +68,7 @@ struct SubdivResult
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -105,10 +100,10 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u);
 
-static const PathSegTag _721 = { 0u, 0u };
+static const PathSegTag _722 = { 0u, 0u };
 
-RWByteAddressBuffer _136 : register(u0, space0);
-ByteAddressBuffer _710 : register(t1, space0);
+RWByteAddressBuffer _143 : register(u0, space0);
+ByteAddressBuffer _711 : register(t1, space0);
 
 static uint3 gl_GlobalInvocationID;
 struct SPIRV_Cross_Input
@@ -116,6 +111,15 @@ struct SPIRV_Cross_Input
     uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
 };
 
+static bool mem_ok;
+
+bool check_deps(uint dep_stage)
+{
+    uint _149;
+    _143.InterlockedOr(4, 0u, _149);
+    return (_149 & dep_stage) == 0u;
+}
+
 bool touch_mem(Alloc alloc, uint offset)
 {
     return true;
@@ -129,7 +133,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _136.Load(offset * 4 + 8);
+    uint v = _143.Load(offset * 4 + 12);
     return v;
 }
 
@@ -138,8 +142,8 @@ PathSegTag PathSeg_tag(Alloc a, PathSegRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _367;
+    PathSegTag _362 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _362;
 }
 
 PathCubic PathCubic_read(Alloc a, PathCubicRef ref)
@@ -194,9 +198,9 @@ PathCubic PathCubic_read(Alloc a, PathCubicRef ref)
 
 PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref)
 {
-    PathCubicRef _373 = { ref.offset + 4u };
+    PathCubicRef _368 = { ref.offset + 4u };
     Alloc param = a;
-    PathCubicRef param_1 = _373;
+    PathCubicRef param_1 = _368;
     return PathCubic_read(param, param_1);
 }
 
@@ -240,8 +244,8 @@ SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol)
             val = (sqrt_tol * da) / approx_parabola_integral(param_2);
         }
     }
-    SubdivResult _695 = { val, a0, a2 };
-    return _695;
+    SubdivResult _690 = { val, a0, a2 };
+    return _690;
 }
 
 uint fill_mode_from_flags(uint flags)
@@ -263,12 +267,12 @@ Path Path_read(Alloc a, PathRef ref)
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _427 = { raw2 };
-    s.tiles = _427;
+    TileRef _422 = { raw2 };
+    s.tiles = _422;
     return s;
 }
 
-Alloc new_alloc(uint offset, uint size, bool mem_ok)
+Alloc new_alloc(uint offset, uint size, bool mem_ok_1)
 {
     Alloc a;
     a.offset = offset;
@@ -286,33 +290,24 @@ float2 eval_quad(float2 p0, float2 p1, float2 p2, float t)
     return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t);
 }
 
-MallocResult malloc(uint size)
+uint malloc_stage(uint size, uint mem_size, uint stage)
 {
-    uint _142;
-    _136.InterlockedAdd(0, size, _142);
-    uint offset = _142;
-    uint _149;
-    _136.GetDimensions(_149);
-    _149 = (_149 - 8) / 4;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int(_149) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _158;
+    _143.InterlockedAdd(0, size, _158);
+    uint offset = _158;
+    if ((offset + size) > mem_size)
     {
-        uint _171;
-        _136.InterlockedMax(4, 1u, _171);
-        return r;
+        uint _168;
+        _143.InterlockedOr(4, stage, _168);
+        offset = 0u;
     }
-    return r;
+    return offset;
 }
 
 TileRef Tile_index(TileRef ref, uint index)
 {
-    TileRef _385 = { ref.offset + (index * 8u) };
-    return _385;
+    TileRef _380 = { ref.offset + (index * 8u) };
+    return _380;
 }
 
 void write_mem(Alloc alloc, uint offset, uint val)
@@ -323,7 +318,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _136.Store(offset * 4 + 8, val);
+    _143.Store(offset * 4 + 12, val);
 }
 
 void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s)
@@ -357,30 +352,36 @@ void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s)
 
 void comp_main()
 {
-    uint element_ix = gl_GlobalInvocationID.x;
-    PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) };
-    PathSegRef ref = _718;
-    PathSegTag tag = _721;
-    if (element_ix < _710.Load(4))
+    mem_ok = true;
+    uint param = 7u;
+    bool _694 = check_deps(param);
+    if (!_694)
     {
-        Alloc _731;
-        _731.offset = _710.Load(28);
-        Alloc param;
-        param.offset = _731.offset;
-        PathSegRef param_1 = ref;
-        tag = PathSeg_tag(param, param_1);
+        return;
+    }
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathSegRef _719 = { _711.Load(32) + (element_ix * 52u) };
+    PathSegRef ref = _719;
+    PathSegTag tag = _722;
+    if (element_ix < _711.Load(8))
+    {
+        Alloc _732;
+        _732.offset = _711.Load(32);
+        Alloc param_1;
+        param_1.offset = _732.offset;
+        PathSegRef param_2 = ref;
+        tag = PathSeg_tag(param_1, param_2);
     }
-    bool mem_ok = _136.Load(4) == 0u;
     switch (tag.tag)
     {
         case 1u:
         {
-            Alloc _748;
-            _748.offset = _710.Load(28);
-            Alloc param_2;
-            param_2.offset = _748.offset;
-            PathSegRef param_3 = ref;
-            PathCubic cubic = PathSeg_Cubic_read(param_2, param_3);
+            Alloc _745;
+            _745.offset = _711.Load(32);
+            Alloc param_3;
+            param_3.offset = _745.offset;
+            PathSegRef param_4 = ref;
+            PathCubic cubic = PathSeg_Cubic_read(param_3, param_4);
             float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3;
             float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
             uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u);
@@ -392,43 +393,43 @@ void comp_main()
             for (uint i = 0u; i < n_quads; i++)
             {
                 float t = float(i + 1u) * _step;
-                float2 param_4 = cubic.p0;
-                float2 param_5 = cubic.p1;
-                float2 param_6 = cubic.p2;
-                float2 param_7 = cubic.p3;
-                float param_8 = t;
-                float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8);
-                float2 param_9 = cubic.p0;
-                float2 param_10 = cubic.p1;
-                float2 param_11 = cubic.p2;
-                float2 param_12 = cubic.p3;
-                float param_13 = t - (0.5f * _step);
-                float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13);
+                float2 param_5 = cubic.p0;
+                float2 param_6 = cubic.p1;
+                float2 param_7 = cubic.p2;
+                float2 param_8 = cubic.p3;
+                float param_9 = t;
+                float2 qp2 = eval_cubic(param_5, param_6, param_7, param_8, param_9);
+                float2 param_10 = cubic.p0;
+                float2 param_11 = cubic.p1;
+                float2 param_12 = cubic.p2;
+                float2 param_13 = cubic.p3;
+                float param_14 = t - (0.5f * _step);
+                float2 qp1 = eval_cubic(param_10, param_11, param_12, param_13, param_14);
                 qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f);
-                float2 param_14 = qp0;
-                float2 param_15 = qp1;
-                float2 param_16 = qp2;
-                float param_17 = 0.4743416607379913330078125f;
-                SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17);
+                float2 param_15 = qp0;
+                float2 param_16 = qp1;
+                float2 param_17 = qp2;
+                float param_18 = 0.4743416607379913330078125f;
+                SubdivResult params = estimate_subdiv(param_15, param_16, param_17, param_18);
                 keep_params[i] = params;
                 val += params.val;
                 qp0 = qp2;
             }
             uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u);
-            uint param_18 = tag.flags;
-            bool is_stroke = fill_mode_from_flags(param_18) == 1u;
+            uint param_19 = tag.flags;
+            bool is_stroke = fill_mode_from_flags(param_19) == 1u;
             uint path_ix = cubic.path_ix;
-            PathRef _904 = { _710.Load(16) + (path_ix * 12u) };
-            Alloc _907;
-            _907.offset = _710.Load(16);
-            Alloc param_19;
-            param_19.offset = _907.offset;
-            PathRef param_20 = _904;
-            Path path = Path_read(param_19, param_20);
-            uint param_21 = path.tiles.offset;
-            uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-            bool param_23 = mem_ok;
-            Alloc path_alloc = new_alloc(param_21, param_22, param_23);
+            PathRef _901 = { _711.Load(20) + (path_ix * 12u) };
+            Alloc _904;
+            _904.offset = _711.Load(20);
+            Alloc param_20;
+            param_20.offset = _904.offset;
+            PathRef param_21 = _901;
+            Path path = Path_read(param_20, param_21);
+            uint param_22 = path.tiles.offset;
+            uint param_23 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_24 = true;
+            Alloc path_alloc = new_alloc(param_22, param_23, param_24);
             int4 bbox = int4(path.bbox);
             float2 p0 = cubic.p0;
             qp0 = cubic.p0;
@@ -436,44 +437,44 @@ void comp_main()
             int n_out = 1;
             float val_sum = 0.0f;
             float2 p1;
-            float _1147;
+            float _1143;
             TileSeg tile_seg;
             for (uint i_1 = 0u; i_1 < n_quads; i_1++)
             {
                 float t_1 = float(i_1 + 1u) * _step;
-                float2 param_24 = cubic.p0;
-                float2 param_25 = cubic.p1;
-                float2 param_26 = cubic.p2;
-                float2 param_27 = cubic.p3;
-                float param_28 = t_1;
-                float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28);
-                float2 param_29 = cubic.p0;
-                float2 param_30 = cubic.p1;
-                float2 param_31 = cubic.p2;
-                float2 param_32 = cubic.p3;
-                float param_33 = t_1 - (0.5f * _step);
-                float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33);
+                float2 param_25 = cubic.p0;
+                float2 param_26 = cubic.p1;
+                float2 param_27 = cubic.p2;
+                float2 param_28 = cubic.p3;
+                float param_29 = t_1;
+                float2 qp2_1 = eval_cubic(param_25, param_26, param_27, param_28, param_29);
+                float2 param_30 = cubic.p0;
+                float2 param_31 = cubic.p1;
+                float2 param_32 = cubic.p2;
+                float2 param_33 = cubic.p3;
+                float param_34 = t_1 - (0.5f * _step);
+                float2 qp1_1 = eval_cubic(param_30, param_31, param_32, param_33, param_34);
                 qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f);
                 SubdivResult params_1 = keep_params[i_1];
-                float param_34 = params_1.a0;
-                float u0 = approx_parabola_inv_integral(param_34);
-                float param_35 = params_1.a2;
-                float u2 = approx_parabola_inv_integral(param_35);
+                float param_35 = params_1.a0;
+                float u0 = approx_parabola_inv_integral(param_35);
+                float param_36 = params_1.a2;
+                float u2 = approx_parabola_inv_integral(param_36);
                 float uscale = 1.0f / (u2 - u0);
                 float target = float(n_out) * v_step;
                 for (;;)
                 {
-                    bool _1040 = uint(n_out) == n;
-                    bool _1050;
-                    if (!_1040)
+                    bool _1036 = uint(n_out) == n;
+                    bool _1046;
+                    if (!_1036)
                     {
-                        _1050 = target < (val_sum + params_1.val);
+                        _1046 = target < (val_sum + params_1.val);
                     }
                     else
                     {
-                        _1050 = _1040;
+                        _1046 = _1036;
                     }
-                    if (_1050)
+                    if (_1046)
                     {
                         if (uint(n_out) == n)
                         {
@@ -483,14 +484,14 @@ void comp_main()
                         {
                             float u = (target - val_sum) / params_1.val;
                             float a = lerp(params_1.a0, params_1.a2, u);
-                            float param_36 = a;
-                            float au = approx_parabola_inv_integral(param_36);
+                            float param_37 = a;
+                            float au = approx_parabola_inv_integral(param_37);
                             float t_2 = (au - u0) * uscale;
-                            float2 param_37 = qp0;
-                            float2 param_38 = qp1_1;
-                            float2 param_39 = qp2_1;
-                            float param_40 = t_2;
-                            p1 = eval_quad(param_37, param_38, param_39, param_40);
+                            float2 param_38 = qp0;
+                            float2 param_39 = qp1_1;
+                            float2 param_40 = qp2_1;
+                            float param_41 = t_2;
+                            p1 = eval_quad(param_38, param_39, param_40, param_41);
                         }
                         float xmin = min(p0.x, p1.x) - cubic.stroke.x;
                         float xmax = max(p0.x, p1.x) + cubic.stroke.x;
@@ -500,13 +501,13 @@ void comp_main()
                         float dy = p1.y - p0.y;
                         if (abs(dy) < 9.999999717180685365747194737196e-10f)
                         {
-                            _1147 = 1000000000.0f;
+                            _1143 = 1000000000.0f;
                         }
                         else
                         {
-                            _1147 = dx / dy;
+                            _1143 = dx / dy;
                         }
-                        float invslope = _1147;
+                        float invslope = _1143;
                         float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f;
                         float b = invslope;
                         float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f;
@@ -522,14 +523,20 @@ void comp_main()
                         int stride = bbox.z - bbox.x;
                         int base = ((y0 - bbox.y) * stride) - bbox.x;
                         uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
-                        uint param_41 = n_tile_alloc * 24u;
-                        MallocResult _1263 = malloc(param_41);
-                        MallocResult tile_alloc = _1263;
-                        if (tile_alloc.failed || (!mem_ok))
+                        uint malloc_size = n_tile_alloc * 24u;
+                        uint param_42 = malloc_size;
+                        uint param_43 = _711.Load(0);
+                        uint param_44 = 4u;
+                        uint _1265 = malloc_stage(param_42, param_43, param_44);
+                        uint tile_offset = _1265;
+                        if (tile_offset == 0u)
                         {
-                            return;
+                            mem_ok = false;
                         }
-                        uint tile_offset = tile_alloc.alloc.offset;
+                        uint param_45 = tile_offset;
+                        uint param_46 = malloc_size;
+                        bool param_47 = true;
+                        Alloc tile_alloc = new_alloc(param_45, param_46, param_47);
                         int xray = int(floor(p0.x * 0.0625f));
                         int last_xray = int(floor(p1.x * 0.0625f));
                         if (p0.y > p1.y)
@@ -542,39 +549,34 @@ void comp_main()
                         {
                             float tile_y0 = float(y * 16);
                             int xbackdrop = max((xray + 1), bbox.x);
-                            bool _1319 = !is_stroke;
-                            bool _1329;
-                            if (_1319)
+                            bool _1322 = !is_stroke;
+                            bool _1332;
+                            if (_1322)
                             {
-                                _1329 = min(p0.y, p1.y) < tile_y0;
+                                _1332 = min(p0.y, p1.y) < tile_y0;
                             }
                             else
                             {
-                                _1329 = _1319;
+                                _1332 = _1322;
                             }
-                            bool _1336;
-                            if (_1329)
+                            bool _1339;
+                            if (_1332)
                             {
-                                _1336 = xbackdrop < bbox.z;
+                                _1339 = xbackdrop < bbox.z;
                             }
                             else
                             {
-                                _1336 = _1329;
+                                _1339 = _1332;
                             }
-                            if (_1336)
+                            if (_1339)
                             {
                                 int backdrop = (p1.y < p0.y) ? 1 : (-1);
-                                TileRef param_42 = path.tiles;
-                                uint param_43 = uint(base + xbackdrop);
-                                TileRef tile_ref = Tile_index(param_42, param_43);
+                                TileRef param_48 = path.tiles;
+                                uint param_49 = uint(base + xbackdrop);
+                                TileRef tile_ref = Tile_index(param_48, param_49);
                                 uint tile_el = tile_ref.offset >> uint(2);
-                                Alloc param_44 = path_alloc;
-                                uint param_45 = tile_el + 1u;
-                                if (touch_mem(param_44, param_45))
-                                {
-                                    uint _1374;
-                                    _136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374);
-                                }
+                                uint _1369;
+                                _143.InterlockedAdd((tile_el + 1u) * 4 + 12, uint(backdrop), _1369);
                             }
                             int next_xray = last_xray;
                             if (y < (y1 - 1))
@@ -592,20 +594,15 @@ void comp_main()
                             for (int x = xx0; x < xx1; x++)
                             {
                                 float tile_x0 = float(x * 16);
-                                TileRef _1454 = { path.tiles.offset };
-                                TileRef param_46 = _1454;
-                                uint param_47 = uint(base + x);
-                                TileRef tile_ref_1 = Tile_index(param_46, param_47);
+                                TileRef _1449 = { path.tiles.offset };
+                                TileRef param_50 = _1449;
+                                uint param_51 = uint(base + x);
+                                TileRef tile_ref_1 = Tile_index(param_50, param_51);
                                 uint tile_el_1 = tile_ref_1.offset >> uint(2);
                                 uint old = 0u;
-                                Alloc param_48 = path_alloc;
-                                uint param_49 = tile_el_1;
-                                if (touch_mem(param_48, param_49))
-                                {
-                                    uint _1477;
-                                    _136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477);
-                                    old = _1477;
-                                }
+                                uint _1465;
+                                _143.InterlockedExchange(tile_el_1 * 4 + 12, tile_offset, _1465);
+                                old = _1465;
                                 tile_seg.origin = p0;
                                 tile_seg._vector = p1 - p0;
                                 float y_edge = 0.0f;
@@ -636,11 +633,14 @@ void comp_main()
                                 }
                                 tile_seg.y_edge = y_edge;
                                 tile_seg.next.offset = old;
-                                TileSegRef _1559 = { tile_offset };
-                                Alloc param_50 = tile_alloc.alloc;
-                                TileSegRef param_51 = _1559;
-                                TileSeg param_52 = tile_seg;
-                                TileSeg_write(param_50, param_51, param_52);
+                                if (mem_ok)
+                                {
+                                    TileSegRef _1550 = { tile_offset };
+                                    Alloc param_52 = tile_alloc;
+                                    TileSegRef param_53 = _1550;
+                                    TileSeg param_54 = tile_seg;
+                                    TileSeg_write(param_52, param_53, param_54);
+                                }
                                 tile_offset += 24u;
                             }
                             xc += b;
diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl
index 26aa33a..4f59b3f 100644
--- a/piet-gpu/shader/gen/path_coarse.msl
+++ b/piet-gpu/shader/gen/path_coarse.msl
@@ -51,12 +51,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
 struct PathCubicRef
 {
     uint offset;
@@ -124,6 +118,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -134,6 +129,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -170,6 +166,13 @@ struct ConfigBuf
 
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u);
 
+static inline __attribute__((always_inline))
+bool check_deps(thread const uint& dep_stage, device Memory& v_143)
+{
+    uint _149 = atomic_fetch_or_explicit((device atomic_uint*)&v_143.mem_error, 0u, memory_order_relaxed);
+    return (_149 & dep_stage) == 0u;
+}
+
 static inline __attribute__((always_inline))
 bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 {
@@ -177,7 +180,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_143)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -185,59 +188,59 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_136.memory[offset];
+    uint v = v_143.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_143)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize);
+    uint tag_and_flags = read_mem(param, param_1, v_143);
     return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_143)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize);
+    uint raw0 = read_mem(param, param_1, v_143);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_143);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_143);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize);
+    uint raw3 = read_mem(param_6, param_7, v_143);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize);
+    uint raw4 = read_mem(param_8, param_9, v_143);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize);
+    uint raw5 = read_mem(param_10, param_11, v_143);
     Alloc param_12 = a;
     uint param_13 = ix + 6u;
-    uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize);
+    uint raw6 = read_mem(param_12, param_13, v_143);
     Alloc param_14 = a;
     uint param_15 = ix + 7u;
-    uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize);
+    uint raw7 = read_mem(param_14, param_15, v_143);
     Alloc param_16 = a;
     uint param_17 = ix + 8u;
-    uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize);
+    uint raw8 = read_mem(param_16, param_17, v_143);
     Alloc param_18 = a;
     uint param_19 = ix + 9u;
-    uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize);
+    uint raw9 = read_mem(param_18, param_19, v_143);
     Alloc param_20 = a;
     uint param_21 = ix + 10u;
-    uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize);
+    uint raw10 = read_mem(param_20, param_21, v_143);
     Alloc param_22 = a;
     uint param_23 = ix + 11u;
-    uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize);
+    uint raw11 = read_mem(param_22, param_23, v_143);
     PathCubic s;
     s.p0 = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.p1 = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -250,11 +253,11 @@ PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref,
 }
 
 static inline __attribute__((always_inline))
-PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_143)
 {
     Alloc param = a;
     PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u };
-    return PathCubic_read(param, param_1, v_136, v_136BufferSize);
+    return PathCubic_read(param, param_1, v_143);
 }
 
 static inline __attribute__((always_inline))
@@ -310,18 +313,18 @@ uint fill_mode_from_flags(thread const uint& flags)
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_143)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize);
+    uint raw0 = read_mem(param, param_1, v_143);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_143);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_143);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -350,22 +353,16 @@ float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize)
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_143)
 {
-    uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed);
-    uint offset = _142;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _158 = atomic_fetch_add_explicit((device atomic_uint*)&v_143.mem_offset, size, memory_order_relaxed);
+    uint offset = _158;
+    if ((offset + size) > mem_size)
     {
-        uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed);
-        return r;
+        uint _168 = atomic_fetch_or_explicit((device atomic_uint*)&v_143.mem_error, stage, memory_order_relaxed);
+        offset = 0u;
     }
-    return r;
+    return offset;
 }
 
 static inline __attribute__((always_inline))
@@ -375,7 +372,7 @@ TileRef Tile_index(thread const TileRef& ref, thread const uint& index)
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_143)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -383,61 +380,66 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_136.memory[offset] = val;
+    v_143.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize)
+void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_143)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = as_type<uint>(s.origin.x);
-    write_mem(param, param_1, param_2, v_136, v_136BufferSize);
+    write_mem(param, param_1, param_2, v_143);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.origin.y);
-    write_mem(param_3, param_4, param_5, v_136, v_136BufferSize);
+    write_mem(param_3, param_4, param_5, v_143);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.vector.x);
-    write_mem(param_6, param_7, param_8, v_136, v_136BufferSize);
+    write_mem(param_6, param_7, param_8, v_143);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.vector.y);
-    write_mem(param_9, param_10, param_11, v_136, v_136BufferSize);
+    write_mem(param_9, param_10, param_11, v_143);
     Alloc param_12 = a;
     uint param_13 = ix + 4u;
     uint param_14 = as_type<uint>(s.y_edge);
-    write_mem(param_12, param_13, param_14, v_136, v_136BufferSize);
+    write_mem(param_12, param_13, param_14, v_143);
     Alloc param_15 = a;
     uint param_16 = ix + 5u;
     uint param_17 = s.next.offset;
-    write_mem(param_15, param_16, param_17, v_136, v_136BufferSize);
+    write_mem(param_15, param_16, param_17, v_143);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(device Memory& v_143 [[buffer(0)]], const device ConfigBuf& _711 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
-    constant uint& v_136BufferSize = spvBufferSizeConstants[0];
-    uint element_ix = gl_GlobalInvocationID.x;
-    PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) };
-    PathSegTag tag = PathSegTag{ 0u, 0u };
-    if (element_ix < _710.conf.n_pathseg)
+    bool mem_ok = true;
+    uint param = 7u;
+    bool _694 = check_deps(param, v_143);
+    if (!_694)
     {
-        Alloc param;
-        param.offset = _710.conf.pathseg_alloc.offset;
-        PathSegRef param_1 = ref;
-        tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize);
+        return;
+    }
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathSegRef ref = PathSegRef{ _711.conf.pathseg_alloc.offset + (element_ix * 52u) };
+    PathSegTag tag = PathSegTag{ 0u, 0u };
+    if (element_ix < _711.conf.n_pathseg)
+    {
+        Alloc param_1;
+        param_1.offset = _711.conf.pathseg_alloc.offset;
+        PathSegRef param_2 = ref;
+        tag = PathSeg_tag(param_1, param_2, v_143);
     }
-    bool mem_ok = v_136.mem_error == 0u;
     switch (tag.tag)
     {
         case 1u:
         {
-            Alloc param_2;
-            param_2.offset = _710.conf.pathseg_alloc.offset;
-            PathSegRef param_3 = ref;
-            PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize);
+            Alloc param_3;
+            param_3.offset = _711.conf.pathseg_alloc.offset;
+            PathSegRef param_4 = ref;
+            PathCubic cubic = PathSeg_Cubic_read(param_3, param_4, v_143);
             float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3;
             float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
             uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u);
@@ -449,40 +451,40 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             for (uint i = 0u; i < n_quads; i++)
             {
                 float t = float(i + 1u) * _step;
-                float2 param_4 = cubic.p0;
-                float2 param_5 = cubic.p1;
-                float2 param_6 = cubic.p2;
-                float2 param_7 = cubic.p3;
-                float param_8 = t;
-                float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8);
-                float2 param_9 = cubic.p0;
-                float2 param_10 = cubic.p1;
-                float2 param_11 = cubic.p2;
-                float2 param_12 = cubic.p3;
-                float param_13 = t - (0.5 * _step);
-                float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13);
+                float2 param_5 = cubic.p0;
+                float2 param_6 = cubic.p1;
+                float2 param_7 = cubic.p2;
+                float2 param_8 = cubic.p3;
+                float param_9 = t;
+                float2 qp2 = eval_cubic(param_5, param_6, param_7, param_8, param_9);
+                float2 param_10 = cubic.p0;
+                float2 param_11 = cubic.p1;
+                float2 param_12 = cubic.p2;
+                float2 param_13 = cubic.p3;
+                float param_14 = t - (0.5 * _step);
+                float2 qp1 = eval_cubic(param_10, param_11, param_12, param_13, param_14);
                 qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5);
-                float2 param_14 = qp0;
-                float2 param_15 = qp1;
-                float2 param_16 = qp2;
-                float param_17 = 0.4743416607379913330078125;
-                SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17);
+                float2 param_15 = qp0;
+                float2 param_16 = qp1;
+                float2 param_17 = qp2;
+                float param_18 = 0.4743416607379913330078125;
+                SubdivResult params = estimate_subdiv(param_15, param_16, param_17, param_18);
                 keep_params[i] = params;
                 val += params.val;
                 qp0 = qp2;
             }
             uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u);
-            uint param_18 = tag.flags;
-            bool is_stroke = fill_mode_from_flags(param_18) == 1u;
+            uint param_19 = tag.flags;
+            bool is_stroke = fill_mode_from_flags(param_19) == 1u;
             uint path_ix = cubic.path_ix;
-            Alloc param_19;
-            param_19.offset = _710.conf.tile_alloc.offset;
-            PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) };
-            Path path = Path_read(param_19, param_20, v_136, v_136BufferSize);
-            uint param_21 = path.tiles.offset;
-            uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
-            bool param_23 = mem_ok;
-            Alloc path_alloc = new_alloc(param_21, param_22, param_23);
+            Alloc param_20;
+            param_20.offset = _711.conf.tile_alloc.offset;
+            PathRef param_21 = PathRef{ _711.conf.tile_alloc.offset + (path_ix * 12u) };
+            Path path = Path_read(param_20, param_21, v_143);
+            uint param_22 = path.tiles.offset;
+            uint param_23 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
+            bool param_24 = true;
+            Alloc path_alloc = new_alloc(param_22, param_23, param_24);
             int4 bbox = int4(path.bbox);
             float2 p0 = cubic.p0;
             qp0 = cubic.p0;
@@ -490,44 +492,44 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             int n_out = 1;
             float val_sum = 0.0;
             float2 p1;
-            float _1147;
+            float _1143;
             TileSeg tile_seg;
             for (uint i_1 = 0u; i_1 < n_quads; i_1++)
             {
                 float t_1 = float(i_1 + 1u) * _step;
-                float2 param_24 = cubic.p0;
-                float2 param_25 = cubic.p1;
-                float2 param_26 = cubic.p2;
-                float2 param_27 = cubic.p3;
-                float param_28 = t_1;
-                float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28);
-                float2 param_29 = cubic.p0;
-                float2 param_30 = cubic.p1;
-                float2 param_31 = cubic.p2;
-                float2 param_32 = cubic.p3;
-                float param_33 = t_1 - (0.5 * _step);
-                float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33);
+                float2 param_25 = cubic.p0;
+                float2 param_26 = cubic.p1;
+                float2 param_27 = cubic.p2;
+                float2 param_28 = cubic.p3;
+                float param_29 = t_1;
+                float2 qp2_1 = eval_cubic(param_25, param_26, param_27, param_28, param_29);
+                float2 param_30 = cubic.p0;
+                float2 param_31 = cubic.p1;
+                float2 param_32 = cubic.p2;
+                float2 param_33 = cubic.p3;
+                float param_34 = t_1 - (0.5 * _step);
+                float2 qp1_1 = eval_cubic(param_30, param_31, param_32, param_33, param_34);
                 qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5);
                 SubdivResult params_1 = keep_params[i_1];
-                float param_34 = params_1.a0;
-                float u0 = approx_parabola_inv_integral(param_34);
-                float param_35 = params_1.a2;
-                float u2 = approx_parabola_inv_integral(param_35);
+                float param_35 = params_1.a0;
+                float u0 = approx_parabola_inv_integral(param_35);
+                float param_36 = params_1.a2;
+                float u2 = approx_parabola_inv_integral(param_36);
                 float uscale = 1.0 / (u2 - u0);
                 float target = float(n_out) * v_step;
                 for (;;)
                 {
-                    bool _1040 = uint(n_out) == n;
-                    bool _1050;
-                    if (!_1040)
+                    bool _1036 = uint(n_out) == n;
+                    bool _1046;
+                    if (!_1036)
                     {
-                        _1050 = target < (val_sum + params_1.val);
+                        _1046 = target < (val_sum + params_1.val);
                     }
                     else
                     {
-                        _1050 = _1040;
+                        _1046 = _1036;
                     }
-                    if (_1050)
+                    if (_1046)
                     {
                         if (uint(n_out) == n)
                         {
@@ -537,14 +539,14 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         {
                             float u = (target - val_sum) / params_1.val;
                             float a = mix(params_1.a0, params_1.a2, u);
-                            float param_36 = a;
-                            float au = approx_parabola_inv_integral(param_36);
+                            float param_37 = a;
+                            float au = approx_parabola_inv_integral(param_37);
                             float t_2 = (au - u0) * uscale;
-                            float2 param_37 = qp0;
-                            float2 param_38 = qp1_1;
-                            float2 param_39 = qp2_1;
-                            float param_40 = t_2;
-                            p1 = eval_quad(param_37, param_38, param_39, param_40);
+                            float2 param_38 = qp0;
+                            float2 param_39 = qp1_1;
+                            float2 param_40 = qp2_1;
+                            float param_41 = t_2;
+                            p1 = eval_quad(param_38, param_39, param_40, param_41);
                         }
                         float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x;
                         float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x;
@@ -554,13 +556,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         float dy = p1.y - p0.y;
                         if (abs(dy) < 9.999999717180685365747194737196e-10)
                         {
-                            _1147 = 1000000000.0;
+                            _1143 = 1000000000.0;
                         }
                         else
                         {
-                            _1147 = dx / dy;
+                            _1143 = dx / dy;
                         }
-                        float invslope = _1147;
+                        float invslope = _1143;
                         float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625;
                         float b = invslope;
                         float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625;
@@ -576,14 +578,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         int stride = bbox.z - bbox.x;
                         int base = ((y0 - bbox.y) * stride) - bbox.x;
                         uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
-                        uint param_41 = n_tile_alloc * 24u;
-                        MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize);
-                        MallocResult tile_alloc = _1263;
-                        if (tile_alloc.failed || (!mem_ok))
+                        uint malloc_size = n_tile_alloc * 24u;
+                        uint param_42 = malloc_size;
+                        uint param_43 = _711.conf.mem_size;
+                        uint param_44 = 4u;
+                        uint _1265 = malloc_stage(param_42, param_43, param_44, v_143);
+                        uint tile_offset = _1265;
+                        if (tile_offset == 0u)
                         {
-                            return;
+                            mem_ok = false;
                         }
-                        uint tile_offset = tile_alloc.alloc.offset;
+                        uint param_45 = tile_offset;
+                        uint param_46 = malloc_size;
+                        bool param_47 = true;
+                        Alloc tile_alloc = new_alloc(param_45, param_46, param_47);
                         int xray = int(floor(p0.x * 0.0625));
                         int last_xray = int(floor(p1.x * 0.0625));
                         if (p0.y > p1.y)
@@ -596,38 +604,33 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         {
                             float tile_y0 = float(y * 16);
                             int xbackdrop = max((xray + 1), bbox.x);
-                            bool _1319 = !is_stroke;
-                            bool _1329;
-                            if (_1319)
+                            bool _1322 = !is_stroke;
+                            bool _1332;
+                            if (_1322)
                             {
-                                _1329 = fast::min(p0.y, p1.y) < tile_y0;
+                                _1332 = fast::min(p0.y, p1.y) < tile_y0;
                             }
                             else
                             {
-                                _1329 = _1319;
+                                _1332 = _1322;
                             }
-                            bool _1336;
-                            if (_1329)
+                            bool _1339;
+                            if (_1332)
                             {
-                                _1336 = xbackdrop < bbox.z;
+                                _1339 = xbackdrop < bbox.z;
                             }
                             else
                             {
-                                _1336 = _1329;
+                                _1339 = _1332;
                             }
-                            if (_1336)
+                            if (_1339)
                             {
                                 int backdrop = (p1.y < p0.y) ? 1 : (-1);
-                                TileRef param_42 = path.tiles;
-                                uint param_43 = uint(base + xbackdrop);
-                                TileRef tile_ref = Tile_index(param_42, param_43);
+                                TileRef param_48 = path.tiles;
+                                uint param_49 = uint(base + xbackdrop);
+                                TileRef tile_ref = Tile_index(param_48, param_49);
                                 uint tile_el = tile_ref.offset >> uint(2);
-                                Alloc param_44 = path_alloc;
-                                uint param_45 = tile_el + 1u;
-                                if (touch_mem(param_44, param_45))
-                                {
-                                    uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed);
-                                }
+                                uint _1369 = atomic_fetch_add_explicit((device atomic_uint*)&v_143.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed);
                             }
                             int next_xray = last_xray;
                             if (y < (y1 - 1))
@@ -645,18 +648,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                             for (int x = xx0; x < xx1; x++)
                             {
                                 float tile_x0 = float(x * 16);
-                                TileRef param_46 = TileRef{ path.tiles.offset };
-                                uint param_47 = uint(base + x);
-                                TileRef tile_ref_1 = Tile_index(param_46, param_47);
+                                TileRef param_50 = TileRef{ path.tiles.offset };
+                                uint param_51 = uint(base + x);
+                                TileRef tile_ref_1 = Tile_index(param_50, param_51);
                                 uint tile_el_1 = tile_ref_1.offset >> uint(2);
                                 uint old = 0u;
-                                Alloc param_48 = path_alloc;
-                                uint param_49 = tile_el_1;
-                                if (touch_mem(param_48, param_49))
-                                {
-                                    uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed);
-                                    old = _1477;
-                                }
+                                uint _1465 = atomic_exchange_explicit((device atomic_uint*)&v_143.memory[tile_el_1], tile_offset, memory_order_relaxed);
+                                old = _1465;
                                 tile_seg.origin = p0;
                                 tile_seg.vector = p1 - p0;
                                 float y_edge = 0.0;
@@ -687,10 +685,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                                 }
                                 tile_seg.y_edge = y_edge;
                                 tile_seg.next.offset = old;
-                                Alloc param_50 = tile_alloc.alloc;
-                                TileSegRef param_51 = TileSegRef{ tile_offset };
-                                TileSeg param_52 = tile_seg;
-                                TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize);
+                                if (mem_ok)
+                                {
+                                    Alloc param_52 = tile_alloc;
+                                    TileSegRef param_53 = TileSegRef{ tile_offset };
+                                    TileSeg param_54 = tile_seg;
+                                    TileSeg_write(param_52, param_53, param_54, v_143);
+                                }
                                 tile_offset += 24u;
                             }
                             xc += b;
diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv
index 5e6beda..bd32fc2 100644
Binary files a/piet-gpu/shader/gen/path_coarse.spv and b/piet-gpu/shader/gen/path_coarse.spv differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 6130712..c498755 100644
Binary files a/piet-gpu/shader/gen/pathseg.dxil and b/piet-gpu/shader/gen/pathseg.dxil differ
diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl
index 578417f..4e9a0ae 100644
--- a/piet-gpu/shader/gen/pathseg.hlsl
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@@ -52,6 +52,7 @@ struct Monoid
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -164,7 +165,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _111.Load(offset * 4 + 8);
+    uint v = _111.Load(offset * 4 + 12);
     return v;
 }
 
@@ -203,7 +204,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _111.Store(offset * 4 + 8, val);
+    _111.Store(offset * 4 + 12, val);
 }
 
 void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s)
@@ -365,7 +366,7 @@ uint round_up(float x)
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 4u;
-    uint tag_word = _574.Load(((_639.Load(92) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
+    uint tag_word = _574.Load(((_639.Load(96) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
     uint param = tag_word;
     TagMonoid local_tm = reduce_tag(param);
     sh_tag[gl_LocalInvocationID.x] = local_tm;
@@ -404,14 +405,14 @@ void comp_main()
         TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
         tm = combine_tag_monoid(param_3, param_4);
     }
-    uint ps_ix = (_639.Load(96) >> uint(2)) + tm.pathseg_offset;
-    uint lw_ix = (_639.Load(88) >> uint(2)) + tm.linewidth_ix;
+    uint ps_ix = (_639.Load(100) >> uint(2)) + tm.pathseg_offset;
+    uint lw_ix = (_639.Load(92) >> uint(2)) + tm.linewidth_ix;
     uint save_path_ix = tm.path_ix;
     uint trans_ix = tm.trans_ix;
-    TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) };
+    TransformSegRef _771 = { _639.Load(40) + (trans_ix * 24u) };
     TransformSegRef trans_ref = _771;
-    PathSegRef _781 = { _639.Load(28) + (tm.pathseg_ix * 52u) };
-    PathSegRef ps_ref = _781;
+    PathSegRef _780 = { _639.Load(32) + (tm.pathseg_ix * 52u) };
+    PathSegRef ps_ref = _780;
     float linewidth[4];
     uint save_trans_ix[4];
     float2 p0;
@@ -464,9 +465,9 @@ void comp_main()
                     }
                 }
             }
-            Alloc _877;
-            _877.offset = _639.Load(36);
-            param_13.offset = _877.offset;
+            Alloc _876;
+            _876.offset = _639.Load(40);
+            param_13.offset = _876.offset;
             TransformSegRef param_14 = trans_ref;
             TransformSeg transform = TransformSeg_read(param_13, param_14);
             p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate;
@@ -475,25 +476,25 @@ void comp_main()
             if (seg_type >= 2u)
             {
                 p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _947 = bbox;
-                float2 _950 = min(_947.xy, p2);
-                bbox.x = _950.x;
-                bbox.y = _950.y;
-                float4 _955 = bbox;
-                float2 _958 = max(_955.zw, p2);
-                bbox.z = _958.x;
-                bbox.w = _958.y;
+                float4 _946 = bbox;
+                float2 _949 = min(_946.xy, p2);
+                bbox.x = _949.x;
+                bbox.y = _949.y;
+                float4 _954 = bbox;
+                float2 _957 = max(_954.zw, p2);
+                bbox.z = _957.x;
+                bbox.w = _957.y;
                 if (seg_type == 3u)
                 {
                     p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _983 = bbox;
-                    float2 _986 = min(_983.xy, p3);
-                    bbox.x = _986.x;
-                    bbox.y = _986.y;
-                    float4 _991 = bbox;
-                    float2 _994 = max(_991.zw, p3);
-                    bbox.z = _994.x;
-                    bbox.w = _994.y;
+                    float4 _982 = bbox;
+                    float2 _985 = min(_982.xy, p3);
+                    bbox.x = _985.x;
+                    bbox.y = _985.y;
+                    float4 _990 = bbox;
+                    float2 _993 = max(_990.zw, p3);
+                    bbox.z = _993.x;
+                    bbox.w = _993.y;
                 }
                 else
                 {
@@ -524,9 +525,9 @@ void comp_main()
             cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1;
             cubic.stroke = stroke;
             uint fill_mode = uint(linewidth[i_1] >= 0.0f);
-            Alloc _1089;
-            _1089.offset = _639.Load(28);
-            param_15.offset = _1089.offset;
+            Alloc _1088;
+            _1088.offset = _639.Load(32);
+            param_15.offset = _1088.offset;
             PathSegRef param_16 = ps_ref;
             uint param_17 = fill_mode;
             PathCubic param_18 = cubic;
@@ -571,7 +572,7 @@ void comp_main()
     }
     GroupMemoryBarrierWithGroupSync();
     uint path_ix = save_path_ix;
-    uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 6u);
+    uint bbox_out_ix = (_639.Load(44) >> uint(2)) + (path_ix * 6u);
     Monoid row = monoid_identity();
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -583,24 +584,24 @@ void comp_main()
         Monoid param_24 = local[i_4];
         Monoid m = combine_monoid(param_23, param_24);
         bool do_atomic = false;
-        bool _1264 = i_4 == 3u;
-        bool _1270;
-        if (_1264)
+        bool _1263 = i_4 == 3u;
+        bool _1269;
+        if (_1263)
         {
-            _1270 = gl_LocalInvocationID.x == 255u;
+            _1269 = gl_LocalInvocationID.x == 255u;
         }
         else
         {
-            _1270 = _1264;
+            _1269 = _1263;
         }
-        if (_1270)
+        if (_1269)
         {
             do_atomic = true;
         }
         if ((m.flags & 1u) != 0u)
         {
-            _111.Store((bbox_out_ix + 4u) * 4 + 8, asuint(linewidth[i_4]));
-            _111.Store((bbox_out_ix + 5u) * 4 + 8, save_trans_ix[i_4]);
+            _111.Store((bbox_out_ix + 4u) * 4 + 12, asuint(linewidth[i_4]));
+            _111.Store((bbox_out_ix + 5u) * 4 + 12, save_trans_ix[i_4]);
             if ((m.flags & 2u) == 0u)
             {
                 do_atomic = true;
@@ -608,43 +609,43 @@ void comp_main()
             else
             {
                 float param_25 = m.bbox.x;
-                _111.Store(bbox_out_ix * 4 + 8, round_down(param_25));
+                _111.Store(bbox_out_ix * 4 + 12, round_down(param_25));
                 float param_26 = m.bbox.y;
-                _111.Store((bbox_out_ix + 1u) * 4 + 8, round_down(param_26));
+                _111.Store((bbox_out_ix + 1u) * 4 + 12, round_down(param_26));
                 float param_27 = m.bbox.z;
-                _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27));
+                _111.Store((bbox_out_ix + 2u) * 4 + 12, round_up(param_27));
                 float param_28 = m.bbox.w;
-                _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28));
+                _111.Store((bbox_out_ix + 3u) * 4 + 12, round_up(param_28));
                 bbox_out_ix += 6u;
                 do_atomic = false;
             }
         }
         if (do_atomic)
         {
-            bool _1335 = m.bbox.z > m.bbox.x;
-            bool _1344;
-            if (!_1335)
+            bool _1334 = m.bbox.z > m.bbox.x;
+            bool _1343;
+            if (!_1334)
             {
-                _1344 = m.bbox.w > m.bbox.y;
+                _1343 = m.bbox.w > m.bbox.y;
             }
             else
             {
-                _1344 = _1335;
+                _1343 = _1334;
             }
-            if (_1344)
+            if (_1343)
             {
                 float param_29 = m.bbox.x;
-                uint _1353;
-                _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353);
+                uint _1352;
+                _111.InterlockedMin(bbox_out_ix * 4 + 12, round_down(param_29), _1352);
                 float param_30 = m.bbox.y;
-                uint _1361;
-                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361);
+                uint _1360;
+                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 12, round_down(param_30), _1360);
                 float param_31 = m.bbox.z;
-                uint _1369;
-                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369);
+                uint _1368;
+                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 12, round_up(param_31), _1368);
                 float param_32 = m.bbox.w;
-                uint _1377;
-                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377);
+                uint _1376;
+                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 12, round_up(param_32), _1376);
             }
             bbox_out_ix += 6u;
         }
diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl
index 9f6328e..5aea66d 100644
--- a/piet-gpu/shader/gen/pathseg.msl
+++ b/piet-gpu/shader/gen/pathseg.msl
@@ -102,6 +102,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -117,6 +118,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -545,25 +547,25 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6
             if (seg_type >= 2u)
             {
                 p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _947 = bbox;
-                float2 _950 = fast::min(_947.xy, p2);
-                bbox.x = _950.x;
-                bbox.y = _950.y;
-                float4 _955 = bbox;
-                float2 _958 = fast::max(_955.zw, p2);
-                bbox.z = _958.x;
-                bbox.w = _958.y;
+                float4 _946 = bbox;
+                float2 _949 = fast::min(_946.xy, p2);
+                bbox.x = _949.x;
+                bbox.y = _949.y;
+                float4 _954 = bbox;
+                float2 _957 = fast::max(_954.zw, p2);
+                bbox.z = _957.x;
+                bbox.w = _957.y;
                 if (seg_type == 3u)
                 {
                     p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _983 = bbox;
-                    float2 _986 = fast::min(_983.xy, p3);
-                    bbox.x = _986.x;
-                    bbox.y = _986.y;
-                    float4 _991 = bbox;
-                    float2 _994 = fast::max(_991.zw, p3);
-                    bbox.z = _994.x;
-                    bbox.w = _994.y;
+                    float4 _982 = bbox;
+                    float2 _985 = fast::min(_982.xy, p3);
+                    bbox.x = _985.x;
+                    bbox.y = _985.y;
+                    float4 _990 = bbox;
+                    float2 _993 = fast::max(_990.zw, p3);
+                    bbox.z = _993.x;
+                    bbox.w = _993.y;
                 }
                 else
                 {
@@ -651,17 +653,17 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6
         Monoid param_24 = local[i_4];
         Monoid m = combine_monoid(param_23, param_24);
         bool do_atomic = false;
-        bool _1264 = i_4 == 3u;
-        bool _1270;
-        if (_1264)
+        bool _1263 = i_4 == 3u;
+        bool _1269;
+        if (_1263)
         {
-            _1270 = gl_LocalInvocationID.x == 255u;
+            _1269 = gl_LocalInvocationID.x == 255u;
         }
         else
         {
-            _1270 = _1264;
+            _1269 = _1263;
         }
-        if (_1270)
+        if (_1269)
         {
             do_atomic = true;
         }
@@ -689,26 +691,26 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6
         }
         if (do_atomic)
         {
-            bool _1335 = m.bbox.z > m.bbox.x;
-            bool _1344;
-            if (!_1335)
+            bool _1334 = m.bbox.z > m.bbox.x;
+            bool _1343;
+            if (!_1334)
             {
-                _1344 = m.bbox.w > m.bbox.y;
+                _1343 = m.bbox.w > m.bbox.y;
             }
             else
             {
-                _1344 = _1335;
+                _1343 = _1334;
             }
-            if (_1344)
+            if (_1343)
             {
                 float param_29 = m.bbox.x;
-                uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
+                uint _1352 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
                 float param_30 = m.bbox.y;
-                uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
+                uint _1360 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
                 float param_31 = m.bbox.z;
-                uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
+                uint _1368 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
                 float param_32 = m.bbox.w;
-                uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
+                uint _1376 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
             }
             bbox_out_ix += 6u;
         }
diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv
index 4e2e9d5..2fb04e5 100644
Binary files a/piet-gpu/shader/gen/pathseg.spv and b/piet-gpu/shader/gen/pathseg.spv differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index 4c2bd23..692ac5f 100644
Binary files a/piet-gpu/shader/gen/pathtag_reduce.dxil and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl
index 5f7d125..6e9dee1 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.hlsl
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@@ -14,6 +14,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -92,7 +93,7 @@ TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b)
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 2u;
-    uint scene_ix = (_139.Load(92) >> uint(2)) + ix;
+    uint scene_ix = (_139.Load(96) >> uint(2)) + ix;
     uint tag_word = _151.Load(scene_ix * 4 + 0);
     uint param = tag_word;
     TagMonoid agg = reduce_tag(param);
diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl
index 91e0cca..c6266ad 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.msl
+++ b/piet-gpu/shader/gen/pathtag_reduce.msl
@@ -21,6 +21,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -78,6 +79,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv
index f1d8679..829addc 100644
Binary files a/piet-gpu/shader/gen/pathtag_reduce.spv and b/piet-gpu/shader/gen/pathtag_reduce.spv differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
index 7759910..35a1c2b 100644
Binary files a/piet-gpu/shader/gen/tile_alloc.dxil and b/piet-gpu/shader/gen/tile_alloc.dxil differ
diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl
index 73e0a8e..aed9001 100644
--- a/piet-gpu/shader/gen/tile_alloc.hlsl
+++ b/piet-gpu/shader/gen/tile_alloc.hlsl
@@ -3,12 +3,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
 struct PathRef
 {
     uint offset;
@@ -27,6 +21,7 @@ struct Path
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -58,9 +53,9 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _70 : register(u0, space0);
-ByteAddressBuffer _181 : register(t1, space0);
-ByteAddressBuffer _257 : register(t2, space0);
+RWByteAddressBuffer _53 : register(u0, space0);
+ByteAddressBuffer _148 : register(t1, space0);
+ByteAddressBuffer _232 : register(t2, space0);
 
 static uint3 gl_LocalInvocationID;
 static uint3 gl_GlobalInvocationID;
@@ -71,53 +66,38 @@ struct SPIRV_Cross_Input
 };
 
 groupshared uint sh_tile_count[256];
-groupshared MallocResult sh_tile_alloc;
+groupshared uint sh_tile_offset;
+
+bool check_deps(uint dep_stage)
+{
+    uint _60;
+    _53.InterlockedOr(4, 0u, _60);
+    return (_60 & dep_stage) == 0u;
+}
 
 float4 load_draw_bbox(uint draw_ix)
 {
-    uint base = (_181.Load(64) >> uint(2)) + (4u * draw_ix);
-    float x0 = asfloat(_70.Load(base * 4 + 8));
-    float y0 = asfloat(_70.Load((base + 1u) * 4 + 8));
-    float x1 = asfloat(_70.Load((base + 2u) * 4 + 8));
-    float y1 = asfloat(_70.Load((base + 3u) * 4 + 8));
+    uint base = (_148.Load(68) >> uint(2)) + (4u * draw_ix);
+    float x0 = asfloat(_53.Load(base * 4 + 12));
+    float y0 = asfloat(_53.Load((base + 1u) * 4 + 12));
+    float x1 = asfloat(_53.Load((base + 2u) * 4 + 12));
+    float y1 = asfloat(_53.Load((base + 3u) * 4 + 12));
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
-Alloc new_alloc(uint offset, uint size, bool mem_ok)
+uint malloc_stage(uint size, uint mem_size, uint stage)
 {
-    Alloc a;
-    a.offset = offset;
-    return a;
-}
-
-MallocResult malloc(uint size)
-{
-    uint _76;
-    _70.InterlockedAdd(0, size, _76);
-    uint offset = _76;
-    uint _83;
-    _70.GetDimensions(_83);
-    _83 = (_83 - 8) / 4;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int(_83) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _70;
+    _53.InterlockedAdd(0, size, _70);
+    uint offset = _70;
+    if ((offset + size) > mem_size)
     {
-        uint _105;
-        _70.InterlockedMax(4, 1u, _105);
-        return r;
+        uint _80;
+        _53.InterlockedOr(4, stage, _80);
+        offset = 0u;
     }
-    return r;
-}
-
-Alloc slice_mem(Alloc a, uint offset, uint size)
-{
-    Alloc _131 = { a.offset + offset };
-    return _131;
+    return offset;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -133,7 +113,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _70.Store(offset * 4 + 8, val);
+    _53.Store(offset * 4 + 12, val);
 }
 
 void Path_write(Alloc a, PathRef ref, Path s)
@@ -155,15 +135,21 @@ void Path_write(Alloc a, PathRef ref, Path s)
 
 void comp_main()
 {
+    uint param = 1u;
+    bool _192 = check_deps(param);
+    if (!_192)
+    {
+        return;
+    }
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef _241 = { _181.Load(16) + (element_ix * 12u) };
-    PathRef path_ref = _241;
-    uint drawtag_base = _181.Load(100) >> uint(2);
+    PathRef _216 = { _148.Load(20) + (element_ix * 12u) };
+    PathRef path_ref = _216;
+    uint drawtag_base = _148.Load(104) >> uint(2);
     uint drawtag = 0u;
-    if (element_ix < _181.Load(0))
+    if (element_ix < _148.Load(4))
     {
-        drawtag = _257.Load((drawtag_base + element_ix) * 4 + 0);
+        drawtag = _232.Load((drawtag_base + element_ix) * 4 + 0);
     }
     int x0 = 0;
     int y0 = 0;
@@ -171,17 +157,17 @@ void comp_main()
     int y1 = 0;
     if ((drawtag != 0u) && (drawtag != 37u))
     {
-        uint param = element_ix;
-        float4 bbox = load_draw_bbox(param);
+        uint param_1 = element_ix;
+        float4 bbox = load_draw_bbox(param_1);
         x0 = int(floor(bbox.x * 0.0625f));
         y0 = int(floor(bbox.y * 0.0625f));
         x1 = int(ceil(bbox.z * 0.0625f));
         y1 = int(ceil(bbox.w * 0.0625f));
     }
-    x0 = clamp(x0, 0, int(_181.Load(8)));
-    y0 = clamp(y0, 0, int(_181.Load(12)));
-    x1 = clamp(x1, 0, int(_181.Load(8)));
-    y1 = clamp(y1, 0, int(_181.Load(12)));
+    x0 = clamp(x0, 0, int(_148.Load(12)));
+    y0 = clamp(y0, 0, int(_148.Load(16)));
+    x1 = clamp(x1, 0, int(_148.Load(12)));
+    y1 = clamp(y1, 0, int(_148.Load(16)));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
@@ -199,59 +185,45 @@ void comp_main()
     }
     if (th_ix == 255u)
     {
-        uint param_1 = total_tile_count * 8u;
-        MallocResult _392 = malloc(param_1);
-        sh_tile_alloc = _392;
+        uint param_2 = total_tile_count * 8u;
+        uint param_3 = _148.Load(0);
+        uint param_4 = 2u;
+        uint _370 = malloc_stage(param_2, param_3, param_4);
+        sh_tile_offset = _370;
     }
     GroupMemoryBarrierWithGroupSync();
-    MallocResult alloc_start = sh_tile_alloc;
-    bool _403;
-    if (!alloc_start.failed)
-    {
-        _403 = _70.Load(4) != 0u;
-    }
-    else
-    {
-        _403 = alloc_start.failed;
-    }
-    if (_403)
+    uint offset_start = sh_tile_offset;
+    if (offset_start == 0u)
     {
         return;
     }
-    if (element_ix < _181.Load(0))
+    if (element_ix < _148.Load(4))
     {
-        uint _416;
+        uint _387;
         if (th_ix > 0u)
         {
-            _416 = sh_tile_count[th_ix - 1u];
+            _387 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _416 = 0u;
+            _387 = 0u;
         }
-        uint tile_subix = _416;
-        Alloc param_2 = alloc_start.alloc;
-        uint param_3 = 8u * tile_subix;
-        uint param_4 = 8u * tile_count;
-        Alloc tiles_alloc = slice_mem(param_2, param_3, param_4);
-        TileRef _438 = { tiles_alloc.offset };
-        path.tiles = _438;
-        Alloc _444;
-        _444.offset = _181.Load(16);
+        uint tile_subix = _387;
+        TileRef _400 = { offset_start + (8u * tile_subix) };
+        path.tiles = _400;
+        Alloc _406;
+        _406.offset = _148.Load(20);
         Alloc param_5;
-        param_5.offset = _444.offset;
+        param_5.offset = _406.offset;
         PathRef param_6 = path_ref;
         Path param_7 = path;
         Path_write(param_5, param_6, param_7);
     }
     uint total_count = sh_tile_count[255] * 2u;
-    uint start_ix = alloc_start.alloc.offset >> uint(2);
+    uint start_ix = offset_start >> uint(2);
     for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
     {
-        Alloc param_8 = alloc_start.alloc;
-        uint param_9 = start_ix + i_1;
-        uint param_10 = 0u;
-        write_mem(param_8, param_9, param_10);
+        _53.Store((start_ix + i_1) * 4 + 12, 0u);
     }
 }
 
diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl
index 961be50..e02138a 100644
--- a/piet-gpu/shader/gen/tile_alloc.msl
+++ b/piet-gpu/shader/gen/tile_alloc.msl
@@ -12,12 +12,6 @@ struct Alloc
     uint offset;
 };
 
-struct MallocResult
-{
-    Alloc alloc;
-    bool failed;
-};
-
 struct PathRef
 {
     uint offset;
@@ -38,6 +32,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -48,6 +43,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -90,48 +86,35 @@ struct SceneBuf
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_70, constant uint& v_70BufferSize, const device ConfigBuf& v_181)
+bool check_deps(thread const uint& dep_stage, device Memory& v_53)
 {
-    uint base = (v_181.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
-    float x0 = as_type<float>(v_70.memory[base]);
-    float y0 = as_type<float>(v_70.memory[base + 1u]);
-    float x1 = as_type<float>(v_70.memory[base + 2u]);
-    float y1 = as_type<float>(v_70.memory[base + 3u]);
+    uint _60 = atomic_fetch_or_explicit((device atomic_uint*)&v_53.mem_error, 0u, memory_order_relaxed);
+    return (_60 & dep_stage) == 0u;
+}
+
+static inline __attribute__((always_inline))
+float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_53, const device ConfigBuf& v_148)
+{
+    uint base = (v_148.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
+    float x0 = as_type<float>(v_53.memory[base]);
+    float y0 = as_type<float>(v_53.memory[base + 1u]);
+    float x1 = as_type<float>(v_53.memory[base + 2u]);
+    float y1 = as_type<float>(v_53.memory[base + 3u]);
     float4 bbox = float4(x0, y0, x1, y1);
     return bbox;
 }
 
 static inline __attribute__((always_inline))
-Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
+uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_53)
 {
-    Alloc a;
-    a.offset = offset;
-    return a;
-}
-
-static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_70, constant uint& v_70BufferSize)
-{
-    uint _76 = atomic_fetch_add_explicit((device atomic_uint*)&v_70.mem_offset, size, memory_order_relaxed);
-    uint offset = _76;
-    MallocResult r;
-    r.failed = (offset + size) > uint(int((v_70BufferSize - 8) / 4) * 4);
-    uint param = offset;
-    uint param_1 = size;
-    bool param_2 = !r.failed;
-    r.alloc = new_alloc(param, param_1, param_2);
-    if (r.failed)
+    uint _70 = atomic_fetch_add_explicit((device atomic_uint*)&v_53.mem_offset, size, memory_order_relaxed);
+    uint offset = _70;
+    if ((offset + size) > mem_size)
     {
-        uint _105 = atomic_fetch_max_explicit((device atomic_uint*)&v_70.mem_error, 1u, memory_order_relaxed);
-        return r;
+        uint _80 = atomic_fetch_or_explicit((device atomic_uint*)&v_53.mem_error, stage, memory_order_relaxed);
+        offset = 0u;
     }
-    return r;
-}
-
-static inline __attribute__((always_inline))
-Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
-{
-    return Alloc{ a.offset + offset };
+    return offset;
 }
 
 static inline __attribute__((always_inline))
@@ -141,7 +124,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_70, constant uint& v_70BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_53)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -149,40 +132,45 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_70.memory[offset] = val;
+    v_53.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_70, constant uint& v_70BufferSize)
+void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_53)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
-    write_mem(param, param_1, param_2, v_70, v_70BufferSize);
+    write_mem(param, param_1, param_2, v_53);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
-    write_mem(param_3, param_4, param_5, v_70, v_70BufferSize);
+    write_mem(param_3, param_4, param_5, v_53);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = s.tiles.offset;
-    write_mem(param_6, param_7, param_8, v_70, v_70BufferSize);
+    write_mem(param_6, param_7, param_8, v_53);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_70 [[buffer(0)]], const device ConfigBuf& v_181 [[buffer(1)]], const device SceneBuf& _257 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+kernel void main0(device Memory& v_53 [[buffer(0)]], const device ConfigBuf& v_148 [[buffer(1)]], const device SceneBuf& _232 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
 {
     threadgroup uint sh_tile_count[256];
-    threadgroup MallocResult sh_tile_alloc;
-    constant uint& v_70BufferSize = spvBufferSizeConstants[0];
+    threadgroup uint sh_tile_offset;
+    uint param = 1u;
+    bool _192 = check_deps(param, v_53);
+    if (!_192)
+    {
+        return;
+    }
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef{ v_181.conf.tile_alloc.offset + (element_ix * 12u) };
-    uint drawtag_base = v_181.conf.drawtag_offset >> uint(2);
+    PathRef path_ref = PathRef{ v_148.conf.tile_alloc.offset + (element_ix * 12u) };
+    uint drawtag_base = v_148.conf.drawtag_offset >> uint(2);
     uint drawtag = 0u;
-    if (element_ix < v_181.conf.n_elements)
+    if (element_ix < v_148.conf.n_elements)
     {
-        drawtag = _257.scene[drawtag_base + element_ix];
+        drawtag = _232.scene[drawtag_base + element_ix];
     }
     int x0 = 0;
     int y0 = 0;
@@ -190,17 +178,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     int y1 = 0;
     if ((drawtag != 0u) && (drawtag != 37u))
     {
-        uint param = element_ix;
-        float4 bbox = load_draw_bbox(param, v_70, v_70BufferSize, v_181);
+        uint param_1 = element_ix;
+        float4 bbox = load_draw_bbox(param_1, v_53, v_148);
         x0 = int(floor(bbox.x * 0.0625));
         y0 = int(floor(bbox.y * 0.0625));
         x1 = int(ceil(bbox.z * 0.0625));
         y1 = int(ceil(bbox.w * 0.0625));
     }
-    x0 = clamp(x0, 0, int(v_181.conf.width_in_tiles));
-    y0 = clamp(y0, 0, int(v_181.conf.height_in_tiles));
-    x1 = clamp(x1, 0, int(v_181.conf.width_in_tiles));
-    y1 = clamp(y1, 0, int(v_181.conf.height_in_tiles));
+    x0 = clamp(x0, 0, int(v_148.conf.width_in_tiles));
+    y0 = clamp(y0, 0, int(v_148.conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(v_148.conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(v_148.conf.height_in_tiles));
     Path path;
     path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
     uint tile_count = uint((x1 - x0) * (y1 - y0));
@@ -218,56 +206,42 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     }
     if (th_ix == 255u)
     {
-        uint param_1 = total_tile_count * 8u;
-        MallocResult _392 = malloc(param_1, v_70, v_70BufferSize);
-        sh_tile_alloc = _392;
+        uint param_2 = total_tile_count * 8u;
+        uint param_3 = v_148.conf.mem_size;
+        uint param_4 = 2u;
+        uint _370 = malloc_stage(param_2, param_3, param_4, v_53);
+        sh_tile_offset = _370;
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    MallocResult alloc_start = sh_tile_alloc;
-    bool _403;
-    if (!alloc_start.failed)
-    {
-        _403 = v_70.mem_error != 0u;
-    }
-    else
-    {
-        _403 = alloc_start.failed;
-    }
-    if (_403)
+    uint offset_start = sh_tile_offset;
+    if (offset_start == 0u)
     {
         return;
     }
-    if (element_ix < v_181.conf.n_elements)
+    if (element_ix < v_148.conf.n_elements)
     {
-        uint _416;
+        uint _387;
         if (th_ix > 0u)
         {
-            _416 = sh_tile_count[th_ix - 1u];
+            _387 = sh_tile_count[th_ix - 1u];
         }
         else
         {
-            _416 = 0u;
+            _387 = 0u;
         }
-        uint tile_subix = _416;
-        Alloc param_2 = alloc_start.alloc;
-        uint param_3 = 8u * tile_subix;
-        uint param_4 = 8u * tile_count;
-        Alloc tiles_alloc = slice_mem(param_2, param_3, param_4);
-        path.tiles = TileRef{ tiles_alloc.offset };
+        uint tile_subix = _387;
+        path.tiles = TileRef{ offset_start + (8u * tile_subix) };
         Alloc param_5;
-        param_5.offset = v_181.conf.tile_alloc.offset;
+        param_5.offset = v_148.conf.tile_alloc.offset;
         PathRef param_6 = path_ref;
         Path param_7 = path;
-        Path_write(param_5, param_6, param_7, v_70, v_70BufferSize);
+        Path_write(param_5, param_6, param_7, v_53);
     }
     uint total_count = sh_tile_count[255] * 2u;
-    uint start_ix = alloc_start.alloc.offset >> uint(2);
+    uint start_ix = offset_start >> uint(2);
     for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
     {
-        Alloc param_8 = alloc_start.alloc;
-        uint param_9 = start_ix + i_1;
-        uint param_10 = 0u;
-        write_mem(param_8, param_9, param_10, v_70, v_70BufferSize);
+        v_53.memory[start_ix + i_1] = 0u;
     }
 }
 
diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv
index dbc02a8..25a362c 100644
Binary files a/piet-gpu/shader/gen/tile_alloc.spv and b/piet-gpu/shader/gen/tile_alloc.spv differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index f9f31e6..9427186 100644
Binary files a/piet-gpu/shader/gen/transform_leaf.dxil and b/piet-gpu/shader/gen/transform_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl
index 8a3b3d5..d3347a6 100644
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@@ -27,6 +27,7 @@ struct TransformSeg
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -58,12 +59,12 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
+static const Transform _225 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
 
 RWByteAddressBuffer _71 : register(u0, space0);
-ByteAddressBuffer _96 : register(t2, space0);
-ByteAddressBuffer _278 : register(t1, space0);
-ByteAddressBuffer _376 : register(t3, space0);
+ByteAddressBuffer _97 : register(t2, space0);
+ByteAddressBuffer _279 : register(t1, space0);
+ByteAddressBuffer _377 : register(t3, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -80,12 +81,12 @@ groupshared Transform sh_scratch[256];
 Transform Transform_read(TransformRef ref)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = _96.Load((ix + 0u) * 4 + 0);
-    uint raw1 = _96.Load((ix + 1u) * 4 + 0);
-    uint raw2 = _96.Load((ix + 2u) * 4 + 0);
-    uint raw3 = _96.Load((ix + 3u) * 4 + 0);
-    uint raw4 = _96.Load((ix + 4u) * 4 + 0);
-    uint raw5 = _96.Load((ix + 5u) * 4 + 0);
+    uint raw0 = _97.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _97.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _97.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _97.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _97.Load((ix + 4u) * 4 + 0);
+    uint raw5 = _97.Load((ix + 5u) * 4 + 0);
     Transform s;
     s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
     s.translate = float2(asfloat(raw4), asfloat(raw5));
@@ -108,7 +109,7 @@ Transform combine_monoid(Transform a, Transform b)
 
 Transform monoid_identity()
 {
-    return _224;
+    return _225;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -124,7 +125,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _71.Store(offset * 4 + 8, val);
+    _71.Store(offset * 4 + 12, val);
 }
 
 void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s)
@@ -159,8 +160,8 @@ void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s)
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _285 = { _278.Load(84) + (ix * 24u) };
-    TransformRef ref = _285;
+    TransformRef _286 = { _279.Load(88) + (ix * 24u) };
+    TransformRef ref = _286;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
     Transform local[8];
@@ -193,11 +194,11 @@ void comp_main()
     Transform row = monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        Transform _382;
-        _382.mat = asfloat(_376.Load4((gl_WorkGroupID.x - 1u) * 32 + 0));
-        _382.translate = asfloat(_376.Load2((gl_WorkGroupID.x - 1u) * 32 + 16));
-        row.mat = _382.mat;
-        row.translate = _382.translate;
+        Transform _383;
+        _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0));
+        _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16));
+        row.mat = _383.mat;
+        row.translate = _383.translate;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -211,13 +212,13 @@ void comp_main()
         Transform param_10 = row;
         Transform param_11 = local[i_2];
         Transform m = combine_monoid(param_10, param_11);
-        TransformSeg _422 = { m.mat, m.translate };
-        TransformSeg transform = _422;
-        TransformSegRef _432 = { _278.Load(36) + ((ix + i_2) * 24u) };
-        TransformSegRef trans_ref = _432;
-        Alloc _436;
-        _436.offset = _278.Load(36);
-        param_12.offset = _436.offset;
+        TransformSeg _423 = { m.mat, m.translate };
+        TransformSeg transform = _423;
+        TransformSegRef _433 = { _279.Load(40) + ((ix + i_2) * 24u) };
+        TransformSegRef trans_ref = _433;
+        Alloc _437;
+        _437.offset = _279.Load(40);
+        param_12.offset = _437.offset;
         TransformSegRef param_13 = trans_ref;
         TransformSeg param_14 = transform;
         TransformSeg_write(param_12, param_13, param_14);
diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl
index fe45438..01fefd1 100644
--- a/piet-gpu/shader/gen/transform_leaf.msl
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@@ -75,6 +75,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
@@ -90,6 +91,7 @@ struct Alloc_1
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -139,15 +141,15 @@ struct ParentBuf
 constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
 
 static inline __attribute__((always_inline))
-Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96)
+Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_97)
 {
     uint ix = ref.offset >> uint(2);
-    uint raw0 = v_96.scene[ix + 0u];
-    uint raw1 = v_96.scene[ix + 1u];
-    uint raw2 = v_96.scene[ix + 2u];
-    uint raw3 = v_96.scene[ix + 3u];
-    uint raw4 = v_96.scene[ix + 4u];
-    uint raw5 = v_96.scene[ix + 5u];
+    uint raw0 = v_97.scene[ix + 0u];
+    uint raw1 = v_97.scene[ix + 1u];
+    uint raw2 = v_97.scene[ix + 2u];
+    uint raw3 = v_97.scene[ix + 3u];
+    uint raw4 = v_97.scene[ix + 4u];
+    uint raw5 = v_97.scene[ix + 5u];
     Transform s;
     s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
     s.translate = float2(as_type<float>(raw4), as_type<float>(raw5));
@@ -223,13 +225,13 @@ void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref
     write_mem(param_15, param_16, param_17, v_71);
 }
 
-kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _376 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _279 [[buffer(1)]], const device SceneBuf& v_97 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup Transform sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) };
+    TransformRef ref = TransformRef{ _279.conf.trans_offset + (ix * 24u) };
     TransformRef param = ref;
-    Transform agg = Transform_read(param, v_96);
+    Transform agg = Transform_read(param, v_97);
     spvUnsafeArray<Transform, 8> local;
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
@@ -238,7 +240,7 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27
         uint param_2 = i;
         TransformRef param_3 = Transform_index(param_1, param_2);
         Transform param_4 = agg;
-        Transform param_5 = Transform_read(param_3, v_96);
+        Transform param_5 = Transform_read(param_3, v_97);
         agg = combine_monoid(param_4, param_5);
         local[i] = agg;
     }
@@ -260,9 +262,9 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27
     Transform row = monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _379 = gl_WorkGroupID.x - 1u;
-        row.mat = _376.parent[_379].mat;
-        row.translate = _376.parent[_379].translate;
+        uint _380 = gl_WorkGroupID.x - 1u;
+        row.mat = _377.parent[_380].mat;
+        row.translate = _377.parent[_380].translate;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -277,8 +279,8 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27
         Transform param_11 = local[i_2];
         Transform m = combine_monoid(param_10, param_11);
         TransformSeg transform = TransformSeg{ m.mat, m.translate };
-        TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) };
-        param_12.offset = _278.conf.trans_alloc.offset;
+        TransformSegRef trans_ref = TransformSegRef{ _279.conf.trans_alloc.offset + ((ix + i_2) * 24u) };
+        param_12.offset = _279.conf.trans_alloc.offset;
         TransformSegRef param_13 = trans_ref;
         TransformSeg param_14 = transform;
         TransformSeg_write(param_12, param_13, param_14, v_71);
diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv
index b739099..a0081bf 100644
Binary files a/piet-gpu/shader/gen/transform_leaf.spv and b/piet-gpu/shader/gen/transform_leaf.spv differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index 978dd98..6986f8f 100644
Binary files a/piet-gpu/shader/gen/transform_reduce.dxil and b/piet-gpu/shader/gen/transform_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl
index bd14f79..90ea55f 100644
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@@ -16,6 +16,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -96,7 +97,7 @@ Transform combine_monoid(Transform a, Transform b)
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _168 = { _161.Load(84) + (ix * 24u) };
+    TransformRef _168 = { _161.Load(88) + (ix * 24u) };
     TransformRef ref = _168;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl
index 62da531..6ae57e7 100644
--- a/piet-gpu/shader/gen/transform_reduce.msl
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@@ -28,6 +28,7 @@ struct Alloc
 
 struct Config
 {
+    uint mem_size;
     uint n_elements;
     uint n_pathseg;
     uint width_in_tiles;
@@ -78,6 +79,7 @@ struct Memory
 {
     uint mem_offset;
     uint mem_error;
+    uint blend_offset;
     uint memory[1];
 };
 
diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv
index 6aa6b94..fc8e58a 100644
Binary files a/piet-gpu/shader/gen/transform_reduce.spv and b/piet-gpu/shader/gen/transform_reduce.spv differ