diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 698ae43..1e04b8d 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -148,7 +148,10 @@ pub trait CmdBuf<D: Device> {
     /// This is readily supported in Vulkan, but for portability it is remarkably
     /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
     /// kernel, or organize the code not to need it.
-    unsafe fn clear_buffer(&self, buffer: &D::Buffer);
+    ///
+    /// `size` is the number of bytes to clear, counted from the start of the
+    /// buffer; `None` clears the whole buffer.
+    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
 
     unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
 
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index b40576b..53d575b 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -902,9 +902,10 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
         );
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer) {
+    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
         let device = &self.device.device;
-        device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, vk::WHOLE_SIZE, 0);
+        // `None` fills the whole buffer; Vulkan requires any other size to be a multiple of 4.
+        device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size.unwrap_or(vk::WHOLE_SIZE), 0);
     }
 
     unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index a7d5e92..6c98c4b 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -22,13 +22,39 @@ layout(set = 0, binding = 1) buffer TileBuf {
     uint[] tile;
 };
 
-layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
+layout(set = 0, binding = 2) buffer ClipScratchBuf {
+    uint[] clip_scratch;
+};
+
+layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
 
 #include "ptcl.h"
 #include "tile.h"
 
 #define BLEND_STACK_SIZE 4
 
+// Layout of the clip_scratch buffer:
+// [0] is the bump allocation offset (in units of 32-bit words, initially 0).
+// Starting at [1] is a sequence of frames.
+// Each frame is TILE_WIDTH_PX * TILE_HEIGHT_PX 32-bit words, then a link to the previous frame.
+
+#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
+#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
+
+shared uint sh_clip_alloc;
+
+// Allocate one scratch frame for clipping and store `link` in its link word. Unlike offsets
+// in the rest of the code, the returned offset counts 32-bit words.
+uint alloc_clip_buf(uint link) {
+    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {  // one invocation allocates
+        uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1;  // +1 skips the counter in word 0
+        sh_clip_alloc = alloc;
+        clip_scratch[alloc + CLIP_LINK_OFFSET] = link;
+    }
+    barrier();  // every invocation must reach this; orders the sh_clip_alloc write before the read
+    return sh_clip_alloc;  // NOTE(review): no barrier before the next call's write - confirm no race
+}
+
 // Calculate coverage based on backdrop + coverage of each line segment
 float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
     // Probably better to store as float, but conversion is no doubt cheap.
@@ -72,7 +98,9 @@ void main() {
     vec3 rgb[CHUNK];
     float mask[CHUNK];
     uint blend_stack[BLEND_STACK_SIZE][CHUNK];
+    uint blend_spill = 0;
     uint blend_sp = 0;
+    uint clip_tos = 0;
     for (uint i = 0; i < CHUNK; i++) {
         rgb[i] = vec3(0.5);
         mask[i] = 1.0;
@@ -142,26 +170,46 @@ void main() {
             }
             break;
         case Cmd_BeginClip:
-            CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
-            area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
-            for (uint k = 0; k < CHUNK; k++) {
-                blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
-            }
-            blend_sp++;
-            break;
         case Cmd_BeginSolidClip:
-            CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
-            float solid_alpha = begin_solid_clip.alpha;
-            for (uint k = 0; k < CHUNK; k++) {
-                blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
+            uint blend_slot = blend_sp % BLEND_STACK_SIZE;
+            if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
+                // spill to scratch buffer
+                clip_tos = alloc_clip_buf(clip_tos);
+                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                for (uint k = 0; k < CHUNK; k++) {
+                    clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
+                }
+                blend_spill++;
+            }
+            if (tag == Cmd_BeginClip) {
+                CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
+                area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
+                }
+            } else {
+                CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
+                float solid_alpha = begin_solid_clip.alpha;
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
+                }                
             }
             blend_sp++;
             break;
         case Cmd_EndClip:
             CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
+            blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
+            if (blend_sp == blend_spill) {
+                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
+                }
+                clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET];
+                blend_spill--;
+            }
             blend_sp--;
             for (uint k = 0; k < CHUNK; k++) {
-                vec4 rgba = unpackUnorm4x8(blend_stack[blend_sp][k]);
+                vec4 rgba = unpackUnorm4x8(blend_stack[blend_slot][k]);
                 rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
             }
             break;
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index a02387a..538e783 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 6eff190..0a6152d 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -73,7 +73,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
         5.0,
     );
     //render_cardioid(rc);
-    render_tiger(rc);
+    render_clip_test(rc);
+    //render_tiger(rc);
 }
 
 #[allow(unused)]
@@ -94,6 +95,33 @@ fn render_cardioid(rc: &mut impl RenderContext) {
     rc.stroke(&path, &Color::BLACK, 2.0);
 }
 
+/// Stress test for nested clipping: pushes `N` clips (more than kernel4's
+/// `BLEND_STACK_SIZE` of 4), fills the clipped square, then pops every clip.
+#[allow(unused)]
+fn render_clip_test(rc: &mut impl RenderContext) {
+    const N: usize = 16;
+    const LO: f64 = 50.0;
+    const HI: f64 = 100.0;
+    let step = 1.0 / ((N + 1) as f64);
+    for i in 0..N {
+        let t = ((i + 1) as f64) * step;
+        // `save` returns a `Result`; this demo has no error path, so drop it explicitly.
+        let _ = rc.save();
+        let mut path = BezPath::new();
+        path.move_to((LO, LO));
+        path.line_to((HI, LO));
+        path.line_to((HI, LO + t * (HI - LO)));
+        path.line_to((HI + t * (LO - HI), HI));
+        path.line_to((LO, HI));
+        path.close_path();
+        rc.clip(path);
+    }
+    rc.fill(piet::kurbo::Rect::new(LO, LO, HI, HI), &Color::BLACK);
+    for _ in 0..N {
+        let _ = rc.restore();
+    }
+}
+
 fn render_tiger(rc: &mut impl RenderContext) {
     let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap();
     let start = std::time::Instant::now();
@@ -163,6 +191,8 @@ pub struct Renderer {
     coarse_alloc_buf_host: hub::Buffer,
     coarse_alloc_buf_dev: hub::Buffer,
 
+    clip_scratch_buf: hub::Buffer,
+
     k4_pipeline: hub::Pipeline,
     k4_ds: hub::DescriptorSet,
 
@@ -278,6 +308,8 @@ impl Renderer {
             &[],
         )?;
 
+        let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?;
+
         let mut coarse_alloc_buf_host = session.create_buffer(8, host)?;
         let coarse_alloc_buf_dev = session.create_buffer(8, dev)?;
 
@@ -298,10 +330,14 @@ impl Renderer {
         )?;
 
         let k4_code = include_bytes!("../shader/kernel4.spv");
-        let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 2, 1)?;
+        let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 3, 1)?;
         let k4_ds = session.create_descriptor_set(
             &k4_pipeline,
-            &[ptcl_buf.vk_buffer(), tile_buf.vk_buffer()],
+            &[
+                ptcl_buf.vk_buffer(),
+                tile_buf.vk_buffer(),
+                clip_scratch_buf.vk_buffer(),
+            ],
             &[image_dev.vk_image()],
         )?;
 
@@ -335,6 +371,7 @@ impl Renderer {
             bin_alloc_buf_dev,
             coarse_alloc_buf_host,
             coarse_alloc_buf_dev,
+            clip_scratch_buf,
             n_elements,
             n_paths,
             n_pathseg,
@@ -355,7 +392,8 @@ impl Renderer {
             self.coarse_alloc_buf_host.vk_buffer(),
             self.coarse_alloc_buf_dev.vk_buffer(),
         );
-        cmd_buf.clear_buffer(self.state_buf.vk_buffer());
+        cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None);
+        cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4));
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(
             self.image_dev.vk_image(),