diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index c62678f..77170c0 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -71,6 +71,11 @@ pub trait CmdBuf<D: Device> {
 
     unsafe fn memory_barrier(&mut self);
 
+    /// Clear the buffer.
+    ///
+    /// This is readily supported in Vulkan, but it is remarkably tricky to do
+    /// portably (unimplemented in gfx-hal right now). It may be best to write a
+    /// compute kernel instead, or to organize the code so it is not needed.
     unsafe fn clear_buffer(&self, buffer: &D::Buffer);
 
     unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index b6df77d..ed72e42 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -33,6 +33,9 @@ piet_gpu! {
         struct CmdSolid {
             rgba_color: u32,
         }
+        struct CmdJump {
+            new_ref: u32,
+        }
         enum Cmd {
             End,
             Circle(CmdCircle),
@@ -42,6 +45,7 @@ piet_gpu! {
             FillEdge(CmdFillEdge),
             DrawFill(CmdDrawFill),
             Solid(CmdSolid),
+            Jump(CmdJump),
             Bail,
         }
     }
diff --git a/piet-gpu-types/src/tilegroup.rs b/piet-gpu-types/src/tilegroup.rs
index 4824178..5912154 100644
--- a/piet-gpu-types/src/tilegroup.rs
+++ b/piet-gpu-types/src/tilegroup.rs
@@ -10,8 +10,12 @@ piet_gpu! {
             // A better type would be Point.
             offset: [f32; 2],
         }
+        struct Jump {
+            new_ref: u32,
+        }
         enum TileGroup {
             Instance(Instance),
+            Jump(Jump),
             End,
         }
     }
diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp
index dbdd492..82ccb8f 100644
--- a/piet-gpu/shader/kernel1.comp
+++ b/piet-gpu/shader/kernel1.comp
@@ -25,6 +25,10 @@ layout(set = 0, binding = 1) buffer TilegroupBuf {
     uint[] tilegroup;
 };
 
+layout(set = 0, binding = 2) buffer AllocBuf {
+    uint alloc; // Byte offset of the next unclaimed tilegroup chunk; advanced with atomicAdd.
+};
+
 #include "scene.h"
 #include "tilegroup.h"
 
@@ -43,6 +47,7 @@ void main() {
     uint stack_ix = 0;
     uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
     TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
+    uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
     vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
     PietItemRef root = PietItemRef(0);
     SimpleGroup group = PietItem_Group_read(root);
@@ -62,9 +67,16 @@ void main() {
             if (hit && !is_group) {
                 PietItemRef item_ref = PietItem_index(group.items, tos.index);
                 Instance ins = Instance(item_ref.offset, tos.offset);
+                if (tg_ref.offset > tg_limit) {
+                    // Allocation exceeded; do atomic bump alloc.
+                    uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
+                    Jump jump = Jump(new_tg);
+                    TileGroup_Jump_write(tg_ref, jump);
+                    tg_ref = TileGroupRef(new_tg);
+                    tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
+                }
                 TileGroup_Instance_write(tg_ref, ins);
                 tg_ref.offset += TileGroup_size;
-                // TODO: bump allocate if allocation exceeded
             }
             if (is_group) {
                 PietItemRef item_ref = PietItem_index(group.items, tos.index);
diff --git a/piet-gpu/shader/kernel1.spv b/piet-gpu/shader/kernel1.spv
index 0e9a497..9ac3593 100644
Binary files a/piet-gpu/shader/kernel1.spv and b/piet-gpu/shader/kernel1.spv differ
diff --git a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp
index cb344c0..ef3faef 100644
--- a/piet-gpu/shader/kernel3.comp
+++ b/piet-gpu/shader/kernel3.comp
@@ -20,12 +20,26 @@ layout(set = 0, binding = 2) buffer PtclBuf {
     uint[] ptcl;
 };
 
+layout(set = 0, binding = 3) buffer AllocBuf {
+    uint alloc; // Byte offset of the next unclaimed ptcl chunk; advanced with atomicAdd.
+};
+
 #include "scene.h"
 #include "tilegroup.h"
 #include "ptcl.h"
 
 #include "setup.h"
 
+void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) { // Chain to a fresh chunk once cmd_ref passes cmd_limit.
+    if (cmd_ref.offset > cmd_limit) {
+        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC); // Reserve the next chunk; the old counter value is its start.
+        CmdJump jump = CmdJump(new_cmd);
+        Cmd_Jump_write(cmd_ref, jump); // Link the full chunk to the new one before moving cmd_ref.
+        cmd_ref = CmdRef(new_cmd);
+        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size; // Same headroom as the initial limit set in main().
+    }
+}
+
 void main() {
     uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
     uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
@@ -33,12 +47,17 @@ void main() {
     vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
     TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
 
     while (true) {
         uint tg_tag = TileGroup_tag(tg_ref);
         if (tg_tag == TileGroup_End) {
             break;
         }
+        if (tg_tag == TileGroup_Jump) {
+            tg_ref = TileGroupRef(TileGroup_Jump_read(tg_ref).new_ref);
+            continue;
+        }
         // Assume tg_tag is `Instance`, though there will be more cases.
         Instance ins = TileGroup_Instance_read(tg_ref);
         PietItemRef item_ref = PietItemRef(ins.item_ref);
@@ -52,6 +71,7 @@ void main() {
                 && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
             {
                 CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
+                alloc_cmd(cmd_ref, cmd_limit);
                 Cmd_Circle_write(cmd_ref, cmd);
                 cmd_ref.offset += Cmd_size;
             }
diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv
index 23a7c3e..cd56c48 100644
Binary files a/piet-gpu/shader/kernel3.spv and b/piet-gpu/shader/kernel3.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 6e2392b..cdde198 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -44,6 +44,10 @@ void main() {
             vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color);
             // TODO: sRGB
             rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
+            break;
+        case Cmd_Jump:
+            cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
+            continue;
         }
         cmd_ref.offset += Cmd_size;
     }
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index c2cb755..caef463 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 583cc10..cc43594 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -28,6 +28,10 @@ struct CmdSolidRef {
     uint offset;
 };
 
+struct CmdJumpRef {
+    uint offset;
+};
+
 struct CmdRef {
     uint offset;
 };
@@ -109,6 +113,16 @@ CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) {
     return CmdSolidRef(ref.offset + index * CmdSolid_size);
 }
 
+struct CmdJump {
+    uint new_ref;
+};
+
+#define CmdJump_size 4
+
+CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
+    return CmdJumpRef(ref.offset + index * CmdJump_size);
+}
+
 #define Cmd_End 0
 #define Cmd_Circle 1
 #define Cmd_Line 2
@@ -117,7 +131,8 @@ CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) {
 #define Cmd_FillEdge 5
 #define Cmd_DrawFill 6
 #define Cmd_Solid 7
-#define Cmd_Bail 8
+#define Cmd_Jump 8
+#define Cmd_Bail 9
 #define Cmd_size 20
 
 CmdRef Cmd_index(CmdRef ref, uint index) {
@@ -246,6 +261,19 @@ void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
     ptcl[ix + 0] = s.rgba_color;
 }
 
+CmdJump CmdJump_read(CmdJumpRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    CmdJump s;
+    s.new_ref = raw0;
+    return s;
+}
+
+void CmdJump_write(CmdJumpRef ref, CmdJump s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = s.new_ref;
+}
+
 uint Cmd_tag(CmdRef ref) {
     return ptcl[ref.offset >> 2];
 }
@@ -278,6 +306,10 @@ CmdSolid Cmd_Solid_read(CmdRef ref) {
     return CmdSolid_read(CmdSolidRef(ref.offset + 4));
 }
 
+CmdJump Cmd_Jump_read(CmdRef ref) {
+    return CmdJump_read(CmdJumpRef(ref.offset + 4));
+}
+
 void Cmd_End_write(CmdRef ref) {
     ptcl[ref.offset >> 2] = Cmd_End;
 }
@@ -317,6 +349,11 @@ void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
     CmdSolid_write(CmdSolidRef(ref.offset + 4), s);
 }
 
+void Cmd_Jump_write(CmdRef ref, CmdJump s) {
+    ptcl[ref.offset >> 2] = Cmd_Jump;
+    CmdJump_write(CmdJumpRef(ref.offset + 4), s);
+}
+
 void Cmd_Bail_write(CmdRef ref) {
     ptcl[ref.offset >> 2] = Cmd_Bail;
 }
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index f04462b..9ce2de6 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -22,4 +22,4 @@
 #define TILE_WIDTH_PX 16
 #define TILE_HEIGHT_PX 16
 
-#define PTCL_INITIAL_ALLOC 4096
+#define PTCL_INITIAL_ALLOC 1024
diff --git a/piet-gpu/shader/tilegroup.h b/piet-gpu/shader/tilegroup.h
index f1d646f..64b27d3 100644
--- a/piet-gpu/shader/tilegroup.h
+++ b/piet-gpu/shader/tilegroup.h
@@ -4,6 +4,10 @@ struct InstanceRef {
     uint offset;
 };
 
+struct JumpRef {
+    uint offset;
+};
+
 struct TileGroupRef {
     uint offset;
 };
@@ -19,8 +23,19 @@ InstanceRef Instance_index(InstanceRef ref, uint index) {
     return InstanceRef(ref.offset + index * Instance_size);
 }
 
+struct Jump {
+    uint new_ref;
+};
+
+#define Jump_size 4
+
+JumpRef Jump_index(JumpRef ref, uint index) {
+    return JumpRef(ref.offset + index * Jump_size);
+}
+
 #define TileGroup_Instance 0
-#define TileGroup_End 1
+#define TileGroup_Jump 1
+#define TileGroup_End 2
 #define TileGroup_size 16
 
 TileGroupRef TileGroup_index(TileGroupRef ref, uint index) {
@@ -45,6 +60,19 @@ void Instance_write(InstanceRef ref, Instance s) {
     tilegroup[ix + 2] = floatBitsToUint(s.offset.y);
 }
 
+Jump Jump_read(JumpRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = tilegroup[ix + 0];
+    Jump s;
+    s.new_ref = raw0;
+    return s;
+}
+
+void Jump_write(JumpRef ref, Jump s) {
+    uint ix = ref.offset >> 2;
+    tilegroup[ix + 0] = s.new_ref;
+}
+
 uint TileGroup_tag(TileGroupRef ref) {
     return tilegroup[ref.offset >> 2];
 }
@@ -53,11 +81,20 @@ Instance TileGroup_Instance_read(TileGroupRef ref) {
     return Instance_read(InstanceRef(ref.offset + 4));
 }
 
+Jump TileGroup_Jump_read(TileGroupRef ref) {
+    return Jump_read(JumpRef(ref.offset + 4));
+}
+
 void TileGroup_Instance_write(TileGroupRef ref, Instance s) {
     tilegroup[ref.offset >> 2] = TileGroup_Instance;
     Instance_write(InstanceRef(ref.offset + 4), s);
 }
 
+void TileGroup_Jump_write(TileGroupRef ref, Jump s) {
+    tilegroup[ref.offset >> 2] = TileGroup_Jump;
+    Jump_write(JumpRef(ref.offset + 4), s);
+}
+
 void TileGroup_End_write(TileGroupRef ref) {
     tilegroup[ref.offset >> 2] = TileGroup_End;
 }
diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs
index 6a243e9..703e156 100644
--- a/piet-gpu/src/main.rs
+++ b/piet-gpu/src/main.rs
@@ -20,7 +20,15 @@ const HEIGHT: usize = 1536;
 const TILE_W: usize = 16;
 const TILE_H: usize = 16;
 
-const N_CIRCLES: usize = 3000;
+const WIDTH_IN_TILEGROUPS: usize = 4;
+const HEIGHT_IN_TILEGROUPS: usize = 96;
+const TILEGROUP_INITIAL_ALLOC: usize = 1024; // Bytes reserved up front per tilegroup.
+
+const WIDTH_IN_TILES: usize = 128; // Must equal the shaders' tile row stride, or bump chunks overlap initial ones.
+const HEIGHT_IN_TILES: usize = 96;
+const PTCL_INITIAL_ALLOC: usize = 1024; // Bytes reserved up front per tile; mirrors setup.h.
+
+const N_CIRCLES: usize = 10_000;
 
 fn render_scene(rc: &mut impl RenderContext) {
     let mut rng = rand::thread_rng();
@@ -71,8 +79,7 @@ fn main() {
             .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
             .unwrap();
         device.write_buffer(&scene_buf, &scene).unwrap();
-        // These should only be on the host if we're going to examine them from Rust.
-        let tilegroup_buf = device.create_buffer(384 * 1024, dev).unwrap();
+        let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev).unwrap();
         let ptcl_buf = device.create_buffer(12 * 1024 * 4096, dev).unwrap();
         let image_buf = device
             .create_buffer((WIDTH * HEIGHT * 4) as u64, host)
@@ -81,16 +88,34 @@ fn main() {
             .create_buffer((WIDTH * HEIGHT * 4) as u64, dev)
             .unwrap();
 
+        let k1_alloc_buf_host = device.create_buffer(4, host).unwrap();
+        let k1_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
+        let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_INITIAL_ALLOC;
+        device
+            .write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])
+            .unwrap();
         let k1_code = include_bytes!("../shader/kernel1.spv");
-        let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 2).unwrap();
+        let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3).unwrap();
         let k1_ds = device
-            .create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf])
+            .create_descriptor_set(
+                &k1_pipeline,
+                &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
+            )
             .unwrap();
 
+        let k3_alloc_buf_host = device.create_buffer(4, host).unwrap();
+        let k3_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
+        let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
+        device
+            .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
+            .unwrap();
         let k3_code = include_bytes!("../shader/kernel3.spv");
-        let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 3).unwrap();
+        let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 4).unwrap();
         let k3_ds = device
-            .create_descriptor_set(&k3_pipeline, &[&scene_dev, &tilegroup_buf, &ptcl_buf])
+            .create_descriptor_set(
+                &k3_pipeline,
+                &[&scene_dev, &tilegroup_buf, &ptcl_buf, &k3_alloc_buf_dev],
+            )
             .unwrap();
 
         let k4_code = include_bytes!("../shader/kernel4.spv");
@@ -102,6 +127,8 @@ fn main() {
         let mut cmd_buf = device.create_cmd_buf().unwrap();
         cmd_buf.begin();
         cmd_buf.copy_buffer(&scene_buf, &scene_dev);
+        cmd_buf.copy_buffer(&k1_alloc_buf_host, &k1_alloc_buf_dev);
+        cmd_buf.copy_buffer(&k3_alloc_buf_host, &k3_alloc_buf_dev);
         cmd_buf.clear_buffer(&tilegroup_buf);
         cmd_buf.clear_buffer(&ptcl_buf);
         cmd_buf.memory_barrier();
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 4e9a567..eb67132 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -238,7 +238,9 @@ fn flatten_shape(
                 let scene_pt = to_scene_point(p);
                 start_pt = Some(clone_scene_pt(&scene_pt));
                 if !points.is_empty() {
-                    points.push(scene::Point { xy: [std::f32::NAN, std::f32::NAN ]});
+                    points.push(scene::Point {
+                        xy: [std::f32::NAN, std::f32::NAN],
+                    });
                 }
                 last_pt = Some(clone_scene_pt(&scene_pt));
                 points.push(scene_pt);
@@ -350,7 +352,5 @@ fn to_scene_point(point: Point) -> scene::Point {
 
 // TODO: allow #[derive(Clone)] in piet-gpu-derive.
 fn clone_scene_pt(p: &scene::Point) -> scene::Point {
-    scene::Point {
-        xy: p.xy
-    }
+    scene::Point { xy: p.xy }
 }