diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 2fac015..402b13d 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -1016,7 +1016,7 @@ unsafe fn choose_compute_device(
     devices: &[vk::PhysicalDevice],
     surface: Option<&VkSurface>,
 ) -> Option<(vk::PhysicalDevice, u32)> {
-    for pdevice in devices {
+    for pdevice in &devices[1..] {
         let props = instance.get_physical_device_queue_family_properties(*pdevice);
         for (ix, info) in props.iter().enumerate() {
             // Check for surface presentation support
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index fe8c4ac..cc4cb44 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -181,12 +181,10 @@ fn main() -> Result<(), Error> {
         println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
         println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
 
-        /*
         let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
-        //piet_gpu::dump_k1_data(&data);
-        trace_ptcl(&data);
-        */
+        device.read_buffer(&renderer.bin_buf, &mut data).unwrap();
+        piet_gpu::dump_k1_data(&data);
+        //trace_ptcl(&data);
 
         let mut img_data: Vec<u8> = Default::default();
         // Note: because png can use a `&[u8]` slice, we could avoid an extra copy
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 713a654..d193dd2 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -45,8 +45,6 @@ shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
 shared uint sh_my_tile;
 shared uint sh_chunk_start[N_TILE];
-shared uint sh_chunk_end[N_TILE];
-shared uint sh_chunk_jump[N_TILE];
 
 shared float sh_right_edge[N_TILE];
 
@@ -57,8 +55,6 @@ uint state_right_edge_index(uint partition_ix) {
 }
 
 void main() {
-    BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
-    uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
     uint chunk_n = 0;
     uint my_n_elements = n_elements;
     while (true) {
@@ -169,41 +165,15 @@ void main() {
             count[i][gl_LocalInvocationID.x] = element_count;
         }
         // element_count is number of elements covering bin for this invocation.
+        uint chunk_start = 0;
         if (element_count != 0) {
-            uint chunk_end;
-            uint chunk_new_start;
-            // Refactor to reduce code duplication?
-            if (chunk_n > 0) {
-                uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * BinInstance_size;
-                if (next_chunk + BinChunk_size + min(24, element_count * BinInstance_size) > wr_limit) {
-                    uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * BinInstance_size);
-                    // could try to reduce fragmentation if BIN_ALLOC is only a bit above needed
-                    next_chunk = atomicAdd(alloc, alloc_amount);
-                    wr_limit = next_chunk + alloc_amount;
-                }
-                BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk)));
-                chunk_ref = BinChunkRef(next_chunk);
-            }
-            BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
-            if (instance_ref.offset + element_count * BinInstance_size > wr_limit) {
-                chunk_end = wr_limit;
-                chunk_n = (wr_limit - instance_ref.offset) / BinInstance_size;
-                uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * BinInstance_size);
-                chunk_new_start = atomicAdd(alloc, alloc_amount);
-                wr_limit = chunk_new_start + alloc_amount;
-                BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start)));
-                chunk_ref = BinChunkRef(chunk_new_start);
-                chunk_new_start += BinChunk_size;
-                chunk_n = element_count - chunk_n;
-            } else {
-                chunk_end = ~0;
-                chunk_new_start = ~0;
-                chunk_n = element_count;
-            }
-            sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
-            sh_chunk_end[gl_LocalInvocationID.x] = chunk_end;
-            sh_chunk_jump[gl_LocalInvocationID.x] = chunk_new_start - chunk_end;
+            // TODO: aggregate atomic adds (subgroup is probably fastest)
+            chunk_start = atomicAdd(alloc, element_count * BinInstance_size);
+            sh_chunk_start[gl_LocalInvocationID.x] = chunk_start;
         }
+        uint out_ix = (my_tile * N_TILE + gl_LocalInvocationID.x) * 2;
+        bins[out_ix] = element_count;
+        bins[out_ix + 1] = chunk_start;
 
         barrier();
         // Use similar strategy as Laine & Karras paper; loop over bbox of bins
@@ -219,9 +189,6 @@ void main() {
                     idx += count[my_slice - 1][bin_ix];
                 }
                 uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
-                if (out_offset >= sh_chunk_end[bin_ix]) {
-                    out_offset += sh_chunk_jump[bin_ix];
-                }
                 BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge));
             }
             x++;
@@ -231,5 +198,4 @@ void main() {
             }
         }
     }
-    BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0)));
 }
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index e932e4d..9f22a33 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index d951b24..b58a327 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 65bbe5c..5f0f6be 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -160,7 +160,7 @@ impl<D: Device> Renderer<D> {
 
         let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
         let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
-        let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
+        let bin_buf = device.create_buffer(64 * 1024 * 1024, host)?;
         let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
         let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
 
@@ -176,12 +176,12 @@ impl<D: Device> Renderer<D> {
         let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
 
         // TODO: constants
-        let bin_alloc_start = 256 * 64 * N_WG;
+        let bin_alloc_start = ((n_elements + 255) & !255) * 8;
         device
             .write_buffer(&bin_alloc_buf_host, &[
                 n_elements as u32,
                 0,
-                bin_alloc_start,
+                bin_alloc_start as u32,
             ])
             ?;
         let bin_code = include_bytes!("../shader/binning.spv");
@@ -268,18 +268,22 @@ impl<D: Device> Renderer<D> {
         );
         cmd_buf.write_timestamp(&query_pool, 2);
         cmd_buf.memory_barrier();
+        /*
         cmd_buf.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds,
             (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
         );
+        */
         cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
+        /*
         cmd_buf.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
             ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
         );
+        */
         cmd_buf.write_timestamp(&query_pool, 4);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);