diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 082d902..78c758b 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -30,9 +30,16 @@ layout(set = 0, binding = 3) buffer PtclBuf {
 
 #define N_RINGBUF 512
 
+#define LG_N_PART_READ 8
+#define N_PART_READ (1 << LG_N_PART_READ)
+
 shared uint sh_elements[N_RINGBUF];
 shared float sh_right_edge[N_RINGBUF];
 
+// Number of elements in the partition; prefix sum.
+shared uint sh_part_count[N_PART_READ];
+shared uint sh_part_elements[N_PART_READ];
+
 shared uint sh_bitmaps[N_SLICE][N_TILE];
 shared uint sh_backdrop[N_SLICE][N_TILE];
 shared uint sh_bd_sign[N_SLICE];
@@ -89,7 +96,7 @@ void main() {
     // invocations within the workgroup. We'll use variables to abstract.
     uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
     uint partition_ix = 0;
-    uint my_n_elements = n_elements;
+    uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
     // Top left coordinates of this bin.
     vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
     uint th_ix = gl_LocalInvocationID.x;
@@ -107,8 +114,14 @@ void main() {
     SegmentRef last_chunk_segs = SegmentRef(0);
     alloc_chunk_remaining = 0;
 
-    uint wr_ix = 0;
+    // I'm sure we can figure out how to do this with at least one fewer register...
+    // Items up to rd_ix have been read from sh_elements
     uint rd_ix = 0;
+    // Items up to wr_ix have been written into sh_elements
+    uint wr_ix = 0;
+    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
+    uint part_start_ix = 0;
+    uint ready_ix = 0;
     if (th_ix < N_SLICE) {
         sh_bd_sign[th_ix] = 0;
     }
@@ -122,21 +135,58 @@ void main() {
             sh_is_segment[th_ix] = 0;
         }
 
-        while (wr_ix - rd_ix <= N_TILE && partition_ix * N_TILE < my_n_elements) {
-            uint in_ix = (partition_ix * N_TILE + bin_ix) * 2;
-            uint chunk_n = bins[in_ix];
-            uint elements_ref = bins[in_ix + 1];
-            BinInstanceRef inst_ref = BinInstanceRef(elements_ref);
-            if (th_ix < chunk_n) {
-                BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix));
-                uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF;
+        // parallel read of input partitions
+        do {
+            if (ready_ix == wr_ix && partition_ix < n_partitions) {
+                part_start_ix = ready_ix;
+                uint count = 0;
+                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
+                    uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
+                    count = bins[in_ix];
+                    sh_part_elements[th_ix] = bins[in_ix + 1];
+                }
+                // prefix sum of counts
+                for (uint i = 0; i < LG_N_PART_READ; i++) {
+                    if (th_ix < N_PART_READ) {
+                        sh_part_count[th_ix] = count;
+                    }
+                    barrier();
+                    if (th_ix < N_PART_READ) {
+                        if (th_ix >= (1 << i)) {
+                            count += sh_part_count[th_ix - (1 << i)];
+                        }
+                    }
+                    barrier();
+                }
+                if (th_ix < N_PART_READ) {
+                    sh_part_count[th_ix] = part_start_ix + count;
+                }
+                barrier();
+                ready_ix = sh_part_count[N_PART_READ - 1];
+                partition_ix += N_PART_READ;
+            }
+            // use binary search to find element to read
+            uint ix = rd_ix + th_ix;
+            if (ix >= wr_ix && ix < ready_ix) {
+                uint part_ix = 0;
+                for (uint i = 0; i < LG_N_PART_READ; i++) {
+                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
+                    if (ix >= sh_part_count[probe - 1]) {
+                        part_ix = probe;
+                    }
+                }
+                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
+                BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
+                BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
+                uint wr_el_ix = (rd_ix + th_ix) % N_RINGBUF;
                 sh_elements[wr_el_ix] = inst.element_ix;
                 sh_right_edge[wr_el_ix] = inst.right_edge;
             }
-            wr_ix += chunk_n;
-            partition_ix++;
-        }
-        barrier();
+            barrier();
+
+            wr_ix = min(rd_ix + N_TILE, ready_ix);
+        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
+
 
         // We've done the merge and filled the buffer.
 
@@ -475,9 +525,7 @@ void main() {
         barrier();
 
         rd_ix += N_TILE;
-        // The second disjunct is there as a strange workaround on Nvidia. If it is
-        // removed, then the kernel fails with ERROR_DEVICE_LOST.
-        if (rd_ix >= wr_ix || bin_ix == ~0) break;
+        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
     }
     Cmd_End_write(cmd_ref);
 }
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index a3dd59e..252ff10 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ