diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 672b42d..f37f0cd 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
     piet_gpu::dump_k1_data(&data);
 
     let mut data: Vec<u32> = Default::default();
-    device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+    device.read_buffer(&renderer.anno_buf, &mut data).unwrap();
     piet_gpu::dump_k1_data(&data);
     */
 
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 94084c8..76d56b6 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -18,10 +18,10 @@
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
     uint[] scene;
 };
 
-// This will be used for inter-workgroup aggregates. In the
-// meantime, for development, it has been used to store the
-// scan of the state objects.
-layout(set = 0, binding = 1) buffer StateBuf {
+// It would be better to use the Vulkan memory model than
+// "volatile" but shooting for compatibility here rather
+// than doing things right.
+layout(set = 0, binding = 1) volatile buffer StateBuf {
     uint[] state;
 };
@@ -34,8 +34,28 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
 #include "state.h"
 #include "annotated.h"
 
+#define StateBuf_stride (4 + 2 * State_size)
+
+StateRef state_aggregate_ref(uint partition_ix) {
+    return StateRef(8 + partition_ix * StateBuf_stride);
+}
+
+StateRef state_prefix_ref(uint partition_ix) {
+    return StateRef(8 + partition_ix * StateBuf_stride + State_size);
+}
+
+uint state_flag_index(uint partition_ix) {
+    return 1 + partition_ix * (StateBuf_stride / 4);
+}
+
+// These correspond to X, A, P respectively in the prefix sum paper.
+#define FLAG_NOT_READY 0
+#define FLAG_AGGREGATE_READY 1
+#define FLAG_PREFIX_READY 2
+
 #define FLAG_SET_LINEWIDTH 1
-#define FLAG_RESET_BBOX 2
+#define FLAG_SET_BBOX 2
+#define FLAG_RESET_BBOX 4
 
 // This is almost like a monoid (the interaction between transformation and
 // bounding boxes is approximate)
@@ -47,7 +67,9 @@ State combine_state(State a, State b) {
     c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
     if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
         c.bbox = a.bbox;
-    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
+    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
+        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
+    {
         c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
         c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
     }
@@ -59,7 +81,7 @@ State combine_state(State a, State b) {
     c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
     c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
     c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
-    c.flags = a.flags | b.flags;
+    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
     return c;
 }
 
@@ -122,10 +144,18 @@ shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 
+shared uint sh_tile_ix;
+shared State sh_prefix;
+
 void main() {
     State th_state[N_ROWS];
-    // this becomes an atomic counter
-    uint tile_ix = gl_WorkGroupID.x;
+    // Determine partition to process by atomic counter (described in Section
+    // 4.4 of prefix sum paper).
+    if (gl_LocalInvocationID.x == 0) {
+        sh_tile_ix = atomicAdd(state[0], 1);
+    }
+    barrier();
+    uint tile_ix = sh_tile_ix;
     uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
@@ -162,16 +192,54 @@ void main() {
         sh_flags[gl_LocalInvocationID.x] = agg.flags;
     }
 
-    // TODO: if last invocation in wg, publish agg.
-
-    barrier();
     State exclusive;
     exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
     exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
     exclusive.translate = vec2(0.0, 0.0);
     exclusive.linewidth = 1.0; //TODO should be 0.0
     exclusive.flags = 0;
-    // TODO: do decoupled look-back
+
+    // Publish aggregate for this partition
+    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+        // Note: with memory model, we'd want to generate the atomic store version of this.
+        State_write(state_aggregate_ref(tile_ix), agg);
+        uint flag = FLAG_AGGREGATE_READY;
+        memoryBarrierBuffer();
+        if (tile_ix == 0) {
+            State_write(state_prefix_ref(tile_ix), agg);
+            flag = FLAG_PREFIX_READY;
+        }
+        state[state_flag_index(tile_ix)] = flag;
+        if (tile_ix != 0) {
+            // step 4 of paper: decoupled lookback
+            uint look_back_ix = tile_ix - 1;
+            while (true) {
+                flag = state[state_flag_index(look_back_ix)];
+                if (flag == FLAG_PREFIX_READY) {
+                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
+                    exclusive = combine_state(their_prefix, exclusive);
+                    break;
+                } else if (flag == FLAG_AGGREGATE_READY) {
+                    State their_agg = State_read(state_aggregate_ref(look_back_ix));
+                    exclusive = combine_state(their_agg, exclusive);
+                    look_back_ix--;
+                }
+                // else spin
+            }
+
+            // step 5 of paper: compute inclusive prefix
+            State inclusive_prefix = combine_state(exclusive, agg);
+            sh_prefix = exclusive;
+            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
+            memoryBarrierBuffer();
+            flag = FLAG_PREFIX_READY;
+            state[state_flag_index(tile_ix)] = flag;
+        }
+    }
+    barrier();
+    if (tile_ix != 0) {
+        exclusive = sh_prefix;
+    }
 
     State row = exclusive;
     if (gl_LocalInvocationID.x > 0) {
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index f858129..a19b5f8 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 0ac8299..bf7a7c7 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -158,7 +158,7 @@ impl Renderer {
             .unwrap();
         device.write_buffer(&scene_buf, &scene)?;
 
-        let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
+        let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
         let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
@@ -245,6 +245,7 @@ impl Renderer {
         cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
         cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
         cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
+        cmd_buf.clear_buffer(&self.state_buf);
        cmd_buf.memory_barrier();
         cmd_buf.image_barrier(
             &self.image_dev,
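
A note on the StateBuf layout introduced above: word 0 of the buffer is the atomic partition counter, and each partition then gets one flag word followed by two State records, the published aggregate and the published inclusive prefix. Taking the partition index from an atomicAdd rather than gl_WorkGroupID.x is what makes the spin loop safe: partition numbers are handed out in the order workgroups actually start executing, so a partition only ever waits on partitions that are already in flight. The sketch below restates the layout as host-side arithmetic; it is illustrative rather than part of the patch, and State_size = 48 bytes is an assumption about the packed State record in the generated state.h.

    // Hypothetical host-side mirror of the shader-side defines; not in this commit.
    const STATE_SIZE: u64 = 48; // assumed size of a packed State record
    const STATE_BUF_STRIDE: u64 = 4 + 2 * STATE_SIZE; // flag word + aggregate + prefix

    // Byte offsets corresponding to state_flag_index, state_aggregate_ref,
    // and state_prefix_ref (word 0 is the atomic partition counter).
    fn flag_offset(partition_ix: u64) -> u64 {
        4 + partition_ix * STATE_BUF_STRIDE
    }

    fn aggregate_offset(partition_ix: u64) -> u64 {
        8 + partition_ix * STATE_BUF_STRIDE
    }

    fn prefix_offset(partition_ix: u64) -> u64 {
        8 + partition_ix * STATE_BUF_STRIDE + STATE_SIZE
    }

    // Minimum StateBuf size for n partitions; the 1 MiB allocation in lib.rs
    // covers roughly 10,000 partitions at this stride.
    fn state_buf_size(n_partitions: u64) -> u64 {
        4 + n_partitions * STATE_BUF_STRIDE
    }

This arithmetic also explains the new clear_buffer call in lib.rs: zero-filling the buffer before the pipeline runs resets the partition counter to zero and every flag word to FLAG_NOT_READY in one stroke, which is exactly the initial condition the look-back protocol depends on.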