diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index b0e4779..8bb9a4d 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -94,7 +94,7 @@ State combine_state(State a, State b) {
     return c;
 }
 
-State map_element(ElementRef ref, inout bool is_fill) {
+State map_element(ElementRef ref) {
     // TODO: it would *probably* be more efficient to make the memory read patterns less
     // divergent, though it would be more wasted memory.
     uint tag = Element_tag(ref);
@@ -106,7 +106,6 @@ State map_element(ElementRef ref, inout bool is_fill) {
     c.flags = 0;
     c.path_count = 0;
     c.pathseg_count = 0;
-    is_fill = false;
     switch (tag) {
     case Element_FillLine:
     case Element_StrokeLine:
@@ -132,8 +131,6 @@ State map_element(ElementRef ref, inout bool is_fill) {
     case Element_Fill:
     case Element_FillMask:
     case Element_FillMaskInv:
-        is_fill = true;
-        // fall-through
     case Element_Stroke:
         c.flags = FLAG_RESET_BBOX;
         c.path_count = 1;
@@ -185,12 +182,11 @@ void main() {
     uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
-    bool is_fill;
-    th_state[0] = map_element(ref, is_fill);
+    th_state[0] = map_element(ref);
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
     }
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
@@ -247,6 +243,9 @@ void main() {
     if (part_ix != 0) {
         // step 4 of paper: decoupled lookback
         uint look_back_ix = part_ix - 1;
+
+        State their_agg;
+        uint their_ix = 0;
         while (true) {
             flag = state[state_flag_index(look_back_ix)];
             if (flag == FLAG_PREFIX_READY) {
@@ -254,11 +253,33 @@ void main() {
                 exclusive = combine_state(their_prefix, exclusive);
                 break;
             } else if (flag == FLAG_AGGREGATE_READY) {
-                State their_agg = State_read(state_aggregate_ref(look_back_ix));
+                their_agg = State_read(state_aggregate_ref(look_back_ix));
                 exclusive = combine_state(their_agg, exclusive);
                 look_back_ix--;
+                their_ix = 0;
+                continue;
             }
             // else spin
+
+            // Unfortunately there's no guarantee of forward progress of other
+            // workgroups, so compute a bit of the aggregate before trying again.
+            // In the worst case, spinning stops when the aggregate is complete.
+            ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
+            State s = map_element(ref);
+            if (their_ix == 0) {
+                their_agg = s;
+            } else {
+                their_agg = combine_state(their_agg, s);
+            }
+            their_ix++;
+            if (their_ix == PARTITION_SIZE) {
+                exclusive = combine_state(their_agg, exclusive);
+                if (look_back_ix == 0) {
+                    break;
+                }
+                look_back_ix--;
+                their_ix = 0;
+            }
         }
 
         // step 5 of paper: compute inclusive prefix
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 599465a..fed60fb 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
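
Reviewer note, not part of the patch: the substantive change is in the decoupled look-back loop (the "step 4 of paper" comments appear to reference Merrill and Garland's single-pass prefix scan with decoupled look-back). Vulkan makes no forward-progress guarantee between workgroups, so spinning on a predecessor's flag can livelock if the predecessor's workgroup is never scheduled. The patch turns each spin iteration into useful work: it redundantly recomputes one element of the stalled partition's aggregate, so the wait is bounded by PARTITION_SIZE iterations per predecessor even if that partition never publishes. Below is a minimal sketch of the same pattern reduced to a plain additive scan of uints; the buffer names, bindings, FLAG_* encoding, and single-thread look-back are illustrative assumptions, not piet-gpu's actual state layout, and a production version would also need coherent/atomic semantics for the flag protocol.

#version 450

layout(local_size_x = 1) in;

#define PARTITION_SIZE 512

// Hypothetical per-partition flag protocol, analogous to the shader's.
#define FLAG_NOT_READY       0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY    2

layout(std430, set = 0, binding = 0) readonly buffer Inputs { uint inputs[]; };
layout(std430, set = 0, binding = 1) volatile buffer Flags { uint flag[]; };
layout(std430, set = 0, binding = 2) volatile buffer Aggregates { uint aggregate[]; };
layout(std430, set = 0, binding = 3) volatile buffer Prefixes { uint prefix[]; };

void main() {
    uint part_ix = gl_WorkGroupID.x;
    // ... scan this partition locally, publish aggregate[part_ix], then set
    // flag[part_ix] to FLAG_AGGREGATE_READY (PREFIX_READY for partition 0) ...

    uint exclusive = 0;
    if (part_ix != 0) {
        uint look_back_ix = part_ix - 1;
        uint their_agg = 0;
        uint their_ix = 0;
        while (true) {
            uint f = flag[look_back_ix];
            if (f == FLAG_PREFIX_READY) {
                // Predecessor published its full prefix: look-back is done.
                exclusive = prefix[look_back_ix] + exclusive;
                break;
            } else if (f == FLAG_AGGREGATE_READY) {
                // Fold in its aggregate and keep walking left. Partition 0
                // publishes PREFIX_READY directly, so this can't underflow.
                exclusive = aggregate[look_back_ix] + exclusive;
                look_back_ix--;
                their_agg = 0;
                their_ix = 0;
                continue;
            }
            // Flag not ready. Instead of spinning, recompute one element of
            // the stalled partition's aggregate, so the loop terminates even
            // if that workgroup never runs again.
            their_agg += inputs[look_back_ix * PARTITION_SIZE + their_ix];
            their_ix++;
            if (their_ix == PARTITION_SIZE) {
                // Rebuilt the predecessor's entire aggregate ourselves.
                exclusive = their_agg + exclusive;
                if (look_back_ix == 0) {
                    break;
                }
                look_back_ix--;
                their_agg = 0;
                their_ix = 0;
            }
        }
    }
    // ... combine exclusive with the local scan and publish prefix[part_ix],
    // then set flag[part_ix] to FLAG_PREFIX_READY ...
}

The trade-off is redundant computation instead of blocking: in the worst case a workgroup serially re-scans everything to its left, but termination then depends only on its own progress, not on the scheduler.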