diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 2d0a976..d37a2c6 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -221,8 +221,10 @@ void main() {
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
         // Note: with memory model, we'd want to generate the atomic store version of this.
         State_write(state_aggregate_ref(part_ix), agg);
+    }
+    memoryBarrierBuffer();
+    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
         uint flag = FLAG_AGGREGATE_READY;
-        memoryBarrierBuffer();
         if (part_ix == 0) {
             State_write(state_prefix_ref(part_ix), agg);
             flag = FLAG_PREFIX_READY;
@@ -274,11 +276,12 @@ void main() {
             State inclusive_prefix = combine_state(exclusive, agg);
             sh_prefix = exclusive;
             State_write(state_prefix_ref(part_ix), inclusive_prefix);
-            memoryBarrierBuffer();
-            flag = FLAG_PREFIX_READY;
-            state[state_flag_index(part_ix)] = flag;
         }
     }
+    memoryBarrierBuffer();
+    if (gl_LocalInvocationID.x == WG_SIZE - 1 && part_ix != 0) {
+        state[state_flag_index(part_ix)] = FLAG_PREFIX_READY;
+    }
     barrier();
     if (part_ix != 0) {
         exclusive = sh_prefix;
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 8a4bda1..d1fd39a 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 3230c0b..e2f86f6 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -8,9 +8,6 @@
 
 #version 450
 #extension GL_GOOGLE_include_directive : enable
-#ifdef ENABLE_IMAGE_INDICES
-#extension GL_EXT_nonuniform_qualifier : enable
-#endif
 
 #include "mem.h"
 #include "setup.h"
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index cfe5f28..cb41a4b 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -15,9 +15,6 @@
 
 #define PTCL_INITIAL_ALLOC 1024
 
-// This is now set in the ninja file during compilation
-//#define ENABLE_IMAGE_INDICES
-
 // These should probably be renamed and/or reworked. In the binning
 // kernel, they represent the number of bins. Also, the workgroup size
 // of that kernel is equal to the number of bins, but should probably