diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 0ddeb7e..1413927 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -15,24 +15,18 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-// This is for scanning forward for right_edge data.
-layout(set = 0, binding = 1) buffer StateBuf {
-    uint[] state;
-};
-
-layout(set = 0, binding = 2) buffer AllocBuf {
+layout(set = 0, binding = 1) buffer AllocBuf {
     uint n_elements; // paths
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 3) buffer BinsBuf {
+layout(set = 0, binding = 2) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
-#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
@@ -52,12 +46,6 @@ shared uint sh_chunk_start[N_TILE];
 
 shared float sh_right_edge[N_TILE];
 
-#define StateBuf_stride (8 + 2 * State_size)
-
-uint state_right_edge_index(uint partition_ix) {
-    return 2 + partition_ix * (StateBuf_stride / 4);
-}
-
 void main() {
     uint chunk_n = 0;
     uint my_n_elements = n_elements;
@@ -103,37 +91,6 @@ void main() {
             break;
         }
 
-        /*
-        // If the last element in this partition is a fill edge, then we need to do a
-        // look-forward to find the right edge of its corresponding fill. That data is
-        // recorded in aggregates computed in the element processing pass.
-        if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
-            uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
-            // This is sequential but the expectation is that the amount of
-            // look-forward is small (performance may degrade in the case
-            // of massively complex paths).
-            do {
-                my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
-                aggregate_ix++;
-            } while (isinf(my_right_edge));
-        }
-
-        // Now propagate right_edge backward, from fill to segment.
-        for (uint i = 0; i < LG_N_TILE; i++) {
-            // Note: we could try to cut down on write bandwidth here if the value hasn't
-            // changed, but not sure it's worth the complexity to track.
-            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
-            barrier();
-            if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
-                my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
-            }
-            barrier();
-        }
-        if (crosses_edge) {
-            x1 = int(ceil(my_right_edge * SX));
-        }
-        */
-
         // At this point, we run an iterator over the coverage area,
         // trying to keep divergence low.
         // Right now, it's just a bbox, but we'll get finer with
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 0d30dfc..1b31cd1 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 905f875..1886907 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -167,9 +167,7 @@ shared uint sh_flags[WG_SIZE];
 shared uint sh_path_count[WG_SIZE];
 shared uint sh_pathseg_count[WG_SIZE];
 
-shared uint sh_min_fill;
-
-shared uint sh_tile_ix;
+shared uint sh_part_ix;
 shared State sh_prefix;
 
 void main() {
@@ -177,28 +175,21 @@ void main() {
     // Determine partition to process by atomic counter (described in Section
     // 4.4 of prefix sum paper).
     if (gl_LocalInvocationID.x == 0) {
-        sh_tile_ix = atomicAdd(state[0], 1);
-        sh_min_fill = ~0;
+        sh_part_ix = atomicAdd(state[0], 1);
     }
     barrier();
-    uint tile_ix = sh_tile_ix;
+    uint part_ix = sh_part_ix;
 
-    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
     bool is_fill;
-    uint my_min_fill = ~0;
     th_state[0] = map_element(ref, is_fill);
-    if (is_fill) my_min_fill = ix;
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
         th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
-        if (is_fill && my_min_fill == ~0) {
-            my_min_fill = ix + i;
-        }
     }
-    atomicMin(sh_min_fill, my_min_fill);
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
@@ -243,17 +234,17 @@ void main() {
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
         // Note: with memory model, we'd want to generate the atomic store version of this.
-        State_write(state_aggregate_ref(tile_ix), agg);
+        State_write(state_aggregate_ref(part_ix), agg);
         uint flag = FLAG_AGGREGATE_READY;
         memoryBarrierBuffer();
-        if (tile_ix == 0) {
-            State_write(state_prefix_ref(tile_ix), agg);
+        if (part_ix == 0) {
+            State_write(state_prefix_ref(part_ix), agg);
             flag = FLAG_PREFIX_READY;
         }
-        state[state_flag_index(tile_ix)] = flag;
-        if (tile_ix != 0) {
+        state[state_flag_index(part_ix)] = flag;
+        if (part_ix != 0) {
             // step 4 of paper: decoupled lookback
-            uint look_back_ix = tile_ix - 1;
+            uint look_back_ix = part_ix - 1;
             while (true) {
                 flag = state[state_flag_index(look_back_ix)];
                 if (flag == FLAG_PREFIX_READY) {
@@ -271,15 +262,14 @@ void main() {
             // step 5 of paper: compute inclusive prefix
             State inclusive_prefix = combine_state(exclusive, agg);
             sh_prefix = exclusive;
-            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
+            State_write(state_prefix_ref(part_ix), inclusive_prefix);
             memoryBarrierBuffer();
             flag = FLAG_PREFIX_READY;
-            state[state_flag_index(tile_ix)] = flag;
+            state[state_flag_index(part_ix)] = flag;
         }
     }
     barrier();
-    my_min_fill = sh_min_fill;
-    if (tile_ix != 0) {
+    if (part_ix != 0) {
         exclusive = sh_prefix;
     }
 
@@ -296,14 +286,9 @@ void main() {
         other.pathseg_count = sh_pathseg_count[ix];
         row = combine_state(row, other);
     }
-    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
-        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
-    }
     for (uint i = 0; i < N_ROWS; i++) {
         State st = combine_state(row, th_state[i]);
-        if (my_min_fill == ix + i) {
-            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
-        }
+
        // We write the state now for development purposes, but the
         // actual goal is to write transformed and annotated elements.
         //State_write(StateRef((ix + i) * State_size), st);
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 552956c..5bd0650 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 7c0a1fc..324df71 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -237,10 +237,10 @@ impl Renderer {
             &[n_paths as u32, 0, bin_alloc_start as u32],
         )?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;
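
Aside (not part of the patch): the part_ix rename in elements.comp touches the decoupled look-back of the single-pass prefix sum paper ("step 4" and "step 5" in the shader comments). The sketch below is a minimal GLSL illustration of that protocol, not the shader's actual code: the FLAG_* values, the three-slot-per-partition state layout, the flag/aggregate/prefix index helpers, and the plain commutative uint payload standing in for the State struct are all assumptions made for the sketch. As the shader's own note says, without the Vulkan memory model the flag stores would ideally be atomic.

// Sketch only: assumes a coherent/volatile SSBO and a toy uint payload.
layout(set = 0, binding = 0) volatile coherent buffer StateBuf {
    uint[] state;
};

#define FLAG_NOT_READY       0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY    2

// Hypothetical layout: three uint slots (flag, aggregate, prefix) per partition.
uint flag_index(uint ix)      { return ix * 3; }
uint aggregate_index(uint ix) { return ix * 3 + 1; }
uint prefix_index(uint ix)    { return ix * 3 + 2; }

// Run by one thread per partition once the local aggregate `agg` is known;
// returns the exclusive prefix of all partitions before part_ix.
uint decoupled_look_back(uint part_ix, uint agg) {
    if (part_ix == 0) {
        // First partition: its inclusive prefix is just its aggregate.
        state[prefix_index(0)] = agg;
        memoryBarrierBuffer();
        state[flag_index(0)] = FLAG_PREFIX_READY;
        return 0;
    }
    // Publish our aggregate first so successors can make partial progress.
    state[aggregate_index(part_ix)] = agg;
    memoryBarrierBuffer();
    state[flag_index(part_ix)] = FLAG_AGGREGATE_READY;
    // Step 4: walk left, folding in aggregates, until a finished prefix is found.
    uint exclusive = 0;
    uint look_back_ix = part_ix - 1;
    while (true) {
        uint flag = state[flag_index(look_back_ix)];
        if (flag == FLAG_PREFIX_READY) {
            // Predecessor's inclusive prefix is final: done looking back.
            exclusive += state[prefix_index(look_back_ix)];
            break;
        } else if (flag == FLAG_AGGREGATE_READY) {
            // Only its local aggregate is ready; fold it in and keep walking.
            exclusive += state[aggregate_index(look_back_ix)];
            look_back_ix--;
        }
        // FLAG_NOT_READY: spin; the predecessor will publish eventually.
    }
    // Step 5: publish our inclusive prefix so later partitions can stop here.
    state[prefix_index(part_ix)] = exclusive + agg;
    memoryBarrierBuffer();
    state[flag_index(part_ix)] = FLAG_PREFIX_READY;
    return exclusive;
}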