diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs
index 35076f0..b93e9f3 100644
--- a/piet-gpu-types/src/state.rs
+++ b/piet-gpu-types/src/state.rs
@@ -8,6 +8,7 @@ piet_gpu! {
             translate: [f32; 2],
             bbox: [f32; 4],
             linewidth: f32,
+            right_edge: f32,
             flags: u32,
         }
     }
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index c3067e7..cba0217 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -11,24 +11,33 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-layout(set = 0, binding = 1) buffer AllocBuf {
+// This is for scanning forward for right_edge data.
+layout(set = 0, binding = 1) buffer StateBuf {
+    uint[] state;
+};
+
+layout(set = 0, binding = 2) buffer AllocBuf {
     uint n_elements;
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 2) buffer BinsBuf {
+layout(set = 0, binding = 3) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
+#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
 #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
 
+// GLSL has no built-in infinity constant. An alternative is uintBitsToFloat(0x7f800000).
+#define INFINITY (1.0 / 0.0)
+
 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
@@ -37,6 +46,14 @@ shared uint sh_chunk_start[N_TILE];
 shared uint sh_chunk_end[N_TILE];
 shared uint sh_chunk_jump[N_TILE];
 
+shared float sh_right_edge[N_TILE];
+
+#define StateBuf_stride (4 + 2 * State_size)  // per-partition step used when indexing the state buffer; NOTE(review): presumably bytes (4 + two State records) — confirm against the element pass
+
+StateRef state_aggregate_ref(uint partition_ix) {  // returns the StateRef from which binning reads the aggregate State of partition `partition_ix`
+    return StateRef(8 + partition_ix * StateBuf_stride);  // NOTE(review): the constant 8 and the stride must mirror the layout written by the element processing pass (not visible here) — confirm
+}
+
 void main() {
     BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
     uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
@@ -65,6 +82,7 @@ void main() {
             tag = Annotated_tag(ref);
         }
         int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
+        float my_right_edge = INFINITY;
         switch (tag) {
         case Annotated_Line:
             AnnoLineSeg line = Annotated_Line_read(ref);
@@ -82,8 +100,37 @@ void main() {
             y0 = int(floor(fill.bbox.y * SY));
             x1 = int(ceil(fill.bbox.z * SX));
             y1 = int(ceil(fill.bbox.w * SY));
+            my_right_edge = x1;
             break;
         }
+
+        // If the last element in this partition is a fill edge, then we need to do a
+        // look-forward to find the right edge of its corresponding fill. That data is
+        // recorded in aggregates computed in the element processing pass.
+        if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_Line) {
+            uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO;
+            // This is sequential but the expectation is that the amount of
+            // look-forward is small (performance may degrade in the case
+            // of massively complex paths).
+            do {
+                StateRef agg_ref = state_aggregate_ref(aggregate_ix);
+                my_right_edge = State_read(agg_ref).right_edge;
+                aggregate_ix++;
+            } while (isinf(my_right_edge));
+        }
+
+        // Now propagate right_edge backward, from fill to segment.
+        for (uint i = 0; i < LG_N_TILE; i++) {
+            // Note: we could try to cut down on write bandwidth here if the value hasn't
+            // changed, but not sure it's worth the complexity to track.
+            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
+            barrier();
+            if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
+                my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
+            }
+            barrier();
+        }
+
         // At this point, we run an iterator over the coverage area,
         // trying to keep divergence low.
         // Right now, it's just a bbox, but we'll get finer with
@@ -141,6 +188,7 @@ void main() {
                 chunk_n = element_count - chunk_n;
             } else {
                 chunk_end = ~0;
+                chunk_new_start = ~0;
                 chunk_n = element_count;
             }
             sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 76148c2..a5379e6 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 8f87b87..15ad80d 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -10,7 +10,7 @@
 #define N_ROWS 4
 #define WG_SIZE 32
 #define LG_WG_SIZE 5
-#define TILE_SIZE (WG_SIZE * N_ROWS)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
 
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 
@@ -81,6 +81,7 @@ State combine_state(State a, State b) {
     c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
     c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
     c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
+    c.right_edge = (a.flags & FLAG_SET_BBOX) != 0 ? a.right_edge : (a.flags & FLAG_RESET_BBOX) != 0 ? a.bbox.z : c.right_edge;
     c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
     c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
     return c;
@@ -143,6 +144,7 @@ shared vec4 sh_mat[WG_SIZE];
 shared vec2 sh_translate[WG_SIZE];
 shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
+shared float sh_right_edge[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 
 shared uint sh_tile_ix;
@@ -158,7 +160,7 @@ void main() {
     barrier();
     uint tile_ix = sh_tile_ix;
 
-    uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
     th_state[0] = map_element(ref);
@@ -172,6 +174,7 @@ void main() {
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
     sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
     sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+    sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
     sh_flags[gl_LocalInvocationID.x] = agg.flags;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
@@ -190,6 +193,7 @@ void main() {
         sh_translate[gl_LocalInvocationID.x] = agg.translate;
         sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
         sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+        sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
         sh_flags[gl_LocalInvocationID.x] = agg.flags;
     }
 
@@ -198,6 +202,7 @@ void main() {
     exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
     exclusive.translate = vec2(0.0, 0.0);
     exclusive.linewidth = 1.0; //TODO should be 0.0
+    exclusive.right_edge = 0.0;
     exclusive.flags = 0;
 
     // Publish aggregate for this partition
@@ -250,6 +255,7 @@ void main() {
         other.translate = sh_translate[ix];
         other.bbox = sh_bbox[ix];
         other.linewidth = sh_width[ix];
+        other.right_edge = sh_right_edge[ix];
         other.flags = sh_flags[ix];
         row = combine_state(row, other);
     }
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 7828aa4..ff0ae2d 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 5d8fb9b..6b00661 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -51,9 +51,14 @@
 #define N_TILE_X 16
 #define N_TILE_Y 16
 #define N_TILE (N_TILE_X * N_TILE_Y)
+#define LG_N_TILE 8
 #define N_SLICE (N_TILE / 32)
 // Number of workgroups for binning kernel
 #define N_WG 16
 
+// Ratio of the elements handled per binning workgroup (N_TILE = 256) to the
+// elements per partition workgroup in elements.comp (WG_SIZE * N_ROWS = 128).
+#define ELEMENT_BINNING_RATIO 2
+
 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h
index 2547b93..bc6192f 100644
--- a/piet-gpu/shader/state.h
+++ b/piet-gpu/shader/state.h
@@ -9,10 +9,11 @@ struct State {
     vec2 translate;
     vec4 bbox;
     float linewidth;
+    float right_edge;
     uint flags;
 };
 
-#define State_size 48
+#define State_size 52
 
 StateRef State_index(StateRef ref, uint index) {
     return StateRef(ref.offset + index * State_size);
@@ -32,12 +33,14 @@ State State_read(StateRef ref) {
     uint raw9 = state[ix + 9];
     uint raw10 = state[ix + 10];
     uint raw11 = state[ix + 11];
+    uint raw12 = state[ix + 12];
     State s;
     s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
     s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
     s.linewidth = uintBitsToFloat(raw10);
-    s.flags = raw11;
+    s.right_edge = uintBitsToFloat(raw11);
+    s.flags = raw12;
     return s;
 }
 
@@ -54,6 +57,7 @@ void State_write(StateRef ref, State s) {
     state[ix + 8] = floatBitsToUint(s.bbox.z);
     state[ix + 9] = floatBitsToUint(s.bbox.w);
     state[ix + 10] = floatBitsToUint(s.linewidth);
-    state[ix + 11] = s.flags;
+    state[ix + 11] = floatBitsToUint(s.right_edge);
+    state[ix + 12] = s.flags;
 }
 
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 2dca39d..70b02f5 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -185,10 +185,10 @@ impl<D: Device> Renderer<D> {
             ])
             ?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;