diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs
index b93e9f3..35076f0 100644
--- a/piet-gpu-types/src/state.rs
+++ b/piet-gpu-types/src/state.rs
@@ -8,7 +8,6 @@ piet_gpu! {
             translate: [f32; 2],
             bbox: [f32; 4],
             linewidth: f32,
-            right_edge: f32,
             flags: u32,
         }
     }
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 138621e..713a654 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -50,10 +50,10 @@ shared uint sh_chunk_jump[N_TILE];
 
 shared float sh_right_edge[N_TILE];
 
-#define StateBuf_stride (4 + 2 * State_size)
+#define StateBuf_stride (8 + 2 * State_size) // bytes per partition (divided by 4 below to index state[]): 8 + two States; NOTE(review): the 8 is presumably flag word + right-edge word, confirm against elements.comp
 
-StateRef state_aggregate_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride);
+uint state_right_edge_index(uint partition_ix) { // returns an index into state[] (used as state[...] in main), not a StateRef byte offset
+    return 2 + partition_ix * (StateBuf_stride / 4); // NOTE(review): must be the slot elements.comp writes at state_flag_index(tile_ix) + 1, confirm against state_flag_index
 }
 
 void main() {
@@ -120,8 +120,7 @@ void main() {
             // look-forward is small (performance may degrade in the case
             // of massively complex paths).
             do {
-                StateRef agg_ref = state_aggregate_ref(aggregate_ix);
-                my_right_edge = State_read(agg_ref).right_edge;
+                my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
                 aggregate_ix++;
             } while (isinf(my_right_edge));
         }
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index dc1713b..e932e4d 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 3b6b963..14c72aa 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -12,7 +12,7 @@ build image.spv: glsl image.comp | scene.h
 
 build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 
-build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
+build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
 
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 2389e27..03c4535 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -310,6 +310,30 @@ void main() {
 
             switch (tag) {
             case Annotated_FillLine:
+                AnnoFillLineSeg fill_line = Annotated_FillLine_read(ref);
+                // This is basically the same logic as piet-metal, but should be made numerically robust.
+                vec2 tile_xy = vec2(tile_x * TILE_WIDTH_PX, tile_y * TILE_HEIGHT_PX);
+                float yEdge = mix(fill_line.p0.y, fill_line.p1.y, (tile_xy.x - fill_line.p0.x) / (fill_line.p1.x - fill_line.p0.x));
+                if (min(fill_line.p0.x, fill_line.p1.x) < tile_xy.x && yEdge >= tile_xy.y && yEdge < tile_xy.y + TILE_HEIGHT_PX) {
+                    Segment edge_seg;
+                    if (fill_line.p0.x > fill_line.p1.x) {
+                        fill_line.p1 = vec2(tile_xy.x, yEdge);
+                        edge_seg.start = fill_line.p1;
+                        edge_seg.end = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
+                    } else {
+                        fill_line.p0 = vec2(tile_xy.x, yEdge);
+                        edge_seg.start = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
+                        edge_seg.end = fill_line.p0;
+                    }
+                    alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
+                    Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), edge_seg);
+                    chunk_n_segs++;
+                }
+                Segment fill_seg = Segment(fill_line.p0, fill_line.p1);
+                alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
+                Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), fill_seg);
+                chunk_n_segs++;
+                break;
             case Annotated_StrokeLine:
                 AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
                 Segment seg = Segment(line.p0, line.p1);
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index bc097e4..3d3f3ff 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 341952b..bdb4e0d 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -34,14 +34,14 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
 #include "state.h"
 #include "annotated.h"
 
-#define StateBuf_stride (4 + 2 * State_size)
+#define StateBuf_stride (8 + 2 * State_size) // per-partition stride: 8 + room for two States (the aggregate and the prefix, see the *_ref helpers below)
 
 StateRef state_aggregate_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride);
+    return StateRef(12 + partition_ix * StateBuf_stride); // base is 4 more than before; NOTE(review): presumably makes room for the right-edge word main() writes at state_flag_index(tile_ix) + 1
 }
 
 StateRef state_prefix_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride + State_size);
+    return StateRef(12 + partition_ix * StateBuf_stride + State_size); // same base as state_aggregate_ref plus State_size: the prefix State sits right after the aggregate State
 }
 
 uint state_flag_index(uint partition_ix) {
@@ -81,13 +81,12 @@ State combine_state(State a, State b) {
     c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
     c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
     c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
-    c.right_edge = (a.flags & FLAG_SET_BBOX) != 0 ? a.right_edge : (a.flags & FLAG_RESET_BBOX) != 0 ? a.bbox.z : c.right_edge;
     c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
     c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
     return c;
 }
 
-State map_element(ElementRef ref) {
+State map_element(ElementRef ref, inout bool is_fill) {
     // TODO: it would *probably* be more efficient to make the memory read patterns less
     // divergent, though it would be more wasted memory.
     uint tag = Element_tag(ref);
@@ -97,6 +96,7 @@ State map_element(ElementRef ref) {
     c.translate = vec2(0.0, 0.0);
     c.linewidth = 1.0; // TODO should be 0.0
     c.flags = 0;
+    is_fill = false;
     switch (tag) {
     case Element_FillLine:
     case Element_StrokeLine:
@@ -115,6 +115,8 @@ State map_element(ElementRef ref) {
         c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
         break;
     case Element_Fill:
+        is_fill = true;
+        // fall-through
     case Element_Stroke:
         c.flags = FLAG_RESET_BBOX;
         break;
@@ -145,9 +147,10 @@ shared vec4 sh_mat[WG_SIZE];
 shared vec2 sh_translate[WG_SIZE];
 shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
-shared float sh_right_edge[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 
+shared uint sh_min_fill;
+
 shared uint sh_tile_ix;
 shared State sh_prefix;
 
@@ -157,6 +160,7 @@ void main() {
     // 4.4 of prefix sum paper).
     if (gl_LocalInvocationID.x == 0) {
         sh_tile_ix = atomicAdd(state[0], 1);
+        sh_min_fill = ~0;
     }
     barrier();
     uint tile_ix = sh_tile_ix;
@@ -164,18 +168,24 @@ void main() {
     uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
-    th_state[0] = map_element(ref);
+    bool is_fill;
+    uint my_min_fill = ~0;
+    th_state[0] = map_element(ref, is_fill);
+    if (is_fill) my_min_fill = ix;
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
+        if (is_fill && my_min_fill == ~0) {
+            my_min_fill = ix + i;
+        }
     }
+    atomicMin(sh_min_fill, my_min_fill);
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
     sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
     sh_width[gl_LocalInvocationID.x] = agg.linewidth;
-    sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
     sh_flags[gl_LocalInvocationID.x] = agg.flags;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
@@ -194,7 +204,6 @@ void main() {
         sh_translate[gl_LocalInvocationID.x] = agg.translate;
         sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
         sh_width[gl_LocalInvocationID.x] = agg.linewidth;
-        sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
         sh_flags[gl_LocalInvocationID.x] = agg.flags;
     }
 
@@ -203,7 +212,6 @@ void main() {
     exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
     exclusive.translate = vec2(0.0, 0.0);
     exclusive.linewidth = 1.0; //TODO should be 0.0
-    exclusive.right_edge = 0.0;
     exclusive.flags = 0;
 
     // Publish aggregate for this partition
@@ -244,6 +252,7 @@ void main() {
         }
     }
     barrier();
+    my_min_fill = sh_min_fill;
     if (tile_ix != 0) {
         exclusive = sh_prefix;
     }
@@ -256,12 +265,17 @@ void main() {
         other.translate = sh_translate[ix];
         other.bbox = sh_bbox[ix];
         other.linewidth = sh_width[ix];
-        other.right_edge = sh_right_edge[ix];
         other.flags = sh_flags[ix];
         row = combine_state(row, other);
     }
+    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
+        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
+    }
     for (uint i = 0; i < N_ROWS; i++) {
         State st = combine_state(row, th_state[i]);
+        if (my_min_fill == ix + i) {
+            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
+        }
         // We write the state now for development purposes, but the
         // actual goal is to write transformed and annotated elements.
         //State_write(StateRef((ix + i) * State_size), st);
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index c947240..962bd0a 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 6b00661..b913086 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -58,7 +58,7 @@
 
 // This is the ratio of the number of elements in a binning workgroup
 // over the number of elements in a partition workgroup.
-#define ELEMENT_BINNING_RATIO 4
+#define ELEMENT_BINNING_RATIO 2
 
 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h
index bc6192f..2547b93 100644
--- a/piet-gpu/shader/state.h
+++ b/piet-gpu/shader/state.h
@@ -9,11 +9,10 @@ struct State {
     vec2 translate;
     vec4 bbox;
     float linewidth;
-    float right_edge;
     uint flags;
 };
 
-#define State_size 52
+#define State_size 48
 
 StateRef State_index(StateRef ref, uint index) {
     return StateRef(ref.offset + index * State_size);
@@ -33,14 +32,12 @@ State State_read(StateRef ref) {
     uint raw9 = state[ix + 9];
     uint raw10 = state[ix + 10];
     uint raw11 = state[ix + 11];
-    uint raw12 = state[ix + 12];
     State s;
     s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
     s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
     s.linewidth = uintBitsToFloat(raw10);
-    s.right_edge = uintBitsToFloat(raw11);
-    s.flags = raw12;
+    s.flags = raw11;
     return s;
 }
 
@@ -57,7 +54,6 @@ void State_write(StateRef ref, State s) {
     state[ix + 8] = floatBitsToUint(s.bbox.z);
     state[ix + 9] = floatBitsToUint(s.bbox.w);
     state[ix + 10] = floatBitsToUint(s.linewidth);
-    state[ix + 11] = floatBitsToUint(s.right_edge);
-    state[ix + 12] = s.flags;
+    state[ix + 11] = s.flags;
 }
 
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 65bbe5c..70b02f5 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -46,8 +46,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
         let circle = Circle::new(center, radius);
         rc.fill(circle, &color);
     }
-    /*
     let mut path = BezPath::new();
+    /*
     path.move_to((100.0, 1150.0));
     path.line_to((200.0, 1200.0));
     path.line_to((150.0, 1250.0));
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 8d68b0c..da234de 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -215,6 +215,7 @@ impl PietGpuRenderContext {
                 match el {
                     PathEl::MoveTo(p) => {
                         let scene_pt = to_f32_2(p);
+                        start_pt = Some(scene_pt);
                         last_pt = Some(scene_pt);
                     }
                     PathEl::LineTo(p) => {
@@ -228,11 +229,13 @@ impl PietGpuRenderContext {
                     }
                     PathEl::ClosePath => {
                         if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            let seg = LineSeg {
-                                p0: last,
-                                p1: start,
-                            };
-                            self.encode_line_seg(seg, is_fill);
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.encode_line_seg(seg, is_fill);
+                            }
                         }
                     }
                     _ => (),
@@ -246,6 +249,7 @@ impl PietGpuRenderContext {
                 match el {
                     PathEl::MoveTo(p) => {
                         let scene_pt = to_f32_2(p);
+                        start_pt = Some(scene_pt);
                         last_pt = Some(scene_pt);
                     }
                     PathEl::LineTo(p) => {
@@ -283,11 +287,13 @@ impl PietGpuRenderContext {
                     }
                     PathEl::ClosePath => {
                         if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            let seg = LineSeg {
-                                p0: last,
-                                p1: start,
-                            };
-                            self.encode_line_seg(seg, is_fill);
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.encode_line_seg(seg, is_fill);
+                            }
                         }
                     }
                 }