diff --git a/piet-gpu-types/src/annotated.rs b/piet-gpu-types/src/annotated.rs
index 247ab12..f7a6ad6 100644
--- a/piet-gpu-types/src/annotated.rs
+++ b/piet-gpu-types/src/annotated.rs
@@ -3,7 +3,14 @@ use piet_gpu_derive::piet_gpu;
 piet_gpu! {
     #[gpu_write]
     mod annotated {
-        struct AnnoLineSeg {
+        struct AnnoFillLineSeg {
+            p0: [f32; 2],
+            p1: [f32; 2],
+            // Note: this struct shares its leading fields with
+            // AnnoStrokeLineSeg. Fill segments are actually written using that
+            // layout, with [0.0, 0.0] as the stroke field, to minimize divergence.
+        }
+        struct AnnoStrokeLineSeg {
             p0: [f32; 2],
             p1: [f32; 2],
             // halfwidth in both x and y for binning
@@ -35,8 +42,8 @@ piet_gpu! {
         }
         enum Annotated {
             Nop,
-            // The segments need a flag to indicate fill/stroke
-            Line(AnnoLineSeg),
+            FillLine(AnnoFillLineSeg),
+            StrokeLine(AnnoStrokeLineSeg),
             Quad(AnnoQuadSeg),
             Cubic(AnnoCubicSeg),
             Stroke(AnnoStroke),
diff --git a/piet-gpu-types/src/bins.rs b/piet-gpu-types/src/bins.rs
index 88f16f1..1ac2413 100644
--- a/piet-gpu-types/src/bins.rs
+++ b/piet-gpu-types/src/bins.rs
@@ -7,6 +7,9 @@ piet_gpu! {
     mod bins {
         struct BinInstance {
             element_ix: u32,
+            // Right edge (maximum x) of the bounding box of the associated
+            // fill element; used in backdrop computation.
+            right_edge: f32,
         }
 
         struct BinChunk {
diff --git a/piet-gpu-types/src/scene.rs b/piet-gpu-types/src/scene.rs
index 7451c9c..5792c94 100644
--- a/piet-gpu-types/src/scene.rs
+++ b/piet-gpu-types/src/scene.rs
@@ -85,8 +85,15 @@ piet_gpu! {
         }
         enum Element {
             Nop,
-            // The segments need a flag to indicate fill/stroke
-            Line(LineSeg),
+            // Another approach to encoding would be to use a single
+            // variant but have a bool for fill/stroke. This could be
+            // packed into the tag, so the on-the-wire representation
+            // would be very similar to what's here.
+            StrokeLine(LineSeg),
+            FillLine(LineSeg),
+
+            // Note: we'll need to handle the stroke/fill distinction
+            // for these as well, when we do flattening on the GPU.
             Quad(QuadSeg),
             Cubic(CubicSeg),
             Stroke(Stroke),
diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h
index a3fc464..9812264 100644
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@@ -1,6 +1,10 @@
 // Code auto-generated by piet-gpu-derive
 
-struct AnnoLineSegRef {
+struct AnnoFillLineSegRef {
+    uint offset;
+};
+
+struct AnnoStrokeLineSegRef {
     uint offset;
 };
 
@@ -24,16 +28,27 @@ struct AnnotatedRef {
     uint offset;
 };
 
-struct AnnoLineSeg {
+struct AnnoFillLineSeg {
+    vec2 p0;
+    vec2 p1;
+};
+
+#define AnnoFillLineSeg_size 16
+
+AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
+    return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size);
+}
+
+struct AnnoStrokeLineSeg {
     vec2 p0;
     vec2 p1;
     vec2 stroke;
 };
 
-#define AnnoLineSeg_size 24
+#define AnnoStrokeLineSeg_size 24
 
-AnnoLineSegRef AnnoLineSeg_index(AnnoLineSegRef ref, uint index) {
-    return AnnoLineSegRef(ref.offset + index * AnnoLineSeg_size);
+AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) {
+    return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size);
 }
 
 struct AnnoQuadSeg {
@@ -87,18 +102,39 @@ AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
 }
 
 #define Annotated_Nop 0
-#define Annotated_Line 1
-#define Annotated_Quad 2
-#define Annotated_Cubic 3
-#define Annotated_Stroke 4
-#define Annotated_Fill 5
+#define Annotated_FillLine 1
+#define Annotated_StrokeLine 2
+#define Annotated_Quad 3
+#define Annotated_Cubic 4
+#define Annotated_Stroke 5
+#define Annotated_Fill 6
 #define Annotated_size 44
 
 AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
     return AnnotatedRef(ref.offset + index * Annotated_size);
 }
 
-AnnoLineSeg AnnoLineSeg_read(AnnoLineSegRef ref) {
+AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = annotated[ix + 0];
+    uint raw1 = annotated[ix + 1];
+    uint raw2 = annotated[ix + 2];
+    uint raw3 = annotated[ix + 3];
+    AnnoFillLineSeg s;
+    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) {
+    uint ix = ref.offset >> 2;
+    annotated[ix + 0] = floatBitsToUint(s.p0.x);
+    annotated[ix + 1] = floatBitsToUint(s.p0.y);
+    annotated[ix + 2] = floatBitsToUint(s.p1.x);
+    annotated[ix + 3] = floatBitsToUint(s.p1.y);
+}
+
+AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = annotated[ix + 0];
     uint raw1 = annotated[ix + 1];
@@ -106,14 +142,14 @@ AnnoLineSeg AnnoLineSeg_read(AnnoLineSegRef ref) {
     uint raw3 = annotated[ix + 3];
     uint raw4 = annotated[ix + 4];
     uint raw5 = annotated[ix + 5];
-    AnnoLineSeg s;
+    AnnoStrokeLineSeg s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
     return s;
 }
 
-void AnnoLineSeg_write(AnnoLineSegRef ref, AnnoLineSeg s) {
+void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) {
     uint ix = ref.offset >> 2;
     annotated[ix + 0] = floatBitsToUint(s.p0.x);
     annotated[ix + 1] = floatBitsToUint(s.p0.y);
@@ -239,8 +275,12 @@ uint Annotated_tag(AnnotatedRef ref) {
     return annotated[ref.offset >> 2];
 }
 
-AnnoLineSeg Annotated_Line_read(AnnotatedRef ref) {
-    return AnnoLineSeg_read(AnnoLineSegRef(ref.offset + 4));
+AnnoFillLineSeg Annotated_FillLine_read(AnnotatedRef ref) {
+    return AnnoFillLineSeg_read(AnnoFillLineSegRef(ref.offset + 4));
+}
+
+AnnoStrokeLineSeg Annotated_StrokeLine_read(AnnotatedRef ref) {
+    return AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef(ref.offset + 4));
 }
 
 AnnoQuadSeg Annotated_Quad_read(AnnotatedRef ref) {
@@ -263,9 +303,14 @@ void Annotated_Nop_write(AnnotatedRef ref) {
     annotated[ref.offset >> 2] = Annotated_Nop;
 }
 
-void Annotated_Line_write(AnnotatedRef ref, AnnoLineSeg s) {
-    annotated[ref.offset >> 2] = Annotated_Line;
-    AnnoLineSeg_write(AnnoLineSegRef(ref.offset + 4), s);
+void Annotated_FillLine_write(AnnotatedRef ref, AnnoFillLineSeg s) {
+    annotated[ref.offset >> 2] = Annotated_FillLine;
+    AnnoFillLineSeg_write(AnnoFillLineSegRef(ref.offset + 4), s);
+}
+
+void Annotated_StrokeLine_write(AnnotatedRef ref, AnnoStrokeLineSeg s) {
+    annotated[ref.offset >> 2] = Annotated_StrokeLine;
+    AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(ref.offset + 4), s);
 }
 
 void Annotated_Quad_write(AnnotatedRef ref, AnnoQuadSeg s) {
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index c3067e7..713a654 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -11,24 +11,35 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-layout(set = 0, binding = 1) buffer AllocBuf {
+// State buffer from the element processing pass; read here to scan forward for right_edge data.
+layout(set = 0, binding = 1) buffer StateBuf {
+    uint[] state;
+};
+
+layout(set = 0, binding = 2) buffer AllocBuf {
     uint n_elements;
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 2) buffer BinsBuf {
+layout(set = 0, binding = 3) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
+#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
 #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
 
+#define TSY (1.0 / float(TILE_HEIGHT_PX))
+
+// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
+#define INFINITY (1.0 / 0.0)
+
 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
@@ -37,6 +48,14 @@ shared uint sh_chunk_start[N_TILE];
 shared uint sh_chunk_end[N_TILE];
 shared uint sh_chunk_jump[N_TILE];
 
+shared float sh_right_edge[N_TILE];
+
+#define StateBuf_stride (8 + 2 * State_size) // keep in sync with StateBuf_stride in elements.comp
+
+uint state_right_edge_index(uint partition_ix) { // index into state[] of a partition's right-edge word
+    return 2 + partition_ix * (StateBuf_stride / 4); // stride / 4: presumably bytes -> uint words; confirm
+}
+
 void main() {
     BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
     uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
@@ -65,13 +84,17 @@ void main() {
             tag = Annotated_tag(ref);
         }
         int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
+        float my_right_edge = INFINITY;
+        bool crosses_edge = false;
         switch (tag) {
-        case Annotated_Line:
-            AnnoLineSeg line = Annotated_Line_read(ref);
+        case Annotated_FillLine:
+        case Annotated_StrokeLine:
+            AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
             x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
             y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
             x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
             y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
+            crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY);
             break;
         case Annotated_Fill:
         case Annotated_Stroke:
@@ -82,8 +105,41 @@ void main() {
             y0 = int(floor(fill.bbox.y * SY));
             x1 = int(ceil(fill.bbox.z * SX));
             y1 = int(ceil(fill.bbox.w * SY));
+            // It probably makes more sense to track x1, to avoid having to redo
+            // the rounding to tile coords.
+            my_right_edge = fill.bbox.z;
             break;
         }
+
+        // If the last element in this partition is a fill edge, then we need to do a
+        // look-forward to find the right edge of its corresponding fill. That data is
+        // recorded in aggregates computed in the element processing pass.
+        if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
+            uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO;
+            // This is sequential, but the amount of look-forward is expected to be
+            // small (performance may degrade for massively complex paths).
+            // NOTE(review): nothing bounds aggregate_ix; a finite right edge must follow.
+            do {
+                my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
+                aggregate_ix++;
+            } while (isinf(my_right_edge));
+        }
+
+        // Now propagate right_edge backward, from fill to segment.
+        for (uint i = 0; i < LG_N_TILE; i++) {
+            // Note: we could try to cut down on write bandwidth here if the value hasn't
+            // changed, but not sure it's worth the complexity to track.
+            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
+            barrier();
+            if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
+                my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
+            }
+            barrier();
+        }
+        if (crosses_edge) {
+            x1 = int(ceil(my_right_edge * SX));
+        }
+
         // At this point, we run an iterator over the coverage area,
         // trying to keep divergence low.
         // Right now, it's just a bbox, but we'll get finer with
@@ -118,9 +174,9 @@ void main() {
             uint chunk_new_start;
             // Refactor to reduce code duplication?
             if (chunk_n > 0) {
-                uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4;
-                if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) {
-                    uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4);
+                uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * BinInstance_size;
+                if (next_chunk + BinChunk_size + min(24, element_count * BinInstance_size) > wr_limit) {
+                    uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * BinInstance_size);
                     // could try to reduce fragmentation if BIN_ALLOC is only a bit above needed
                     next_chunk = atomicAdd(alloc, alloc_amount);
                     wr_limit = next_chunk + alloc_amount;
@@ -129,10 +185,10 @@ void main() {
                 chunk_ref = BinChunkRef(next_chunk);
             }
             BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
-            if (instance_ref.offset + element_count * 4 > wr_limit) {
+            if (instance_ref.offset + element_count * BinInstance_size > wr_limit) {
                 chunk_end = wr_limit;
-                chunk_n = (wr_limit - instance_ref.offset) / 4;
-                uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4);
+                chunk_n = (wr_limit - instance_ref.offset) / BinInstance_size;
+                uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * BinInstance_size);
                 chunk_new_start = atomicAdd(alloc, alloc_amount);
                 wr_limit = chunk_new_start + alloc_amount;
                 BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start)));
@@ -141,6 +197,7 @@ void main() {
                 chunk_n = element_count - chunk_n;
             } else {
                 chunk_end = ~0;
+                chunk_new_start = ~0;
                 chunk_n = element_count;
             }
             sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
@@ -161,11 +218,11 @@ void main() {
                 if (my_slice > 0) {
                     idx += count[my_slice - 1][bin_ix];
                 }
-                uint out_offset = sh_chunk_start[bin_ix] + idx * 4;
+                uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
                 if (out_offset >= sh_chunk_end[bin_ix]) {
                     out_offset += sh_chunk_jump[bin_ix];
                 }
-                BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix));
+                BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge));
             }
             x++;
             if (x == x1) {
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 76148c2..e932e4d 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h
index 3ce06e0..85f7536 100644
--- a/piet-gpu/shader/bins.h
+++ b/piet-gpu/shader/bins.h
@@ -10,9 +10,10 @@ struct BinChunkRef {
 
 struct BinInstance {
     uint element_ix;
+    float right_edge;
 };
 
-#define BinInstance_size 4
+#define BinInstance_size 8
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
     return BinInstanceRef(ref.offset + index * BinInstance_size);
@@ -32,14 +33,17 @@ BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
 BinInstance BinInstance_read(BinInstanceRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = bins[ix + 0];
+    uint raw1 = bins[ix + 1];
     BinInstance s;
     s.element_ix = raw0;
+    s.right_edge = uintBitsToFloat(raw1);
     return s;
 }
 
 void BinInstance_write(BinInstanceRef ref, BinInstance s) {
     uint ix = ref.offset >> 2;
     bins[ix + 0] = s.element_ix;
+    bins[ix + 1] = floatBitsToUint(s.right_edge);
 }
 
 BinChunk BinChunk_read(BinChunkRef ref) {
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 3b6b963..14c72aa 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -12,7 +12,7 @@ build image.spv: glsl image.comp | scene.h
 
 build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 
-build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
+build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
 
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index e331076..c77c6b8 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -30,6 +30,7 @@ layout(set = 0, binding = 3) buffer PtclBuf {
 #define N_RINGBUF 512
 
 shared uint sh_elements[N_RINGBUF];
+shared float sh_right_edge[N_RINGBUF];
 shared uint sh_chunk[N_WG];
 shared uint sh_chunk_next[N_WG];
 shared uint sh_chunk_n[N_WG];
@@ -41,6 +42,8 @@ shared uint sh_selected_n;
 shared uint sh_elements_ref;
 
 shared uint sh_bitmaps[N_SLICE][N_TILE];
+shared uint sh_backdrop[N_SLICE][N_TILE];
+shared uint sh_bd_sign[N_SLICE];
 
 // scale factors useful for converting coordinates to tiles
 #define SX (1.0 / float(TILE_WIDTH_PX))
@@ -77,6 +80,14 @@ void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref,
     }
 }
 
+// Compute the signed backdrop delta encoded by a pair of bitmaps.
+//
+// Each bit set in bd_bitmap contributes +1 if the same bit is set in bd_sign
+// and -1 if it is clear; bits clear in bd_bitmap contribute nothing.
+int count_backdrop(uint bd_bitmap, uint bd_sign) {
+    return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
+}
+
 void main() {
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
@@ -109,10 +120,14 @@ void main() {
         sh_first_el[th_ix] = chunk.n > 0 ?
             BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
     }
-    uint count = 0;
+    if (th_ix < N_SLICE) {
+        sh_bd_sign[th_ix] = 0;
+    }
+    int backdrop = 0;
     while (true) {
         for (uint i = 0; i < N_SLICE; i++) {
             sh_bitmaps[i][th_ix] = 0;
+            sh_backdrop[i][th_ix] = 0;
         }
 
         while (wr_ix - rd_ix <= N_TILE) {
@@ -157,8 +172,10 @@ void main() {
             }
             BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
             if (th_ix < chunk_n) {
-                uint el = BinInstance_read(BinInstance_index(inst_ref, th_ix)).element_ix;
-                sh_elements[(wr_ix + th_ix) % N_RINGBUF] = el;
+                BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix));
+                uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF;
+                sh_elements[wr_el_ix] = inst.element_ix;
+                sh_right_edge[wr_el_ix] = inst.right_edge;
             }
             wr_ix += chunk_n;
         }
@@ -169,8 +186,11 @@ void main() {
         // Read one element, compute coverage.
         uint tag = Annotated_Nop;
         AnnotatedRef ref;
+        float right_edge = 0.0;
         if (th_ix + rd_ix < wr_ix) {
-            uint element_ix = sh_elements[(rd_ix + th_ix) % N_RINGBUF];
+            uint rd_el_ix = (rd_ix + th_ix) % N_RINGBUF;
+            uint element_ix = sh_elements[rd_el_ix];
+            right_edge = sh_right_edge[rd_el_ix];
             ref = AnnotatedRef(element_ix * Annotated_size);
             tag = Annotated_tag(ref);
         }
@@ -179,15 +199,26 @@ void main() {
         float a, b, c;
         // Bounding box of element in pixel coordinates.
         float xmin, xmax, ymin, ymax;
+        uint my_slice = th_ix / 32;
+        uint my_mask = 1 << (th_ix & 31);
         switch (tag) {
-        case Annotated_Line:
-            AnnoLineSeg line = Annotated_Line_read(ref);
+        case Annotated_FillLine:
+        case Annotated_StrokeLine:
+            AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
             xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
             xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
             ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
             ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
             float dx = line.p1.x - line.p0.x;
             float dy = line.p1.y - line.p0.y;
+            if (tag == Annotated_FillLine) {
+                // Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
+                if (dy < 0) {
+                    atomicOr(sh_bd_sign[my_slice], my_mask);
+                } else {
+                    atomicAnd(sh_bd_sign[my_slice], ~my_mask);
+                }
+            }
             // Set up for per-scanline coverage formula, below.
             float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
             c = abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y) * SX;
@@ -214,20 +245,20 @@ void main() {
             break;
         }
 
-        // Draw the coverage area into the bitmaks. This uses an algorithm
+        // Draw the coverage area into the bitmasks. This uses an algorithm
         // that computes the coverage of a span for given scanline.
 
         // Compute bounding box in tiles and clip to this bin.
         int x0 = int(floor((xmin - xy0.x) * SX));
         int x1 = int(ceil((xmax - xy0.x) * SX));
+        int xr = int(ceil((right_edge - xy0.x) * SX));
         int y0 = int(floor((ymin - xy0.y) * SY));
         int y1 = int(ceil((ymax - xy0.y) * SY));
         x0 = clamp(x0, 0, N_TILE_X);
         x1 = clamp(x1, x0, N_TILE_X);
+        xr = clamp(xr, 0, N_TILE_X);
         y0 = clamp(y0, 0, N_TILE_Y);
         y1 = clamp(y1, y0, N_TILE_Y);
-        uint my_slice = th_ix / 32;
-        uint my_mask = 1 << (th_ix & 31);
         float t = a + b * float(y0);
         for (uint y = y0; y < y1; y++) {
             uint xx0 = clamp(int(floor(t - c)), x0, x1);
@@ -235,6 +266,15 @@ void main() {
             for (uint x = xx0; x < xx1; x++) {
                 atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
             }
+            if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
+                // Assign backdrop to all tiles to the right of the ray crossing the
+                // top edge of this tile, up to the right edge of the fill bbox.
+                float xray = t - 0.5 * b;
+                xx0 = max(int(ceil(xray)), 0);
+                for (uint x = xx0; x < xr; x++) {
+                    atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
+                }
+            }
             t += b;
         }
         barrier();
@@ -242,13 +282,18 @@ void main() {
         // Output elements for this tile, based on bitmaps.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
+        uint bd_bitmap = sh_backdrop[0][th_ix];
+        uint bd_sign = sh_bd_sign[0];
         while (true) {
             if (bitmap == 0) {
+                backdrop += count_backdrop(bd_bitmap, bd_sign);
                 slice_ix++;
                 if (slice_ix == N_SLICE) {
                     break;
                 }
                 bitmap = sh_bitmaps[slice_ix][th_ix];
+                bd_bitmap = sh_backdrop[slice_ix][th_ix];
+                bd_sign = sh_bd_sign[slice_ix];
                 if (bitmap == 0) {
                     continue;
                 }
@@ -256,6 +301,13 @@ void main() {
             uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
             uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];
 
+            // Bits up to and including the lsb
+            uint bd_mask = (bitmap - 1) ^ bitmap;
+            backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
+            // Clear bits that have been consumed.
+            bd_bitmap &= ~bd_mask;
+            bitmap &= ~bd_mask;
+
             // At this point, we read the element again from global memory.
             // If that turns out to be expensive, maybe we can pack it into
             // shared memory (or perhaps just the tag).
@@ -263,15 +315,58 @@ void main() {
             tag = Annotated_tag(ref);
 
             switch (tag) {
-            case Annotated_Line:
-                AnnoLineSeg line = Annotated_Line_read(ref);
+            case Annotated_FillLine:
+                AnnoFillLineSeg fill_line = Annotated_FillLine_read(ref);
+                // Basically the same logic as piet-metal; should be made numerically robust (e.g. the division below is by zero when p0.x == p1.x).
+                vec2 tile_xy = vec2(tile_x * TILE_WIDTH_PX, tile_y * TILE_HEIGHT_PX);
+                float yEdge = mix(fill_line.p0.y, fill_line.p1.y, (tile_xy.x - fill_line.p0.x) / (fill_line.p1.x - fill_line.p0.x));
+                if (min(fill_line.p0.x, fill_line.p1.x) < tile_xy.x && yEdge >= tile_xy.y && yEdge < tile_xy.y + TILE_HEIGHT_PX) {
+                    Segment edge_seg;
+                    if (fill_line.p0.x > fill_line.p1.x) {
+                        fill_line.p1 = vec2(tile_xy.x, yEdge);
+                        edge_seg.start = fill_line.p1;
+                        edge_seg.end = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
+                    } else {
+                        fill_line.p0 = vec2(tile_xy.x, yEdge);
+                        edge_seg.start = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
+                        edge_seg.end = fill_line.p0;
+                    }
+                    alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
+                    Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), edge_seg);
+                    chunk_n_segs++;
+                }
+                Segment fill_seg = Segment(fill_line.p0, fill_line.p1);
+                alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
+                Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), fill_seg);
+                chunk_n_segs++;
+                break;
+            case Annotated_StrokeLine:
+                AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
                 Segment seg = Segment(line.p0, line.p1);
                 alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                 Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
                 chunk_n_segs++;
                 break;
             case Annotated_Fill:
-                chunk_n_segs = 0;
+                if (chunk_n_segs > 0) {
+                    AnnoFill fill = Annotated_Fill_read(ref);
+                    SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
+                    seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
+                    CmdFill cmd_fill;
+                    cmd_fill.seg_ref = first_seg_chunk.offset;
+                    cmd_fill.backdrop = backdrop;
+                    cmd_fill.rgba_color = fill.rgba_color;
+                    alloc_cmd(cmd_ref, cmd_limit);
+                    Cmd_Fill_write(cmd_ref, cmd_fill);
+                    cmd_ref.offset += Cmd_size;
+                    chunk_n_segs = 0;
+                } else if (backdrop != 0) {
+                    AnnoFill fill = Annotated_Fill_read(ref);
+                    alloc_cmd(cmd_ref, cmd_limit);
+                    Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
+                    cmd_ref.offset += Cmd_size;
+                }
+                backdrop = 0;
                 break;
             case Annotated_Stroke:
                 if (chunk_n_segs > 0) {
@@ -289,9 +384,6 @@ void main() {
                 }
                 break;
             }
-
-            // clear LSB
-            bitmap &= bitmap - 1;
         }
         barrier();
 
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index d61b227..f74d0a0 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 8f87b87..bdb4e0d 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -10,7 +10,7 @@
 #define N_ROWS 4
 #define WG_SIZE 32
 #define LG_WG_SIZE 5
-#define TILE_SIZE (WG_SIZE * N_ROWS)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
 
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 
@@ -34,14 +34,14 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
 #include "state.h"
 #include "annotated.h"
 
-#define StateBuf_stride (4 + 2 * State_size)
+#define StateBuf_stride (8 + 2 * State_size)
 
 StateRef state_aggregate_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride);
+    return StateRef(12 + partition_ix * StateBuf_stride); // 12-byte offset: presumably the counter word plus this partition's flag and right-edge words; confirm against state_flag_index
 }
 
 StateRef state_prefix_ref(uint partition_ix) {
-    return StateRef(8 + partition_ix * StateBuf_stride + State_size);
+    return StateRef(12 + partition_ix * StateBuf_stride + State_size); // the prefix State sits State_size bytes after the partition's aggregate State
 }
 
 uint state_flag_index(uint partition_ix) {
@@ -86,7 +86,7 @@ State combine_state(State a, State b) {
     return c;
 }
 
-State map_element(ElementRef ref) {
+State map_element(ElementRef ref, inout bool is_fill) {
     // TODO: it would *probably* be more efficient to make the memory read patterns less
     // divergent, though it would be more wasted memory.
     uint tag = Element_tag(ref);
@@ -96,9 +96,11 @@ State map_element(ElementRef ref) {
     c.translate = vec2(0.0, 0.0);
     c.linewidth = 1.0; // TODO should be 0.0
     c.flags = 0;
+    is_fill = false;
     switch (tag) {
-    case Element_Line:
-        LineSeg line = Element_Line_read(ref);
+    case Element_FillLine:
+    case Element_StrokeLine:
+        LineSeg line = Element_FillLine_read(ref);
         c.bbox.xy = min(line.p0, line.p1);
         c.bbox.zw = max(line.p0, line.p1);
         break;
@@ -113,6 +115,8 @@ State map_element(ElementRef ref) {
         c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
         break;
     case Element_Fill:
+        is_fill = true;
+        // fall-through
     case Element_Stroke:
         c.flags = FLAG_RESET_BBOX;
         break;
@@ -145,6 +149,8 @@ shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 
+shared uint sh_min_fill;
+
 shared uint sh_tile_ix;
 shared State sh_prefix;
 
@@ -154,19 +160,27 @@ void main() {
     // 4.4 of prefix sum paper).
     if (gl_LocalInvocationID.x == 0) {
         sh_tile_ix = atomicAdd(state[0], 1);
+        sh_min_fill = ~0;
     }
     barrier();
     uint tile_ix = sh_tile_ix;
 
-    uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
-    th_state[0] = map_element(ref);
+    bool is_fill;
+    uint my_min_fill = ~0;
+    th_state[0] = map_element(ref, is_fill);
+    if (is_fill) my_min_fill = ix;
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
+        if (is_fill && my_min_fill == ~0) {
+            my_min_fill = ix + i;
+        }
     }
+    atomicMin(sh_min_fill, my_min_fill);
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
@@ -238,6 +252,7 @@ void main() {
         }
     }
     barrier();
+    my_min_fill = sh_min_fill;
     if (tile_ix != 0) {
         exclusive = sh_prefix;
     }
@@ -253,8 +268,14 @@ void main() {
         other.flags = sh_flags[ix];
         row = combine_state(row, other);
     }
+    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
+        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
+    }
     for (uint i = 0; i < N_ROWS; i++) {
         State st = combine_state(row, th_state[i]);
+        if (my_min_fill == ix + i) {
+            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
+        }
         // We write the state now for development purposes, but the
         // actual goal is to write transformed and annotated elements.
         //State_write(StateRef((ix + i) * State_size), st);
@@ -266,13 +287,22 @@ void main() {
         AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size);
         uint tag = Element_tag(this_ref);
         switch (tag) {
-        case Element_Line:
-            LineSeg line = Element_Line_read(this_ref);
-            AnnoLineSeg anno_line;
+        case Element_FillLine:
+        case Element_StrokeLine:
+            LineSeg line = Element_StrokeLine_read(this_ref);
+            AnnoStrokeLineSeg anno_line;
             anno_line.p0 = st.mat.xz * line.p0.x + st.mat.yw * line.p0.y + st.translate;
             anno_line.p1 = st.mat.xz * line.p1.x + st.mat.yw * line.p1.y + st.translate;
-            anno_line.stroke = get_linewidth(st);
-            Annotated_Line_write(out_ref, anno_line);
+            if (tag == Element_StrokeLine) {
+                anno_line.stroke = get_linewidth(st);
+            } else {
+                anno_line.stroke = vec2(0.0);
+            }
+            // Encode the tag and payload by hand to minimize divergence. Another
+            // approach would be to have a fill/stroke bool.
+            uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine;
+            annotated[out_ref.offset >> 2] = out_tag;
+            AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line);
             break;
         case Element_Stroke:
             Stroke stroke = Element_Stroke_read(this_ref);
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 7828aa4..962bd0a 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h
index 84ef80d..5bb879b 100644
--- a/piet-gpu/shader/scene.h
+++ b/piet-gpu/shader/scene.h
@@ -238,13 +238,14 @@ TransformRef Transform_index(TransformRef ref, uint index) {
 }
 
 #define Element_Nop 0
-#define Element_Line 1
-#define Element_Quad 2
-#define Element_Cubic 3
-#define Element_Stroke 4
-#define Element_Fill 5
-#define Element_SetLineWidth 6
-#define Element_Transform 7
+#define Element_StrokeLine 1
+#define Element_FillLine 2
+#define Element_Quad 3
+#define Element_Cubic 4
+#define Element_Stroke 5
+#define Element_Fill 6
+#define Element_SetLineWidth 7
+#define Element_Transform 8
 #define Element_size 36
 
 ElementRef Element_index(ElementRef ref, uint index) {
@@ -446,7 +447,11 @@ uint Element_tag(ElementRef ref) {
     return scene[ref.offset >> 2];
 }
 
-LineSeg Element_Line_read(ElementRef ref) {
+LineSeg Element_StrokeLine_read(ElementRef ref) {
+    return LineSeg_read(LineSegRef(ref.offset + 4));
+}
+
+LineSeg Element_FillLine_read(ElementRef ref) {
     return LineSeg_read(LineSegRef(ref.offset + 4));
 }
 
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 5d8fb9b..b913086 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -51,9 +51,14 @@
 #define N_TILE_X 16
 #define N_TILE_Y 16
 #define N_TILE (N_TILE_X * N_TILE_Y)
+#define LG_N_TILE 8
 #define N_SLICE (N_TILE / 32)
 // Number of workgroups for binning kernel
 #define N_WG 16
 
+// This is the ratio of the number of elements in a binning workgroup
+// over the number of elements in a partition workgroup.
+#define ELEMENT_BINNING_RATIO 2
+
 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 2dca39d..70b02f5 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -185,10 +185,10 @@ impl<D: Device> Renderer<D> {
             ])
             ?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;
 
diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs
index 9cf5cc3..b2f054c 100644
--- a/piet-gpu/src/pico_svg.rs
+++ b/piet-gpu/src/pico_svg.rs
@@ -61,8 +61,8 @@ impl PicoSvg {
         for item in &self.items {
             match item {
                 Item::Fill(fill_item) => {
-                    //rc.fill(&fill_item.path, &fill_item.color);
-                    rc.stroke(&fill_item.path, &fill_item.color, 1.0);
+                    rc.fill(&fill_item.path, &fill_item.color);
+                    //rc.stroke(&fill_item.path, &fill_item.color, 1.0);
                 }
                 Item::Stroke(stroke_item) => {
                     rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index e01a6ae..da234de 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -94,7 +94,7 @@ impl RenderContext for PietGpuRenderContext {
         }
         let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
         let path = shape.to_bez_path(TOLERANCE);
-        self.encode_path(path);
+        self.encode_path(path, false);
         match brush {
             PietGpuBrush::Solid(rgba_color) => {
                 let stroke = Stroke { rgba_color };
@@ -116,7 +116,7 @@ impl RenderContext for PietGpuRenderContext {
     fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
         let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
         let path = shape.to_bez_path(TOLERANCE);
-        self.encode_path(path);
+        self.encode_path(path, true);
         match brush {
             PietGpuBrush::Solid(rgba_color) => {
                 let fill = Fill { rgba_color };
@@ -198,7 +198,15 @@ impl RenderContext for PietGpuRenderContext {
 }
 
 impl PietGpuRenderContext {
-    fn encode_path(&mut self, path: impl Iterator<Item = PathEl>) {
+    /// Pushes `seg` onto `self.elements` as `FillLine` if `is_fill`, else `StrokeLine`.
+    fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) {
+        self.elements.push(match is_fill {
+            true => Element::FillLine(seg),
+            false => Element::StrokeLine(seg),
+        });
+    }
+
+    fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
         let flatten = true;
         if flatten {
             let mut start_pt = None;
@@ -207,6 +215,7 @@ impl PietGpuRenderContext {
                 match el {
                     PathEl::MoveTo(p) => {
                         let scene_pt = to_f32_2(p);
+                        start_pt = Some(scene_pt);
                         last_pt = Some(scene_pt);
                     }
                     PathEl::LineTo(p) => {
@@ -215,16 +224,18 @@ impl PietGpuRenderContext {
                             p0: last_pt.unwrap(),
                             p1: scene_pt,
                         };
-                        self.elements.push(Element::Line(seg));
+                        self.encode_line_seg(seg, is_fill);
                         last_pt = Some(scene_pt);
                     }
                     PathEl::ClosePath => {
                         if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            let seg = LineSeg {
-                                p0: last,
-                                p1: start,
-                            };
-                            self.elements.push(Element::Line(seg));
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.encode_line_seg(seg, is_fill);
+                            }
                         }
                     }
                     _ => (),
@@ -238,6 +249,7 @@ impl PietGpuRenderContext {
                 match el {
                     PathEl::MoveTo(p) => {
                         let scene_pt = to_f32_2(p);
+                        start_pt = Some(scene_pt);
                         last_pt = Some(scene_pt);
                     }
                     PathEl::LineTo(p) => {
@@ -246,7 +258,7 @@ impl PietGpuRenderContext {
                             p0: last_pt.unwrap(),
                             p1: scene_pt,
                         };
-                        self.elements.push(Element::Line(seg));
+                        self.encode_line_seg(seg, is_fill);
                         last_pt = Some(scene_pt);
                     }
                     PathEl::QuadTo(p1, p2) => {
@@ -275,11 +287,13 @@ impl PietGpuRenderContext {
                     }
                     PathEl::ClosePath => {
                         if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                            let seg = LineSeg {
-                                p0: last,
-                                p1: start,
-                            };
-                            self.elements.push(Element::Line(seg));
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.encode_line_seg(seg, is_fill);
+                            }
                         }
                     }
                 }