diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index 2aa869e..bdf342b 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -19,8 +19,7 @@ piet_gpu! {
             rgba_color: u32,
         }
         struct CmdFill {
-            // Should be Ref<FillSegChunk>
-            seg_ref: u32,
+            seg_ref: Ref<SegChunk>,
             backdrop: i32,
             rgba_color: u32,
         }
@@ -58,6 +57,10 @@ piet_gpu! {
         struct Segment {
             start: [f32; 2],
             end: [f32; 2],
+
+            // Y at which the segment crosses the tile's left edge (1e9 if it doesn't).
+            // Used for fills only, but kept in the general structure for simplicity.
+            y_edge: f32,
         }
 
         struct SegChunk {
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 347cf01..fe8c4ac 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -87,6 +87,29 @@ fn trace_ptcl(buf: &[u32]) {
                 let tag = buf[tile_offset / 4];
                 match tag {
                     0 => break,
+                    3 => {
+                        // CmdFill.backdrop is an i32; reinterpret the raw word as signed so
+                        // negative backdrops don't print as huge unsigned values.
+                        let backdrop = buf[tile_offset / 4 + 2] as i32;
+                        let rgba_color = buf[tile_offset / 4 + 3];
+                        println!("  {:x}: fill {:x} {}", tile_offset, rgba_color, backdrop);
+                        // Dump every chunk in the list; a zero `next` ref terminates it.
+                        let mut seg_chunk = buf[tile_offset / 4 + 1] as usize;
+                        while seg_chunk != 0 {
+                            let n = buf[seg_chunk / 4] as usize;
+                            let segs = buf[seg_chunk / 4 + 2] as usize;
+                            println!("    chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
+                            for i in 0..n {
+                                let x0 = f32::from_bits(buf[segs / 4 + i * 5]);
+                                let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]);
+                                let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]);
+                                let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]);
+                                let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]);
+                                println!("      ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge);
+                            }
+                            seg_chunk = buf[seg_chunk / 4 + 1] as usize;
+                        }
+                    }
                     4 => {
                         let line_width = f32::from_bits(buf[tile_offset / 4 + 2]);
                         let rgba_color = buf[tile_offset / 4 + 3];
@@ -96,11 +119,12 @@ fn trace_ptcl(buf: &[u32]) {
                         let segs = buf[seg_chunk / 4 + 2] as usize;
                         println!("    chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
                         for i in 0..n {
-                            let x0 = f32::from_bits(buf[segs / 4 + i * 4]);
-                            let y0 = f32::from_bits(buf[segs / 4 + i * 4 + 1]);
-                            let x1 = f32::from_bits(buf[segs / 4 + i * 4 + 2]);
-                            let y1 = f32::from_bits(buf[segs / 4 + i * 4 + 3]);
-                            println!("      ({:.3}, {:.3}) - ({:.3}, {:.3})", x0, y0, x1, y1);
+                            let x0 = f32::from_bits(buf[segs / 4 + i * 5]);
+                            let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]);
+                            let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]);
+                            let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]);
+                            let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]);
+                            println!("      ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge);
                         }
                         loop {
                             seg_chunk = buf[seg_chunk / 4 + 1] as usize;
@@ -160,8 +184,8 @@ fn main() -> Result<(), Error> {
         /*
         let mut data: Vec<u32> = Default::default();
         device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
-        piet_gpu::dump_k1_data(&data);
-        //trace_ptcl(&data);
+        //piet_gpu::dump_k1_data(&data);
+        trace_ptcl(&data);
         */
 
         let mut img_data: Vec<u8> = Default::default();
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 81ea890..3a73417 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -51,7 +51,6 @@ shared uint sh_is_segment[N_SLICE];
 // Count of total number of segments in each tile, then
 // inclusive prefix sum of same.
 shared uint sh_seg_count[N_TILE];
-shared uint sh_orig_seg_count[N_TILE];
 shared uint sh_seg_alloc;
 
 // scale factors useful for converting coordinates to tiles
@@ -295,7 +294,6 @@ void main() {
             seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
         }
         sh_seg_count[th_ix] = seg_count;
-        sh_orig_seg_count[th_ix] = seg_count;
         // Prefix sum of sh_seg_count
         for (uint i = 0; i < LG_N_TILE; i++) {
             barrier();
@@ -359,8 +357,23 @@ void main() {
             uint rd_el_ix = (rd_ix + slice_ix * 32 + bit_ix) % N_RINGBUF;
             uint element_ix = sh_elements[rd_el_ix];
             ref = AnnotatedRef(element_ix * Annotated_size);
-            AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
-            Segment seg = Segment(line.p0, line.p1);
+            AnnoFillLineSeg line = Annotated_FillLine_read(ref);
+            float y_edge = 0.0;
+            // Same logic as piet-metal. TODO: make numerically robust (the division below is by zero when p0.x == p1.x).
+            if (Annotated_tag(ref) == Annotated_FillLine) {
+                vec2 tile_xy = xy0 + vec2((tile_ix % N_TILE_X) * TILE_WIDTH_PX, (tile_ix / N_TILE_X) * TILE_HEIGHT_PX);
+                y_edge = mix(line.p0.y, line.p1.y, (tile_xy.x - line.p0.x) / (line.p1.x - line.p0.x));
+                if (min(line.p0.x, line.p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) {
+                    if (line.p0.x > line.p1.x) {
+                        line.p1 = vec2(tile_xy.x, y_edge);
+                    } else {
+                        line.p0 = vec2(tile_xy.x, y_edge);
+                    }
+                } else {
+                    y_edge = 1e9;
+                }
+            }
+            Segment seg = Segment(line.p0, line.p1, y_edge);
             Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
         }
 
@@ -411,33 +424,50 @@ void main() {
 
             switch (tag) {
             case Annotated_Fill:
-                /*
-                if (seg_count > 0) {
+                if (last_chunk_n > 0 || seg_count > 0) {
+                    SegChunkRef chunk_ref = SegChunkRef(0);
+                    if (seg_count > 0) {
+                        chunk_ref = alloc_seg_chunk();
+                        SegChunk chunk;
+                        chunk.n = seg_count;
+                        chunk.next = SegChunkRef(0);
+                        uint seg_offset = seg_alloc + seg_start * Segment_size;
+                        chunk.segs = SegmentRef(seg_offset);
+                        SegChunk_write(chunk_ref, chunk);
+                    }
+                    if (last_chunk_n > 0) {
+                        SegChunk chunk;
+                        chunk.n = last_chunk_n;
+                        chunk.next = chunk_ref;
+                        chunk.segs = last_chunk_segs;
+                        SegChunk_write(last_chunk_ref, chunk);
+                    } else {
+                        first_seg_chunk = chunk_ref;
+                    }
+
                     AnnoFill fill = Annotated_Fill_read(ref);
-                    SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
-                    seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
                     CmdFill cmd_fill;
-                    cmd_fill.seg_ref = first_seg_chunk.offset;
+                    cmd_fill.seg_ref = first_seg_chunk;
                     cmd_fill.backdrop = backdrop;
                     cmd_fill.rgba_color = fill.rgba_color;
                     alloc_cmd(cmd_ref, cmd_limit);
                     Cmd_Fill_write(cmd_ref, cmd_fill);
                     cmd_ref.offset += Cmd_size;
-                    chunk_n_segs = 0;
+                    last_chunk_n = 0;
                 } else if (backdrop != 0) {
                     AnnoFill fill = Annotated_Fill_read(ref);
                     alloc_cmd(cmd_ref, cmd_limit);
                     Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
                     cmd_ref.offset += Cmd_size;
                 }
-                */
-                backdrop = 0;
+                seg_start += seg_count;
                 seg_count = 0;
+                backdrop = 0;
                 break;
             case Annotated_Stroke:
+                // TODO: reduce divergence & code duplication? Much of the
+                // fill and stroke processing is in common.
                 if (last_chunk_n > 0 || seg_count > 0) {
-                    // TODO: noncontiguous case
-
                     SegChunkRef chunk_ref = SegChunkRef(0);
                     if (seg_count > 0) {
                         chunk_ref = alloc_seg_chunk();
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index 069209e..56337ef 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 4c4aba3..1abcc2b 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -68,11 +68,12 @@ void main() {
             CmdFill fill = Cmd_Fill_read(cmd_ref);
             // Probably better to store as float, but conversion is no doubt cheap.
             float area = float(fill.backdrop);
-            SegChunkRef fill_seg_chunk_ref = SegChunkRef(fill.seg_ref);
+            SegChunkRef fill_seg_chunk_ref = fill.seg_ref;
             do {
                 SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref);
+                SegmentRef segs = seg_chunk.segs;
                 for (int i = 0; i < seg_chunk.n; i++) {
-                    Segment seg = Segment_read(SegmentRef(fill_seg_chunk_ref.offset + SegChunk_size + Segment_size * i));
+                    Segment seg = Segment_read(Segment_index(segs, i));
                     vec2 start = seg.start - xy;
                     vec2 end = seg.end - xy;
                     vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
@@ -87,6 +88,7 @@ void main() {
                         float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
                         area += a * (window.x - window.y);
                     }
+                    area += sign(end.x - start.x) * clamp(xy.y - seg.y_edge + 1.0, 0.0, 1.0);
                 }
                 fill_seg_chunk_ref = seg_chunk.next;
             } while (fill_seg_chunk_ref.offset != 0);
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 02bd137..8060f1f 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 2026b46..dd1f9a8 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -80,7 +80,7 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
 }
 
 struct CmdFill {
-    uint seg_ref;
+    SegChunkRef seg_ref;
     int backdrop;
     uint rgba_color;
 };
@@ -152,9 +152,10 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
 struct Segment {
     vec2 start;
     vec2 end;
+    float y_edge;
 };
 
-#define Segment_size 16
+#define Segment_size 20
 
 SegmentRef Segment_index(SegmentRef ref, uint index) {
     return SegmentRef(ref.offset + index * Segment_size);
@@ -238,7 +239,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
     uint raw1 = ptcl[ix + 1];
     uint raw2 = ptcl[ix + 2];
     CmdFill s;
-    s.seg_ref = raw0;
+    s.seg_ref = SegChunkRef(raw0);
     s.backdrop = int(raw1);
     s.rgba_color = raw2;
     return s;
@@ -246,7 +247,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
 
 void CmdFill_write(CmdFillRef ref, CmdFill s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.seg_ref;
+    ptcl[ix + 0] = s.seg_ref.offset;
     ptcl[ix + 1] = uint(s.backdrop);
     ptcl[ix + 2] = s.rgba_color;
 }
@@ -399,9 +400,11 @@ Segment Segment_read(SegmentRef ref) {
     uint raw1 = ptcl[ix + 1];
     uint raw2 = ptcl[ix + 2];
     uint raw3 = ptcl[ix + 3];
+    uint raw4 = ptcl[ix + 4];
     Segment s;
     s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.y_edge = uintBitsToFloat(raw4);
     return s;
 }
 
@@ -411,6 +414,7 @@ void Segment_write(SegmentRef ref, Segment s) {
     ptcl[ix + 1] = floatBitsToUint(s.start.y);
     ptcl[ix + 2] = floatBitsToUint(s.end.x);
     ptcl[ix + 3] = floatBitsToUint(s.end.y);
+    ptcl[ix + 4] = floatBitsToUint(s.y_edge);
 }
 
 SegChunk SegChunk_read(SegChunkRef ref) {
diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs
index 9cf5cc3..b2f054c 100644
--- a/piet-gpu/src/pico_svg.rs
+++ b/piet-gpu/src/pico_svg.rs
@@ -61,8 +61,8 @@ impl PicoSvg {
         for item in &self.items {
             match item {
                 Item::Fill(fill_item) => {
-                    //rc.fill(&fill_item.path, &fill_item.color);
-                    rc.stroke(&fill_item.path, &fill_item.color, 1.0);
+                    rc.fill(&fill_item.path, &fill_item.color);
+                    //rc.stroke(&fill_item.path, &fill_item.color, 1.0);
                 }
                 Item::Stroke(stroke_item) => {
                     rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);