Mirror of https://github.com/italicsjenga/vello.git, synced 2025-01-09 20:31:29 +11:00
Merge pull request #12 from linebender/fills

Make fills work in sort-middle pipeline

Commit b5e96b5b87
@@ -3,7 +3,14 @@ use piet_gpu_derive::piet_gpu;
piet_gpu! {
#[gpu_write]
mod annotated {
struct AnnoLineSeg {
struct AnnoFillLineSeg {
p0: [f32; 2],
p1: [f32; 2],
// A note: the layout of this struct is shared with
// AnnoStrokeLineSeg. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
}
struct AnnoStrokeLineSeg {
p0: [f32; 2],
p1: [f32; 2],
// halfwidth in both x and y for binning
@@ -35,8 +42,8 @@ piet_gpu! {
}
enum Annotated {
Nop,
// The segments need a flag to indicate fill/stroke
Line(AnnoLineSeg),
FillLine(AnnoFillLineSeg),
StrokeLine(AnnoStrokeLineSeg),
Quad(AnnoQuadSeg),
Cubic(AnnoCubicSeg),
Stroke(AnnoStroke),
@@ -7,6 +7,9 @@ piet_gpu! {
mod bins {
struct BinInstance {
element_ix: u32,
// Right edge of the bounding box of the associated fill
// element; used in backdrop computation.
right_edge: f32,
}

struct BinChunk {
@@ -85,8 +85,15 @@ piet_gpu! {
}
enum Element {
Nop,
// The segments need a flag to indicate fill/stroke
Line(LineSeg),
// Another approach to encoding would be to use a single
// variant but have a bool for fill/stroke. This could be
// packed into the tag, so the on-the-wire representation
// would be very similar to what's here.
StrokeLine(LineSeg),
FillLine(LineSeg),

// Note: we'll need to handle the stroke/fill distinction
// for these as well, when we do flattening on the GPU.
Quad(QuadSeg),
Cubic(CubicSeg),
Stroke(Stroke),
@@ -1,6 +1,10 @@
// Code auto-generated by piet-gpu-derive

struct AnnoLineSegRef {
struct AnnoFillLineSegRef {
uint offset;
};

struct AnnoStrokeLineSegRef {
uint offset;
};

@@ -24,16 +28,27 @@ struct AnnotatedRef {
uint offset;
};

struct AnnoLineSeg {
struct AnnoFillLineSeg {
vec2 p0;
vec2 p1;
};

#define AnnoFillLineSeg_size 16

AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size);
}

struct AnnoStrokeLineSeg {
vec2 p0;
vec2 p1;
vec2 stroke;
};

#define AnnoLineSeg_size 24
#define AnnoStrokeLineSeg_size 24

AnnoLineSegRef AnnoLineSeg_index(AnnoLineSegRef ref, uint index) {
return AnnoLineSegRef(ref.offset + index * AnnoLineSeg_size);
AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) {
return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size);
}

struct AnnoQuadSeg {
@@ -87,18 +102,39 @@ AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
}

#define Annotated_Nop 0
#define Annotated_Line 1
#define Annotated_Quad 2
#define Annotated_Cubic 3
#define Annotated_Stroke 4
#define Annotated_Fill 5
#define Annotated_FillLine 1
#define Annotated_StrokeLine 2
#define Annotated_Quad 3
#define Annotated_Cubic 4
#define Annotated_Stroke 5
#define Annotated_Fill 6
#define Annotated_size 44

AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
return AnnotatedRef(ref.offset + index * Annotated_size);
}

AnnoLineSeg AnnoLineSeg_read(AnnoLineSegRef ref) {
AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
uint raw2 = annotated[ix + 2];
uint raw3 = annotated[ix + 3];
AnnoFillLineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}

void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.p0.x);
annotated[ix + 1] = floatBitsToUint(s.p0.y);
annotated[ix + 2] = floatBitsToUint(s.p1.x);
annotated[ix + 3] = floatBitsToUint(s.p1.y);
}

AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = annotated[ix + 0];
uint raw1 = annotated[ix + 1];
@@ -106,14 +142,14 @@ AnnoLineSeg AnnoLineSeg_read(AnnoLineSegRef ref) {
uint raw3 = annotated[ix + 3];
uint raw4 = annotated[ix + 4];
uint raw5 = annotated[ix + 5];
AnnoLineSeg s;
AnnoStrokeLineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
return s;
}

void AnnoLineSeg_write(AnnoLineSegRef ref, AnnoLineSeg s) {
void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) {
uint ix = ref.offset >> 2;
annotated[ix + 0] = floatBitsToUint(s.p0.x);
annotated[ix + 1] = floatBitsToUint(s.p0.y);
@@ -239,8 +275,12 @@ uint Annotated_tag(AnnotatedRef ref) {
return annotated[ref.offset >> 2];
}

AnnoLineSeg Annotated_Line_read(AnnotatedRef ref) {
return AnnoLineSeg_read(AnnoLineSegRef(ref.offset + 4));
AnnoFillLineSeg Annotated_FillLine_read(AnnotatedRef ref) {
return AnnoFillLineSeg_read(AnnoFillLineSegRef(ref.offset + 4));
}

AnnoStrokeLineSeg Annotated_StrokeLine_read(AnnotatedRef ref) {
return AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef(ref.offset + 4));
}

AnnoQuadSeg Annotated_Quad_read(AnnotatedRef ref) {
@@ -263,9 +303,14 @@ void Annotated_Nop_write(AnnotatedRef ref) {
annotated[ref.offset >> 2] = Annotated_Nop;
}

void Annotated_Line_write(AnnotatedRef ref, AnnoLineSeg s) {
annotated[ref.offset >> 2] = Annotated_Line;
AnnoLineSeg_write(AnnoLineSegRef(ref.offset + 4), s);
void Annotated_FillLine_write(AnnotatedRef ref, AnnoFillLineSeg s) {
annotated[ref.offset >> 2] = Annotated_FillLine;
AnnoFillLineSeg_write(AnnoFillLineSegRef(ref.offset + 4), s);
}

void Annotated_StrokeLine_write(AnnotatedRef ref, AnnoStrokeLineSeg s) {
annotated[ref.offset >> 2] = Annotated_StrokeLine;
AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(ref.offset + 4), s);
}

void Annotated_Quad_write(AnnotatedRef ref, AnnoQuadSeg s) {
@@ -11,24 +11,35 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};

layout(set = 0, binding = 1) buffer AllocBuf {
// This is for scanning forward for right_edge data.
layout(set = 0, binding = 1) buffer StateBuf {
uint[] state;
};

layout(set = 0, binding = 2) buffer AllocBuf {
uint n_elements;
// Will be incremented atomically to claim tiles
uint tile_ix;
uint alloc;
};

layout(set = 0, binding = 2) buffer BinsBuf {
layout(set = 0, binding = 3) buffer BinsBuf {
uint[] bins;
};

#include "annotated.h"
#include "state.h"
#include "bins.h"

// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))

#define TSY (1.0 / float(TILE_HEIGHT_PX))

// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)

// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
@@ -37,6 +48,14 @@ shared uint sh_chunk_start[N_TILE];
shared uint sh_chunk_end[N_TILE];
shared uint sh_chunk_jump[N_TILE];

shared float sh_right_edge[N_TILE];

#define StateBuf_stride (8 + 2 * State_size)

uint state_right_edge_index(uint partition_ix) {
return 2 + partition_ix * (StateBuf_stride / 4);
}

void main() {
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
@@ -65,13 +84,17 @@ void main() {
tag = Annotated_tag(ref);
}
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
float my_right_edge = INFINITY;
bool crosses_edge = false;
switch (tag) {
case Annotated_Line:
AnnoLineSeg line = Annotated_Line_read(ref);
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY);
break;
case Annotated_Fill:
case Annotated_Stroke:
@@ -82,8 +105,41 @@ void main() {
y0 = int(floor(fill.bbox.y * SY));
x1 = int(ceil(fill.bbox.z * SX));
y1 = int(ceil(fill.bbox.w * SY));
// It probably makes more sense to track x1, to avoid having to redo
// the rounding to tile coords.
my_right_edge = fill.bbox.z;
break;
}

// If the last element in this partition is a fill edge, then we need to do a
// look-forward to find the right edge of its corresponding fill. That data is
// recorded in aggregates computed in the element processing pass.
if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO;
// This is sequential but the expectation is that the amount of
// look-forward is small (performance may degrade in the case
// of massively complex paths).
do {
my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
aggregate_ix++;
} while (isinf(my_right_edge));
}

// Now propagate right_edge backward, from fill to segment.
for (uint i = 0; i < LG_N_TILE; i++) {
// Note: we could try to cut down on write bandwidth here if the value hasn't
// changed, but not sure it's worth the complexity to track.
sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
barrier();
if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
}
barrier();
}
if (crosses_edge) {
x1 = int(ceil(my_right_edge * SX));
}

// At this point, we run an iterator over the coverage area,
// trying to keep divergence low.
// Right now, it's just a bbox, but we'll get finer with
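The right_edge loop a few lines above is a log-step backward scan: each pass doubles how far a still-infinite slot can look to its right for the nearest fill's right edge. As a hedged illustration (my own CPU-side sketch, not code from this commit; N_TILE = 256 and LG_N_TILE = 8 are the values in setup.h), the same propagation over one partition looks like this in Rust:

```rust
// Hypothetical CPU-side model of the backward right_edge propagation above.
const N_TILE: usize = 256;
const LG_N_TILE: usize = 8;

fn propagate_right_edge(mut right_edge: [f32; N_TILE]) -> [f32; N_TILE] {
    for i in 0..LG_N_TILE {
        // The snapshot plays the role of sh_right_edge plus the barrier.
        let snapshot = right_edge;
        for ix in 0..N_TILE {
            if ix + (1 << i) < N_TILE && right_edge[ix].is_infinite() {
                right_edge[ix] = snapshot[ix + (1 << i)];
            }
        }
    }
    right_edge
}

fn main() {
    let mut edges = [f32::INFINITY; N_TILE];
    edges[10] = 300.0; // a fill element at slot 10 whose bbox right edge is 300
    let out = propagate_right_edge(edges);
    assert_eq!(out[0], 300.0); // segments before the fill pick up its right edge
}
```

After the LG_N_TILE passes, every segment slot that precedes a fill has adopted that fill's right edge, while slots after the last fill stay at infinity (which is what the look-forward in the last lane resolves).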
@@ -118,9 +174,9 @@ void main() {
uint chunk_new_start;
// Refactor to reduce code duplication?
if (chunk_n > 0) {
uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4;
if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) {
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4);
uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * BinInstance_size;
if (next_chunk + BinChunk_size + min(24, element_count * BinInstance_size) > wr_limit) {
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * BinInstance_size);
// could try to reduce fragmentation if BIN_ALLOC is only a bit above needed
next_chunk = atomicAdd(alloc, alloc_amount);
wr_limit = next_chunk + alloc_amount;
@@ -129,10 +185,10 @@ void main() {
chunk_ref = BinChunkRef(next_chunk);
}
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
if (instance_ref.offset + element_count * 4 > wr_limit) {
if (instance_ref.offset + element_count * BinInstance_size > wr_limit) {
chunk_end = wr_limit;
chunk_n = (wr_limit - instance_ref.offset) / 4;
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4);
chunk_n = (wr_limit - instance_ref.offset) / BinInstance_size;
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * BinInstance_size);
chunk_new_start = atomicAdd(alloc, alloc_amount);
wr_limit = chunk_new_start + alloc_amount;
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start)));
@@ -141,6 +197,7 @@ void main() {
chunk_n = element_count - chunk_n;
} else {
chunk_end = ~0;
chunk_new_start = ~0;
chunk_n = element_count;
}
sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
@@ -161,11 +218,11 @@ void main() {
if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix];
}
uint out_offset = sh_chunk_start[bin_ix] + idx * 4;
uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
if (out_offset >= sh_chunk_end[bin_ix]) {
out_offset += sh_chunk_jump[bin_ix];
}
BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix));
BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge));
}
x++;
if (x == x1) {
Binary file not shown.
@@ -10,9 +10,10 @@ struct BinChunkRef {

struct BinInstance {
uint element_ix;
float right_edge;
};

#define BinInstance_size 4
#define BinInstance_size 8

BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
return BinInstanceRef(ref.offset + index * BinInstance_size);
@@ -32,14 +33,17 @@ BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
BinInstance BinInstance_read(BinInstanceRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = bins[ix + 0];
uint raw1 = bins[ix + 1];
BinInstance s;
s.element_ix = raw0;
s.right_edge = uintBitsToFloat(raw1);
return s;
}

void BinInstance_write(BinInstanceRef ref, BinInstance s) {
uint ix = ref.offset >> 2;
bins[ix + 0] = s.element_ix;
bins[ix + 1] = floatBitsToUint(s.right_edge);
}

BinChunk BinChunk_read(BinChunkRef ref) {
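For reference, a rough host-side mirror of this 8-byte BinInstance layout (my own sketch, not code from the repository) shows how element_ix and right_edge pack into the same u32 word stream that the GLSL accessors above address with ref.offset >> 2:

```rust
// Hypothetical mirror of the GLSL BinInstance packing shown above.
#[derive(Clone, Copy, Debug, PartialEq)]
struct BinInstance {
    element_ix: u32,
    right_edge: f32,
}

const BIN_INSTANCE_SIZE: usize = 8; // two u32 words per instance

fn write_bin_instance(bins: &mut [u32], byte_offset: usize, s: BinInstance) {
    let ix = byte_offset >> 2; // same word addressing as `ref.offset >> 2`
    bins[ix] = s.element_ix;
    bins[ix + 1] = s.right_edge.to_bits();
}

fn read_bin_instance(bins: &[u32], byte_offset: usize) -> BinInstance {
    let ix = byte_offset >> 2;
    BinInstance {
        element_ix: bins[ix],
        right_edge: f32::from_bits(bins[ix + 1]),
    }
}

fn main() {
    let mut bins = vec![0u32; 16];
    let inst = BinInstance { element_ix: 42, right_edge: 300.5 };
    write_bin_instance(&mut bins, BIN_INSTANCE_SIZE, inst); // second slot
    assert_eq!(read_bin_instance(&bins, BIN_INSTANCE_SIZE), inst);
}
```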
@@ -12,7 +12,7 @@ build image.spv: glsl image.comp | scene.h

build elements.spv: glsl elements.comp | scene.h state.h annotated.h

build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h

build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h

@@ -30,6 +30,7 @@ layout(set = 0, binding = 3) buffer PtclBuf {
#define N_RINGBUF 512

shared uint sh_elements[N_RINGBUF];
shared float sh_right_edge[N_RINGBUF];
shared uint sh_chunk[N_WG];
shared uint sh_chunk_next[N_WG];
shared uint sh_chunk_n[N_WG];
@@ -41,6 +42,8 @@ shared uint sh_selected_n;
shared uint sh_elements_ref;

shared uint sh_bitmaps[N_SLICE][N_TILE];
shared uint sh_backdrop[N_SLICE][N_TILE];
shared uint sh_bd_sign[N_SLICE];

// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
@@ -77,6 +80,14 @@ void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref,
}
}

// Accumulate delta to backdrop.
//
// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each
// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1.
int count_backdrop(uint bd_bitmap, uint bd_sign) {
return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
}
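A quick worked model of that counting (illustrative Rust, not repository code): each set bit in bd_bitmap is one element contributing backdrop to this tile, the matching bit in bd_sign picks the sign of its winding contribution, and the net delta is just two popcounts:

```rust
// Hypothetical CPU-side version of count_backdrop above.
fn count_backdrop(bd_bitmap: u32, bd_sign: u32) -> i32 {
    (bd_bitmap & bd_sign).count_ones() as i32 - (bd_bitmap & !bd_sign).count_ones() as i32
}

fn main() {
    // Elements 0 and 1 both contribute backdrop to this tile; their sign bits
    // differ, so the two crossings cancel out.
    assert_eq!(count_backdrop(0b11, 0b01), 0);
    // Only element 0 contributes, with sign bit 1: net +1.
    assert_eq!(count_backdrop(0b01, 0b01), 1);
}
```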

void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
@@ -109,10 +120,14 @@ void main() {
sh_first_el[th_ix] = chunk.n > 0 ?
BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
}
uint count = 0;
if (th_ix < N_SLICE) {
sh_bd_sign[th_ix] = 0;
}
int backdrop = 0;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
sh_backdrop[i][th_ix] = 0;
}

while (wr_ix - rd_ix <= N_TILE) {
@@ -157,8 +172,10 @@ void main() {
}
BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
if (th_ix < chunk_n) {
uint el = BinInstance_read(BinInstance_index(inst_ref, th_ix)).element_ix;
sh_elements[(wr_ix + th_ix) % N_RINGBUF] = el;
BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix));
uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF;
sh_elements[wr_el_ix] = inst.element_ix;
sh_right_edge[wr_el_ix] = inst.right_edge;
}
wr_ix += chunk_n;
}
@@ -169,8 +186,11 @@ void main() {
// Read one element, compute coverage.
uint tag = Annotated_Nop;
AnnotatedRef ref;
float right_edge = 0.0;
if (th_ix + rd_ix < wr_ix) {
uint element_ix = sh_elements[(rd_ix + th_ix) % N_RINGBUF];
uint rd_el_ix = (rd_ix + th_ix) % N_RINGBUF;
uint element_ix = sh_elements[rd_el_ix];
right_edge = sh_right_edge[rd_el_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
}
@@ -179,15 +199,26 @@ void main() {
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
uint my_slice = th_ix / 32;
uint my_mask = 1 << (th_ix & 31);
switch (tag) {
case Annotated_Line:
AnnoLineSeg line = Annotated_Line_read(ref);
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
float dx = line.p1.x - line.p0.x;
float dy = line.p1.y - line.p0.y;
if (tag == Annotated_FillLine) {
// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
if (dy < 0) {
atomicOr(sh_bd_sign[my_slice], my_mask);
} else {
atomicAnd(sh_bd_sign[my_slice], ~my_mask);
}
}
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y) * SX;
@@ -214,20 +245,20 @@ void main() {
break;
}

// Draw the coverage area into the bitmaks. This uses an algorithm
// Draw the coverage area into the bitmasks. This uses an algorithm
// that computes the coverage of a span for given scanline.

// Compute bounding box in tiles and clip to this bin.
int x0 = int(floor((xmin - xy0.x) * SX));
int x1 = int(ceil((xmax - xy0.x) * SX));
int xr = int(ceil((right_edge - xy0.x) * SX));
int y0 = int(floor((ymin - xy0.y) * SY));
int y1 = int(ceil((ymax - xy0.y) * SY));
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
xr = clamp(xr, 0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
uint my_slice = th_ix / 32;
uint my_mask = 1 << (th_ix & 31);
float t = a + b * float(y0);
for (uint y = y0; y < y1; y++) {
uint xx0 = clamp(int(floor(t - c)), x0, x1);
@@ -235,6 +266,15 @@ void main() {
for (uint x = xx0; x < xx1; x++) {
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
}
if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
// Assign backdrop to all tiles to the right of the ray crossing the
// top edge of this tile, up to the right edge of the fill bbox.
float xray = t - 0.5 * b;
xx0 = max(int(ceil(xray)), 0);
for (uint x = xx0; x < xr; x++) {
atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
}
}
t += b;
}
barrier();
@@ -242,13 +282,18 @@ void main() {
// Output elements for this tile, based on bitmaps.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
uint bd_bitmap = sh_backdrop[0][th_ix];
uint bd_sign = sh_bd_sign[0];
while (true) {
if (bitmap == 0) {
backdrop += count_backdrop(bd_bitmap, bd_sign);
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
bd_bitmap = sh_backdrop[slice_ix][th_ix];
bd_sign = sh_bd_sign[slice_ix];
if (bitmap == 0) {
continue;
}
@@ -256,6 +301,13 @@ void main() {
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];

// Bits up to and including the lsb
uint bd_mask = (bitmap - 1) ^ bitmap;
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
// Clear bits that have been consumed.
bd_bitmap &= ~bd_mask;
bitmap &= ~bd_mask;

// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag).
@@ -263,15 +315,58 @@ void main() {
tag = Annotated_tag(ref);

switch (tag) {
case Annotated_Line:
AnnoLineSeg line = Annotated_Line_read(ref);
case Annotated_FillLine:
AnnoFillLineSeg fill_line = Annotated_FillLine_read(ref);
// This is basically the same logic as piet-metal, but should be made numerically robust.
vec2 tile_xy = vec2(tile_x * TILE_WIDTH_PX, tile_y * TILE_HEIGHT_PX);
float yEdge = mix(fill_line.p0.y, fill_line.p1.y, (tile_xy.x - fill_line.p0.x) / (fill_line.p1.x - fill_line.p0.x));
if (min(fill_line.p0.x, fill_line.p1.x) < tile_xy.x && yEdge >= tile_xy.y && yEdge < tile_xy.y + TILE_HEIGHT_PX) {
Segment edge_seg;
if (fill_line.p0.x > fill_line.p1.x) {
fill_line.p1 = vec2(tile_xy.x, yEdge);
edge_seg.start = fill_line.p1;
edge_seg.end = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
} else {
fill_line.p0 = vec2(tile_xy.x, yEdge);
edge_seg.start = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
edge_seg.end = fill_line.p0;
}
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), edge_seg);
chunk_n_segs++;
}
Segment fill_seg = Segment(fill_line.p0, fill_line.p1);
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), fill_seg);
chunk_n_segs++;
break;
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
Segment seg = Segment(line.p0, line.p1);
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
chunk_n_segs++;
break;
case Annotated_Fill:
chunk_n_segs = 0;
if (chunk_n_segs > 0) {
AnnoFill fill = Annotated_Fill_read(ref);
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
CmdFill cmd_fill;
cmd_fill.seg_ref = first_seg_chunk.offset;
cmd_fill.backdrop = backdrop;
cmd_fill.rgba_color = fill.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Fill_write(cmd_ref, cmd_fill);
cmd_ref.offset += Cmd_size;
chunk_n_segs = 0;
} else if (backdrop != 0) {
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
cmd_ref.offset += Cmd_size;
}
backdrop = 0;
break;
case Annotated_Stroke:
if (chunk_n_segs > 0) {
@@ -289,9 +384,6 @@ void main() {
}
break;
}

// clear LSB
bitmap &= bitmap - 1;
}
barrier();

Binary file not shown.
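The FillLine case in the coarse rasterizer diff above clips the segment against the tile's left edge and, when the crossing falls inside the tile's vertical span, emits an extra axis-aligned segment along that edge so per-tile winding stays consistent. A standalone sketch of that geometry (my reconstruction, not repository code; the 16-pixel tile height is assumed for concreteness):

```rust
// Hypothetical model of the yEdge clip performed per tile in the shader above.
const TILE_HEIGHT_PX: f32 = 16.0;

#[derive(Clone, Copy, Debug)]
struct Segment {
    start: [f32; 2],
    end: [f32; 2],
}

// Returns the (possibly clipped) fill segment plus an optional extra segment
// running along the tile's left boundary.
fn clip_fill_line(
    mut p0: [f32; 2],
    mut p1: [f32; 2],
    tile_xy: [f32; 2],
) -> (Segment, Option<Segment>) {
    // Linear interpolation of y at the tile's left edge, as in the GLSL mix().
    let t = (tile_xy[0] - p0[0]) / (p1[0] - p0[0]);
    let y_edge = p0[1] + t * (p1[1] - p0[1]);
    let crosses = p0[0].min(p1[0]) < tile_xy[0]
        && y_edge >= tile_xy[1]
        && y_edge < tile_xy[1] + TILE_HEIGHT_PX;
    if !crosses {
        return (Segment { start: p0, end: p1 }, None);
    }
    let bottom = [tile_xy[0], tile_xy[1] + TILE_HEIGHT_PX];
    let edge_seg = if p0[0] > p1[0] {
        // The segment runs right-to-left, so its p1 end gets clipped to the edge.
        p1 = [tile_xy[0], y_edge];
        Segment { start: p1, end: bottom }
    } else {
        p0 = [tile_xy[0], y_edge];
        Segment { start: bottom, end: p0 }
    };
    (Segment { start: p0, end: p1 }, Some(edge_seg))
}

fn main() {
    // A fill edge that starts left of a tile at (32, 0) and ends inside it.
    let (seg, extra) = clip_fill_line([20.0, 4.0], [40.0, 12.0], [32.0, 0.0]);
    println!("clipped: {:?}, edge: {:?}", seg, extra);
}
```

The shader then writes both segments with Segment_write, as in the FillLine case shown above.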
@@ -10,7 +10,7 @@
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define TILE_SIZE (WG_SIZE * N_ROWS)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

@@ -34,14 +34,14 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
#include "state.h"
#include "annotated.h"

#define StateBuf_stride (4 + 2 * State_size)
#define StateBuf_stride (8 + 2 * State_size)

StateRef state_aggregate_ref(uint partition_ix) {
return StateRef(8 + partition_ix * StateBuf_stride);
return StateRef(12 + partition_ix * StateBuf_stride);
}

StateRef state_prefix_ref(uint partition_ix) {
return StateRef(8 + partition_ix * StateBuf_stride + State_size);
return StateRef(12 + partition_ix * StateBuf_stride + State_size);
}

uint state_flag_index(uint partition_ix) {
@@ -86,7 +86,7 @@ State combine_state(State a, State b) {
return c;
}

State map_element(ElementRef ref) {
State map_element(ElementRef ref, inout bool is_fill) {
// TODO: it would *probably* be more efficient to make the memory read patterns less
// divergent, though it would be more wasted memory.
uint tag = Element_tag(ref);
@@ -96,9 +96,11 @@ State map_element(ElementRef ref) {
c.translate = vec2(0.0, 0.0);
c.linewidth = 1.0; // TODO should be 0.0
c.flags = 0;
is_fill = false;
switch (tag) {
case Element_Line:
LineSeg line = Element_Line_read(ref);
case Element_FillLine:
case Element_StrokeLine:
LineSeg line = Element_FillLine_read(ref);
c.bbox.xy = min(line.p0, line.p1);
c.bbox.zw = max(line.p0, line.p1);
break;
@@ -113,6 +115,8 @@ State map_element(ElementRef ref) {
c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
break;
case Element_Fill:
is_fill = true;
// fall-through
case Element_Stroke:
c.flags = FLAG_RESET_BBOX;
break;
@@ -145,6 +149,8 @@ shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];

shared uint sh_min_fill;

shared uint sh_tile_ix;
shared State sh_prefix;

@@ -154,19 +160,27 @@ void main() {
// 4.4 of prefix sum paper).
if (gl_LocalInvocationID.x == 0) {
sh_tile_ix = atomicAdd(state[0], 1);
sh_min_fill = ~0;
}
barrier();
uint tile_ix = sh_tile_ix;

uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
ElementRef ref = ElementRef(ix * Element_size);

th_state[0] = map_element(ref);
bool is_fill;
uint my_min_fill = ~0;
th_state[0] = map_element(ref, is_fill);
if (is_fill) my_min_fill = ix;
for (uint i = 1; i < N_ROWS; i++) {
// discussion question: would it be faster to load using more coherent patterns
// into thread memory? This is kinda strided.
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
if (is_fill && my_min_fill == ~0) {
my_min_fill = ix + i;
}
}
atomicMin(sh_min_fill, my_min_fill);
State agg = th_state[N_ROWS - 1];
sh_mat[gl_LocalInvocationID.x] = agg.mat;
sh_translate[gl_LocalInvocationID.x] = agg.translate;
@@ -238,6 +252,7 @@ void main() {
}
}
barrier();
my_min_fill = sh_min_fill;
if (tile_ix != 0) {
exclusive = sh_prefix;
}
@@ -253,8 +268,14 @@ void main() {
other.flags = sh_flags[ix];
row = combine_state(row, other);
}
if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
}
for (uint i = 0; i < N_ROWS; i++) {
State st = combine_state(row, th_state[i]);
if (my_min_fill == ix + i) {
state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
}
// We write the state now for development purposes, but the
// actual goal is to write transformed and annotated elements.
//State_write(StateRef((ix + i) * State_size), st);
@@ -266,13 +287,22 @@ void main() {
AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size);
uint tag = Element_tag(this_ref);
switch (tag) {
case Element_Line:
LineSeg line = Element_Line_read(this_ref);
AnnoLineSeg anno_line;
case Element_FillLine:
case Element_StrokeLine:
LineSeg line = Element_StrokeLine_read(this_ref);
AnnoStrokeLineSeg anno_line;
anno_line.p0 = st.mat.xz * line.p0.x + st.mat.yw * line.p0.y + st.translate;
anno_line.p1 = st.mat.xz * line.p1.x + st.mat.yw * line.p1.y + st.translate;
anno_line.stroke = get_linewidth(st);
Annotated_Line_write(out_ref, anno_line);
if (tag == Element_StrokeLine) {
anno_line.stroke = get_linewidth(st);
} else {
anno_line.stroke = vec2(0.0);
}
// We do encoding a bit by hand to minimize divergence. Another approach
// would be to have a fill/stroke bool.
uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine;
annotated[out_ref.offset >> 2] = out_tag;
AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line);
break;
case Element_Stroke:
Stroke stroke = Element_Stroke_read(this_ref);
Binary file not shown.
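For the binning look-forward to terminate, each element-processing partition above publishes one aggregate: the transformed bbox right edge of its first Fill element, or +infinity when the partition has none (0x7f800000 is the IEEE-754 bit pattern for +inf). A hedged sketch of that per-partition reduction (my own illustration; the names are hypothetical, not the repository's types):

```rust
// Illustrative model of the per-partition right-edge aggregate.
#[derive(Clone, Copy)]
enum PartitionElement {
    Segment,                  // line segments and other ops carry no right edge
    Fill { bbox_right: f32 }, // a fill records the right edge of its bbox
}

// One aggregate per partition: right edge of the first fill, else +inf.
fn partition_right_edge(partition: &[PartitionElement]) -> f32 {
    partition
        .iter()
        .find_map(|el| match el {
            PartitionElement::Fill { bbox_right } => Some(*bbox_right),
            _ => None,
        })
        .unwrap_or(f32::INFINITY)
}

fn main() {
    let with_fill = [PartitionElement::Segment, PartitionElement::Fill { bbox_right: 300.0 }];
    let without_fill = [PartitionElement::Segment, PartitionElement::Segment];
    assert_eq!(partition_right_edge(&with_fill), 300.0);
    assert!(partition_right_edge(&without_fill).is_infinite());
}
```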
@@ -238,13 +238,14 @@ TransformRef Transform_index(TransformRef ref, uint index) {
}

#define Element_Nop 0
#define Element_Line 1
#define Element_Quad 2
#define Element_Cubic 3
#define Element_Stroke 4
#define Element_Fill 5
#define Element_SetLineWidth 6
#define Element_Transform 7
#define Element_StrokeLine 1
#define Element_FillLine 2
#define Element_Quad 3
#define Element_Cubic 4
#define Element_Stroke 5
#define Element_Fill 6
#define Element_SetLineWidth 7
#define Element_Transform 8
#define Element_size 36

ElementRef Element_index(ElementRef ref, uint index) {
@@ -446,7 +447,11 @@ uint Element_tag(ElementRef ref) {
return scene[ref.offset >> 2];
}

LineSeg Element_Line_read(ElementRef ref) {
LineSeg Element_StrokeLine_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}

LineSeg Element_FillLine_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}

@@ -51,9 +51,14 @@
#define N_TILE_X 16
#define N_TILE_Y 16
#define N_TILE (N_TILE_X * N_TILE_Y)
#define LG_N_TILE 8
#define N_SLICE (N_TILE / 32)
// Number of workgroups for binning kernel
#define N_WG 16

// This is the ratio of the number of elements in a binning workgroup
// over the number of elements in a partition workgroup.
#define ELEMENT_BINNING_RATIO 2

#define BIN_INITIAL_ALLOC 64
#define BIN_ALLOC 256

@@ -185,10 +185,10 @@ impl<D: Device> Renderer<D> {
])
?;
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
let bin_ds = device.create_descriptor_set(
&bin_pipeline,
&[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
&[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
&[],
)?;

@@ -61,8 +61,8 @@ impl PicoSvg {
for item in &self.items {
match item {
Item::Fill(fill_item) => {
//rc.fill(&fill_item.path, &fill_item.color);
rc.stroke(&fill_item.path, &fill_item.color, 1.0);
rc.fill(&fill_item.path, &fill_item.color);
//rc.stroke(&fill_item.path, &fill_item.color, 1.0);
}
Item::Stroke(stroke_item) => {
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
@@ -94,7 +94,7 @@ impl RenderContext for PietGpuRenderContext {
}
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
let path = shape.to_bez_path(TOLERANCE);
self.encode_path(path);
self.encode_path(path, false);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let stroke = Stroke { rgba_color };
@@ -116,7 +116,7 @@ impl RenderContext for PietGpuRenderContext {
fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
let path = shape.to_bez_path(TOLERANCE);
self.encode_path(path);
self.encode_path(path, true);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let fill = Fill { rgba_color };
@@ -198,7 +198,15 @@ impl RenderContext for PietGpuRenderContext {
}

impl PietGpuRenderContext {
fn encode_path(&mut self, path: impl Iterator<Item = PathEl>) {
fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) {
if is_fill {
self.elements.push(Element::FillLine(seg));
} else {
self.elements.push(Element::StrokeLine(seg));
}
}

fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
let flatten = true;
if flatten {
let mut start_pt = None;
@@ -207,6 +215,7 @@ impl PietGpuRenderContext {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
start_pt = Some(scene_pt);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
@@ -215,16 +224,18 @@ impl PietGpuRenderContext {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.elements.push(Element::Line(seg));
self.encode_line_seg(seg, is_fill);
last_pt = Some(scene_pt);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
let seg = LineSeg {
p0: last,
p1: start,
};
self.elements.push(Element::Line(seg));
if last != start {
let seg = LineSeg {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
}
}
}
_ => (),
@@ -238,6 +249,7 @@ impl PietGpuRenderContext {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
start_pt = Some(scene_pt);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
@@ -246,7 +258,7 @@ impl PietGpuRenderContext {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.elements.push(Element::Line(seg));
self.encode_line_seg(seg, is_fill);
last_pt = Some(scene_pt);
}
PathEl::QuadTo(p1, p2) => {
@@ -275,11 +287,13 @@ impl PietGpuRenderContext {
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
let seg = LineSeg {
p0: last,
p1: start,
};
self.elements.push(Element::Line(seg));
if last != start {
let seg = LineSeg {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
}
}
}
}
}
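As a usage-level sketch of the encoder change above (standalone illustrative Rust with simplified types, not the repository's API), a filled subpath now produces FillLine elements via the is_fill flag, and ClosePath emits a closing segment only when the subpath is not already closed:

```rust
// Standalone model of encode_path's fill/stroke selection; types are simplified.
#[derive(Clone, Copy, PartialEq, Debug)]
struct Point { x: f32, y: f32 }

#[derive(Clone, Copy)]
enum PathEl { MoveTo(Point), LineTo(Point), ClosePath }

#[derive(Debug, PartialEq)]
enum Element { FillLine(Point, Point), StrokeLine(Point, Point) }

fn push_seg(out: &mut Vec<Element>, is_fill: bool, p0: Point, p1: Point) {
    out.push(if is_fill { Element::FillLine(p0, p1) } else { Element::StrokeLine(p0, p1) });
}

fn encode_path(path: &[PathEl], is_fill: bool, out: &mut Vec<Element>) {
    let mut start_pt = None;
    let mut last_pt = None;
    for el in path {
        match el {
            PathEl::MoveTo(p) => { start_pt = Some(*p); last_pt = Some(*p); }
            PathEl::LineTo(p) => { push_seg(out, is_fill, last_pt.unwrap(), *p); last_pt = Some(*p); }
            PathEl::ClosePath => {
                if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
                    // Only emit a closing segment when the subpath isn't already closed.
                    if last != start { push_seg(out, is_fill, last, start); }
                }
            }
        }
    }
}

fn main() {
    let p = |x, y| Point { x, y };
    let triangle = [PathEl::MoveTo(p(0.0, 0.0)), PathEl::LineTo(p(10.0, 0.0)),
                    PathEl::LineTo(p(0.0, 10.0)), PathEl::ClosePath];
    let mut elements = Vec::new();
    encode_path(&triangle, true, &mut elements);
    assert_eq!(elements.len(), 3); // two LineTo segments plus one closing FillLine
}
```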