Adds fills, and has a more or less working tiger render (with artifacts).
Raph Levien 2020-04-30 17:06:01 -07:00
parent 064ee86a45
commit aa83d782ed
22 changed files with 785 additions and 35 deletions

Cargo.lock (generated): 16 lines changed

@ -139,6 +139,7 @@ dependencies = [
"piet-gpu-types",
"png",
"rand",
"roxmltree",
]
[[package]]
@ -243,6 +244,15 @@ dependencies = [
"rand_core",
]
[[package]]
name = "roxmltree"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5001f134077069d87f77c8b9452b690df2445f7a43f1c7ca4a1af8dd505789d"
dependencies = [
"xmlparser",
]
[[package]]
name = "syn"
version = "1.0.17"
@ -287,3 +297,9 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "xmlparser"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccb4240203dadf40be2de9369e5c6dec1bf427528115b030baca3334c18362d7"

View file

@ -0,0 +1,37 @@
use piet_gpu_derive::piet_gpu;
// Structures representing segments for fill items.
// There is some cut'n'paste here from stroke segments, which can be
// traced to the fact that buffers in GLSL are basically global.
// Maybe there's a way to address that, but in the meantime living
// with the duplication is easiest.
piet_gpu! {
#[gpu_write]
mod fill_seg {
struct FillTileHeader {
n: u32,
items: Ref<FillItemHeader>,
}
struct FillItemHeader {
backdrop: i32,
segments: Ref<FillSegChunk>,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct FillSegment {
start: [f32; 2],
end: [f32; 2],
}
struct FillSegChunk {
n: u32,
next: Ref<FillSegChunk>,
// Segments follow (could represent this as a variable sized array).
}
}
}
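A small CPU-side sketch of the layout this module implies (not part of the commit; the constants mirror the FillSegChunk_size and FillSegment_size values generated into fill_seg.h): each chunk header is followed inline by its segments, so the i-th segment of a chunk sits at a fixed byte offset from the chunk reference.
// Illustrative Rust sketch, assuming the generated sizes (8 and 16 bytes).
const FILL_SEG_CHUNK_SIZE: u32 = 8;   // n: u32 + next: Ref<FillSegChunk>
const FILL_SEGMENT_SIZE: u32 = 16;    // start: [f32; 2] + end: [f32; 2]
// Byte offset of the i-th FillSegment stored after a chunk header.
fn fill_segment_offset(chunk_ref: u32, i: u32) -> u32 {
    chunk_ref + FILL_SEG_CHUNK_SIZE + i * FILL_SEGMENT_SIZE
}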

View file

@ -1,4 +1,5 @@
pub mod encoder;
pub mod fill_seg;
pub mod ptcl;
pub mod scene;
pub mod segment;

View file

@ -7,6 +7,7 @@ fn main() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
"fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
"ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
"test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
_ => println!("Oops, unknown module name"),

View file

@ -19,8 +19,10 @@ piet_gpu! {
rgba_color: u32,
}
struct CmdFill {
start: [f32; 2],
end: [f32; 2],
// Should be Ref<FillSegChunk> if we had cross-module references.
seg_ref: u32,
backdrop: i32,
rgba_color: u32,
}
struct CmdFillEdge {
// The sign is only one bit.

View file

@ -17,3 +17,4 @@ kurbo = "0.5.11"
piet = "0.0.12"
png = "0.16.2"
rand = "0.7.3"
roxmltree = "0.11"

Ghostscript_Tiger.svg added (SVG image asset, 62 KiB); file diff suppressed because one or more lines are too long.
View file

@ -13,6 +13,8 @@ build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h ptcl.h setup.h
build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h setup.h
build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h

piet-gpu/shader/fill_seg.h (new file, 130 lines)

@ -0,0 +1,130 @@
// Code auto-generated by piet-gpu-derive
struct FillTileHeaderRef {
uint offset;
};
struct FillItemHeaderRef {
uint offset;
};
struct FillSegmentRef {
uint offset;
};
struct FillSegChunkRef {
uint offset;
};
struct FillTileHeader {
uint n;
FillItemHeaderRef items;
};
#define FillTileHeader_size 8
FillTileHeaderRef FillTileHeader_index(FillTileHeaderRef ref, uint index) {
return FillTileHeaderRef(ref.offset + index * FillTileHeader_size);
}
struct FillItemHeader {
int backdrop;
FillSegChunkRef segments;
};
#define FillItemHeader_size 8
FillItemHeaderRef FillItemHeader_index(FillItemHeaderRef ref, uint index) {
return FillItemHeaderRef(ref.offset + index * FillItemHeader_size);
}
struct FillSegment {
vec2 start;
vec2 end;
};
#define FillSegment_size 16
FillSegmentRef FillSegment_index(FillSegmentRef ref, uint index) {
return FillSegmentRef(ref.offset + index * FillSegment_size);
}
struct FillSegChunk {
uint n;
FillSegChunkRef next;
};
#define FillSegChunk_size 8
FillSegChunkRef FillSegChunk_index(FillSegChunkRef ref, uint index) {
return FillSegChunkRef(ref.offset + index * FillSegChunk_size);
}
FillTileHeader FillTileHeader_read(FillTileHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
FillTileHeader s;
s.n = raw0;
s.items = FillItemHeaderRef(raw1);
return s;
}
void FillTileHeader_write(FillTileHeaderRef ref, FillTileHeader s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = s.n;
fill_seg[ix + 1] = s.items.offset;
}
FillItemHeader FillItemHeader_read(FillItemHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
FillItemHeader s;
s.backdrop = int(raw0);
s.segments = FillSegChunkRef(raw1);
return s;
}
void FillItemHeader_write(FillItemHeaderRef ref, FillItemHeader s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = uint(s.backdrop);
fill_seg[ix + 1] = s.segments.offset;
}
FillSegment FillSegment_read(FillSegmentRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
uint raw2 = fill_seg[ix + 2];
uint raw3 = fill_seg[ix + 3];
FillSegment s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void FillSegment_write(FillSegmentRef ref, FillSegment s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = floatBitsToUint(s.start.x);
fill_seg[ix + 1] = floatBitsToUint(s.start.y);
fill_seg[ix + 2] = floatBitsToUint(s.end.x);
fill_seg[ix + 3] = floatBitsToUint(s.end.y);
}
FillSegChunk FillSegChunk_read(FillSegChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = fill_seg[ix + 0];
uint raw1 = fill_seg[ix + 1];
FillSegChunk s;
s.n = raw0;
s.next = FillSegChunkRef(raw1);
return s;
}
void FillSegChunk_write(FillSegChunkRef ref, FillSegChunk s) {
uint ix = ref.offset >> 2;
fill_seg[ix + 0] = s.n;
fill_seg[ix + 1] = s.next.offset;
}

View file

@ -52,10 +52,19 @@ void main() {
TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_ALLOC - Instance_size;
uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size;
uint stroke_chunk_n = 0;
uint stroke_n = 0;
// State for fill references. All this is a bit cut'n'paste, but making a
// proper abstraction isn't easy.
TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START);
ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4);
InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size);
uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size;
uint fill_chunk_n = 0;
uint fill_n = 0;
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
PietItemRef root = PietItemRef(0);
SimpleGroup group = PietItem_Group_read(root);
@ -100,6 +109,20 @@ void main() {
Instance_write(stroke_ref, ins);
stroke_chunk_n++;
stroke_ref.offset += Instance_size;
} else if (tag == PietItem_Fill) {
if (fill_ref.offset > fill_limit) {
uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC);
Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill)));
fill_chunk_start = ChunkRef(new_fill);
fill_ref = InstanceRef(new_fill + Chunk_size);
fill_n += fill_chunk_n;
fill_chunk_n = 0;
fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size;
}
Instance_write(fill_ref, ins);
fill_chunk_n++;
fill_ref.offset += Instance_size;
}
}
if (is_group) {
@ -129,4 +152,10 @@ void main() {
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
}
tilegroup[stroke_start.offset >> 2] = stroke_n;
fill_n += fill_chunk_n;
if (fill_n > 0) {
Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0)));
}
tilegroup[fill_start.offset >> 2] = fill_n;
}

Binary file not shown.

View file

@ -0,0 +1,165 @@
// This is "kernel 2" (fill) in a 4-kernel pipeline. It processes the fill
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer FillSegBuf {
uint[] fill_seg;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "fill_seg.h"
#include "setup.h"
// Ensure that there is space to encode a segment.
void alloc_chunk(inout uint chunk_n_segs, inout FillSegChunkRef seg_chunk_ref,
inout FillSegChunkRef first_seg_chunk, inout uint seg_limit)
{
if (chunk_n_segs == 0) {
if (seg_chunk_ref.offset + 40 > seg_limit) {
seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - FillSegment_size;
}
first_seg_chunk = seg_chunk_ref;
} else if (seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs > seg_limit) {
uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - FillSegment_size;
FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(new_chunk_ref)));
seg_chunk_ref.offset = new_chunk_ref;
chunk_n_segs = 0;
}
}
void main() {
uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef fill_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_FILL_START);
uint fill_n = tilegroup[fill_start.offset >> 2];
FillTileHeaderRef tile_header_ref = FillTileHeaderRef(tile_ix * FillTileHeader_size);
if (fill_n > 0) {
ChunkRef chunk_ref = ChunkRef(fill_start.offset + 4);
Chunk chunk = Chunk_read(chunk_ref);
InstanceRef fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
FillItemHeaderRef item_header = FillItemHeaderRef(atomicAdd(alloc, fill_n * FillItemHeader_size));
FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, item_header));
FillSegChunkRef seg_chunk_ref = FillSegChunkRef(0);
uint seg_limit = 0;
// Iterate through items; fill_n holds count remaining.
while (true) {
if (chunk.chunk_n == 0) {
chunk_ref = chunk.next;
if (chunk_ref.offset == 0) {
break;
}
chunk = Chunk_read(chunk_ref);
fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
}
Instance ins = Instance_read(fill_ref);
PietFill fill = PietItem_Fill_read(PietItemRef(ins.item_ref));
// Process the fill polyline item.
uint max_n_segs = fill.n_points - 1;
uint chunk_n_segs = 0;
int backdrop = 0;
FillSegChunkRef seg_chunk_ref;
FillSegChunkRef first_seg_chunk = FillSegChunkRef(0);
vec2 start = Point_read(fill.points).xy;
for (uint j = 0; j < max_n_segs; j++) {
fill.points.offset += Point_size;
vec2 end = Point_read(fill.points).xy;
// Process one segment.
// TODO: I think this would go more smoothly (and be easier to
// make numerically robust) if it were based on clipping the line
// to the tile box. See:
// https://tavianator.com/fast-branchless-raybounding-box-intersections/
vec2 xymin = min(start, end);
vec2 xymax = max(start, end);
float a = end.y - start.y;
float b = start.x - end.x;
float c = -(a * start.x + b * start.y);
vec2 xy1 = xy0 + vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
float ytop = max(xy0.y, xymin.y);
float ybot = min(xy1.y, xymax.y);
float s00 = sign(b * ytop + a * xy0.x + c);
float s01 = sign(b * ytop + a * xy1.x + c);
float s10 = sign(b * ybot + a * xy0.x + c);
float s11 = sign(b * ybot + a * xy1.x + c);
float sTopLeft = sign(b * xy0.y + a * xy0.x + c);
if (sTopLeft == sign(a) && xymin.y <= xy0.y && xymax.y > xy0.y) {
backdrop -= int(s00);
}
// This is adapted from piet-metal but could be improved.
if (max(xymin.x, xy0.x) < min(xymax.x, xy1.x)
&& ytop < ybot
&& s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
{
if (xymin.x < xy0.x) {
float yEdge = mix(start.y, end.y, (start.x - xy0.x) / b);
if (yEdge >= xy0.y && yEdge < xy1.y) {
// This is encoded the same as a general fill segment, but could be
// special-cased, either here or in rendering. (It was special-cased
// in piet-metal).
FillSegment edge_seg;
if (b > 0.0) {
end = vec2(xy0.x, yEdge);
edge_seg.start = end;
edge_seg.end = vec2(xy0.x, xy1.y);
} else {
start = vec2(xy0.x, yEdge);
edge_seg.start = vec2(xy0.x, xy1.y);
edge_seg.end = start;
}
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), edge_seg);
chunk_n_segs++;
}
}
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
FillSegment seg = FillSegment(start, end);
FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), seg);
chunk_n_segs++;
}
start = end;
}
FillItemHeader_write(item_header, FillItemHeader(backdrop, first_seg_chunk));
if (chunk_n_segs != 0) {
FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(0)));
seg_chunk_ref.offset += FillSegChunk_size + FillSegment_size * chunk_n_segs;
}
fill_ref.offset += Instance_size;
chunk.chunk_n--;
item_header.offset += FillItemHeader_size;
}
} else {
// As an optimization, we could just write 0 for the size.
FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, FillItemHeaderRef(0)));
}
}
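To make the output layout concrete, here is a hedged CPU-side sketch (not part of the commit; the function name is illustrative) of how a consumer walks one fill item's segment list from a u32 view of the fill_seg buffer, using the sizes from fill_seg.h:
// Walk one FillItemHeader and its linked FillSegChunk list.
// Offsets are byte offsets, as in the shaders; the buffer is viewed as u32 words.
fn walk_fill_item(fill_seg: &[u32], item_header_offset: u32) -> (i32, Vec<[f32; 4]>) {
    let ix = (item_header_offset >> 2) as usize;
    let backdrop = fill_seg[ix] as i32;        // FillItemHeader.backdrop
    let mut chunk_offset = fill_seg[ix + 1];   // FillItemHeader.segments (0 if empty)
    let mut segments = Vec::new();
    while chunk_offset != 0 {
        let cix = (chunk_offset >> 2) as usize;
        let n = fill_seg[cix] as usize;        // FillSegChunk.n
        let next = fill_seg[cix + 1];          // FillSegChunk.next (0 terminates)
        for i in 0..n {
            // Segments are packed right after the 8-byte chunk header, 16 bytes each.
            let six = cix + 2 + i * 4;
            segments.push([
                f32::from_bits(fill_seg[six]),     // start.x
                f32::from_bits(fill_seg[six + 1]), // start.y
                f32::from_bits(fill_seg[six + 2]), // end.x
                f32::from_bits(fill_seg[six + 3]), // end.y
            ]);
        }
        chunk_offset = next;
    }
    (backdrop, segments)
}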

Binary file not shown.

View file

@ -21,17 +21,23 @@ layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 3) buffer PtclBuf {
// Used readonly
layout(set = 0, binding = 3) buffer FillSegmentBuf {
uint[] fill_seg;
};
layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
};
layout(set = 0, binding = 4) buffer AllocBuf {
layout(set = 0, binding = 5) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "fill_seg.h"
#include "ptcl.h"
#include "setup.h"
@ -56,6 +62,7 @@ void main() {
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
FillTileHeader fill_th = FillTileHeader_read(FillTileHeaderRef(tile_ix * FillTileHeader_size));
while (true) {
uint tg_tag = TileGroup_tag(tg_ref);
@ -99,6 +106,22 @@ void main() {
cmd_ref.offset += Cmd_size;
}
break;
case PietItem_Fill:
FillItemHeader fill_item = FillItemHeader_read(fill_th.items);
fill_th.items.offset += FillItemHeader_size;
// TODO: handle segments == 0 but backdrop != 0 specially, it's a solid tile.
if (fill_item.segments.offset != 0 || fill_item.backdrop != 0) {
PietFill fill = PietItem_Fill_read(item_ref);
CmdFill cmd = CmdFill(
fill_item.segments.offset,
fill_item.backdrop,
fill.rgba_color
);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Fill_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
}
break;
}
tg_ref.offset += TileGroup_size;
}

Binary file not shown.

View file

@ -19,12 +19,18 @@ layout(set = 0, binding = 1) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 2) buffer ImageBuf {
// Used readonly
layout(set = 0, binding = 2) buffer FillSegBuf {
uint[] fill_seg;
};
layout(set = 0, binding = 3) buffer ImageBuf {
uint[] image;
};
#include "ptcl.h"
#include "segment.h"
#include "fill_seg.h"
#include "setup.h"
@ -70,6 +76,36 @@ void main() {
alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
break;
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_ref);
// Probably better to store as float, but conversion is no doubt cheap.
float area = float(fill.backdrop);
FillSegChunkRef fill_seg_chunk_ref = FillSegChunkRef(fill.seg_ref);
do {
FillSegChunk seg_chunk = FillSegChunk_read(fill_seg_chunk_ref);
for (int i = 0; i < seg_chunk.n; i++) {
FillSegment seg = FillSegment_read(FillSegmentRef(fill_seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * i));
vec2 start = seg.start - xy;
vec2 end = seg.end - xy;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / (end.y - start.y);
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area += a * (window.x - window.y);
}
}
fill_seg_chunk_ref = seg_chunk.next;
} while (fill_seg_chunk_ref.offset != 0);
fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
alpha = min(abs(area), 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
continue;
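The Cmd_Fill case above accumulates, per segment, the fraction of the pixel lying to the right of the segment (clipped to the pixel's vertical extent), signed by the segment's vertical direction; the running total is seeded with the backdrop winding number and the final alpha is min(|area|, 1). A scalar Rust transcription of the per-segment term (illustrative only; variable names follow the shader) may help when checking values on the CPU:
// start/end are already translated so the current pixel is the unit square.
fn segment_coverage(start: [f32; 2], end: [f32; 2]) -> f32 {
    let y0 = start[1].clamp(0.0, 1.0);   // window.x in the shader
    let y1 = end[1].clamp(0.0, 1.0);     // window.y in the shader
    if y0 == y1 {
        return 0.0;                      // segment does not cross this pixel row
    }
    let t0 = (y0 - start[1]) / (end[1] - start[1]);
    let t1 = (y1 - start[1]) / (end[1] - start[1]);
    let xs0 = start[0] + t0 * (end[0] - start[0]);
    let xs1 = start[0] + t1 * (end[0] - start[0]);
    let xmin = xs0.min(xs1).min(1.0) - 1e-6;
    let xmax = xs0.max(xs1);
    let b = xmax.min(1.0);
    let c = b.max(0.0);
    let d = xmin.max(0.0);
    let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
    a * (y0 - y1)                        // signed by the segment's y direction
}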

Binary file not shown.

View file

@ -72,11 +72,12 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
}
struct CmdFill {
vec2 start;
vec2 end;
uint seg_ref;
int backdrop;
uint rgba_color;
};
#define CmdFill_size 16
#define CmdFill_size 12
CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
return CmdFillRef(ref.offset + index * CmdFill_size);
@ -205,19 +206,18 @@ CmdFill CmdFill_read(CmdFillRef ref) {
uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw3 = ptcl[ix + 3];
CmdFill s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.seg_ref = raw0;
s.backdrop = int(raw1);
s.rgba_color = raw2;
return s;
}
void CmdFill_write(CmdFillRef ref, CmdFill s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.start.x);
ptcl[ix + 1] = floatBitsToUint(s.start.y);
ptcl[ix + 2] = floatBitsToUint(s.end.x);
ptcl[ix + 3] = floatBitsToUint(s.end.y);
ptcl[ix + 0] = s.seg_ref;
ptcl[ix + 1] = uint(s.backdrop);
ptcl[ix + 2] = s.rgba_color;
}
CmdFillEdge CmdFillEdge_read(CmdFillEdgeRef ref) {

View file

@ -19,10 +19,14 @@
// there is a region of size TILEGROUP_STRIDE for each tilegroup.
// At offset 0 are the main instances, encoded with Jump. At offset
// TILEGROUP_STROKE_START are the stroke instances, encoded with
// Head and Link.
// Head and Link. Similarly for fill.
#define TILEGROUP_STRIDE 2048
#define TILEGROUP_STROKE_START 1024
#define TILEGROUP_FILL_START 1536
#define TILEGROUP_STROKE_ALLOC 1024
#define TILEGROUP_FILL_ALLOC 1024
#define TILEGROUP_INITIAL_STROKE_ALLOC 512
#define TILEGROUP_INITIAL_FILL_ALLOC 512
// TODO: compute all these
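For reference, a small sketch (not part of the commit) of where each tilegroup's three instance lists begin, mirroring the address computation used in kernel1.comp and kernel2f.comp:
// Illustrative Rust sketch of the per-tilegroup sub-region layout.
const TILEGROUP_STRIDE: u32 = 2048;
const TILEGROUP_STROKE_START: u32 = 1024;
const TILEGROUP_FILL_START: u32 = 1536;
// Byte offsets of the jump-encoded main list, the stroke list, and the fill list.
fn tilegroup_offsets(tilegroup_ix: u32) -> (u32, u32, u32) {
    let base = tilegroup_ix * TILEGROUP_STRIDE;
    (
        base,
        base + TILEGROUP_STROKE_START,
        base + TILEGROUP_FILL_START,
    )
}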

View file

@ -10,9 +10,11 @@ use piet::{Color, RenderContext};
use piet_gpu_hal::vulkan::VkInstance;
use piet_gpu_hal::{CmdBuf, Device, MemFlags};
mod pico_svg;
mod render_ctx;
use render_ctx::PietGpuRenderContext;
use pico_svg::PicoSvg;
const WIDTH: usize = 2048;
const HEIGHT: usize = 1536;
@ -44,14 +46,22 @@ fn render_scene(rc: &mut impl RenderContext) {
let circle = Circle::new(center, radius);
rc.fill(circle, &color);
}
let mut path = BezPath::new();
path.move_to((100.0, 1150.0));
path.line_to((200.0, 1200.0));
path.line_to((150.0, 1250.0));
path.close_path();
rc.fill(path, &Color::rgb8(128, 0, 128));
rc.stroke(
Line::new((100.0, 100.0), (200.0, 150.0)),
&Color::WHITE,
5.0,
);
render_cardioid(rc);
//render_cardioid(rc);
render_tiger(rc);
}
#[allow(unused)]
fn render_cardioid(rc: &mut impl RenderContext) {
let n = 91;
let dth = std::f64::consts::PI * 2.0 / (n as f64);
@ -69,6 +79,17 @@ fn render_cardioid(rc: &mut impl RenderContext) {
rc.stroke(&path, &Color::BLACK, 2.0);
}
fn render_tiger(rc: &mut impl RenderContext) {
let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(xml_str, 8.0).unwrap();
println!("parsing time: {:?}", start.elapsed());
let start = std::time::Instant::now();
svg.render(rc);
println!("flattening and encoding time: {:?}", start.elapsed());
}
#[allow(unused)]
fn dump_scene(buf: &[u8]) {
for i in 0..(buf.len() / 4) {
@ -107,6 +128,7 @@ fn main() {
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev).unwrap();
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev).unwrap();
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev).unwrap();
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev).unwrap();
let image_buf = device
.create_buffer((WIDTH * HEIGHT * 4) as u64, host)
.unwrap();
@ -144,6 +166,26 @@ fn main() {
)
.unwrap();
let k2f_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k2f_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])
.unwrap();
let k2f_code = include_bytes!("../shader/kernel2f.spv");
let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4).unwrap();
let k2f_ds = device
.create_descriptor_set(
&k2f_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&fill_seg_buf,
&k2f_alloc_buf_dev,
],
)
.unwrap();
let k3_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k3_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
@ -151,7 +193,7 @@ fn main() {
.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
.unwrap();
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 5).unwrap();
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6).unwrap();
let k3_ds = device
.create_descriptor_set(
&k3_pipeline,
@ -159,6 +201,7 @@ fn main() {
&scene_dev,
&tilegroup_buf,
&segment_buf,
&fill_seg_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
@ -166,18 +209,26 @@ fn main() {
.unwrap();
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3).unwrap();
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 4).unwrap();
let k4_ds = device
.create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &image_dev])
.create_descriptor_set(
&k4_pipeline,
&[&ptcl_buf, &segment_buf, &fill_seg_buf, &image_dev],
)
.unwrap();
let query_pool = device.create_query_pool(5).unwrap();
let query_pool = device.create_query_pool(6).unwrap();
let mut cmd_buf = device.create_cmd_buf().unwrap();
cmd_buf.begin();
cmd_buf.copy_buffer(&scene_buf, &scene_dev);
// Note: we could use one alloc buf and reuse it. But we'll stick with
// multiple ones for clarity.
cmd_buf.copy_buffer(&k1_alloc_buf_host, &k1_alloc_buf_dev);
cmd_buf.copy_buffer(&k2s_alloc_buf_host, &k2s_alloc_buf_dev);
cmd_buf.copy_buffer(&k2f_alloc_buf_host, &k2f_alloc_buf_dev);
cmd_buf.copy_buffer(&k3_alloc_buf_host, &k3_alloc_buf_dev);
// Note: these clears aren't necessary, and are here to make inspection
// of the buffers cleaner. Can likely be removed.
cmd_buf.clear_buffer(&tilegroup_buf);
cmd_buf.clear_buffer(&ptcl_buf);
cmd_buf.memory_barrier();
@ -196,20 +247,30 @@ fn main() {
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
// Note: this barrier is not necessary (k2f does not depend on
// k2s output), but I'm keeping it here to increase transparency
// of performance.
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&k2f_pipeline,
&k2f_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 2),
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&k3_pipeline,
&k3_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 3),
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&k4_pipeline,
&k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier();
cmd_buf.copy_buffer(&image_dev, &image_buf);
cmd_buf.finish();
@ -217,17 +278,21 @@ fn main() {
let timestamps = device.reap_query_pool(query_pool).unwrap();
println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
println!(
"Kernel 2 time: {:.3}ms",
"Kernel 2s time: {:.3}ms",
(timestamps[1] - timestamps[0]) * 1e3
);
println!(
"Kernel 3 time: {:.3}ms",
"Kernel 2f time: {:.3}ms",
(timestamps[2] - timestamps[1]) * 1e3
);
println!(
"Render time: {:.3}ms",
"Kernel 3 time: {:.3}ms",
(timestamps[3] - timestamps[2]) * 1e3
);
println!(
"Render time: {:.3}ms",
(timestamps[4] - timestamps[3]) * 1e3
);
/*
let mut k1_data: Vec<u32> = Default::default();

piet-gpu/src/pico_svg.rs (new file, 80 lines)

@ -0,0 +1,80 @@
//! A loader for a tiny fragment of SVG
use std::str::FromStr;
use roxmltree::Document;
use kurbo::BezPath;
use piet::{Color, RenderContext};
pub struct PicoSvg {
items: Vec<Item>,
}
pub enum Item {
Fill(FillItem),
Stroke(StrokeItem),
}
pub struct StrokeItem {
width: f64,
color: Color,
path: BezPath,
}
pub struct FillItem {
color: Color,
path: BezPath,
}
impl PicoSvg {
pub fn load(xml_string: &str, scale: f64) -> Result<PicoSvg, Box<dyn std::error::Error>> {
let doc = Document::parse(xml_string)?;
let root = doc.root_element();
let g = root.first_element_child().ok_or("no root element")?;
let mut items = Vec::new();
for el in g.children() {
if el.is_element() {
let d = el.attribute("d").ok_or("missing 'd' attribute")?;
let bp = BezPath::from_svg(d)?;
let path = kurbo::Affine::scale(scale) * bp;
if let Some(fill_color) = el.attribute("fill") {
let color = parse_color(fill_color);
items.push(Item::Fill(FillItem { color, path: path.clone() }));
}
if let Some(stroke_color) = el.attribute("stroke") {
let width = f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?;
let color = parse_color(stroke_color);
items.push(Item::Stroke(StrokeItem { width, color, path }));
}
}
}
Ok(PicoSvg { items })
}
pub fn render(&self, rc: &mut impl RenderContext) {
for item in &self.items {
match item {
Item::Fill(fill_item) => {
rc.fill(&fill_item.path, &fill_item.color);
}
Item::Stroke(stroke_item) => {
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
}
}
}
}
}
fn parse_color(color: &str) -> Color {
if color.as_bytes()[0] == b'#' {
let mut hex = u32::from_str_radix(&color[1..], 16).unwrap();
if color.len() == 4 {
hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
}
Color::from_rgba32_u32((hex << 8) + 0xff)
} else {
Color::from_rgba32_u32(0xff00ff80)
}
}
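As a worked check (illustrative, not from the source) of the three-digit shorthand expansion above: "#f80" parses to hex = 0xF80, each nibble is duplicated, and the final shift appends full alpha.
// Illustrative check of the shorthand expansion in parse_color.
fn main() {
    let hex: u32 = 0xf80; // from "#f80"
    let expanded = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
    assert_eq!(expanded, 0xff8800);
    assert_eq!((expanded << 8) + 0xff, 0xff8800ff); // value passed to Color::from_rgba32_u32
}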

View file

@ -2,7 +2,7 @@ use std::borrow::Cow;
use piet_gpu_types::encoder::{Encode, Encoder, Ref};
use piet_gpu_types::scene;
use piet_gpu_types::scene::{Bbox, PietCircle, PietItem, PietStrokePolyLine, SimpleGroup};
use piet_gpu_types::scene::{Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup};
use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
@ -119,6 +119,7 @@ impl RenderContext for PietGpuRenderContext {
n_points,
points,
};
let bbox = bbox.inset(-0.5 * width);
self.push_item(PietItem::Poly(poly_line), bbox);
}
_ => (),
@ -135,10 +136,11 @@ impl RenderContext for PietGpuRenderContext {
}
fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
let bbox = shape.bounding_box();
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
match shape.as_circle() {
Some(circle) => match brush {
if let Some(circle) = shape.as_circle() {
match brush {
PietGpuBrush::Solid(rgba_color) => {
let piet_circle = PietCircle {
rgba_color,
@ -149,8 +151,22 @@ impl RenderContext for PietGpuRenderContext {
self.push_item(PietItem::Circle(piet_circle), bbox);
}
_ => {}
},
None => {}
}
return;
}
let path = shape.to_bez_path(TOLERANCE);
let (n_points, points) = flatten_shape(&mut self.encoder, path);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let fill = PietFill {
flags: 0,
rgba_color,
n_points,
points,
};
self.push_item(PietItem::Fill(fill), bbox);
}
_ => (),
}
}