Experimenting with sort-middle

Starting a prototype that explores the sort-middle approach. This commit has a prefix sum pass computing state per element.
2025-01-09 20:31:29 +11:00 · 2020-05-11 20:01:06 -07:00 · 2020-05-11 20:01:06 -07:00 · 9a8854ffab
parent 8d01aba237
commit 9a8854ffab
14 changed files with 762 additions and 268 deletions
--- a/piet-gpu-types/src/lib.rs
+++ b/piet-gpu-types/src/lib.rs
@ -3,5 +3,6 @@ pub mod fill_seg;
 pub mod ptcl;
 pub mod scene;
 pub mod segment;
+pub mod state;
 pub mod test;
 pub mod tilegroup;
--- a/piet-gpu-types/src/main.rs
+++ b/piet-gpu-types/src/main.rs
@ -5,6 +5,7 @@ fn main() {
        .expect("provide a module name");
    match mod_name.as_str() {
        "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
+        "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
        "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
        "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
        "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
--- a/piet-gpu-types/src/scene.rs
+++ b/piet-gpu-types/src/scene.rs
@ -4,6 +4,8 @@ pub use self::scene::{
    Bbox, PietCircle, PietFill, PietItem, PietStrokeLine, PietStrokePolyLine, Point, SimpleGroup,
 };

+pub use self::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform};
+
 piet_gpu! {
    #[rust_encode]
    mod scene {
@ -51,5 +53,46 @@ piet_gpu! {
            Fill(PietFill),
            Poly(PietStrokePolyLine),
        }
+
+        // New approach follows (above to be deleted)
+        struct LineSeg {
+            p0: [f32; 2],
+            p1: [f32; 2],
+        }
+        struct QuadSeg {
+            p0: [f32; 2],
+            p1: [f32; 2],
+            p2: [f32; 2],
+        }
+        struct CubicSeg {
+            p0: [f32; 2],
+            p1: [f32; 2],
+            p2: [f32; 2],
+            p3: [f32; 2],
+        }
+        struct Fill {
+            rgba_color: u32,
+        }
+        struct Stroke {
+            rgba_color: u32,
+        }
+        struct SetLineWidth {
+            width: f32,
+        }
+        struct Transform {
+            mat: [f32; 4],
+            translate: [f32; 2],
+        }
+        enum Element {
+            Nop,
+            // The segments need a flag to indicate fill/stroke
+            Line(LineSeg),
+            Quad(QuadSeg),
+            Cubic(CubicSeg),
+            Stroke(Stroke),
+            Fill(Fill),
+            SetLineWidth(SetLineWidth),
+            Transform(Transform),
+        }
    }
 }
--- a/piet-gpu-types/src/state.rs
+++ b/piet-gpu-types/src/state.rs
@ -0,0 +1,14 @@
+use piet_gpu_derive::piet_gpu;
+
+piet_gpu! { // Declares the GPU-side `State` record; presumably the input for the generated shader/state.h (confirm against piet-gpu-derive).
+    #[gpu_write] // NOTE(review): looks like this requests generated write accessors (state.h has `State_write`) - confirm.
+    mod state {
+        struct State { // Five fields, 12 four-byte words in total (48 bytes, matching `State_size` in state.h).
+            mat: [f32; 4], // 2x2 linear part of the accumulated transform; elements.comp initializes it to (1, 0, 0, 1).
+            translate: [f32; 2], // Translation part of the accumulated transform.
+            bbox: [f32; 4], // Bounding box; elements.comp stores the minimum corner first, then the maximum corner.
+            linewidth: f32, // Most recently set line width.
+            flags: u32, // Bit flags; elements.comp defines FLAG_SET_LINEWIDTH = 1 and FLAG_RESET_BBOX = 2.
+        }
+    }
+}
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@ -5,7 +5,7 @@ use std::path::Path;
 use piet_gpu_hal::vulkan::VkInstance;
 use piet_gpu_hal::{CmdBuf, Device, Error, MemFlags};

-use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
+use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};

 #[allow(unused)]
 fn dump_scene(buf: &[u8]) {
@ -16,6 +16,24 @@ fn dump_scene(buf: &[u8]) {
    }
 }

+/// Debug helper: prints every 48-byte `State` record found in `buf`.
+/// Each record is decoded as 11 little-endian `f32` values followed by a little-endian `u32` flags word.
+#[allow(unused)] // only referenced from the commented-out readback in `main`
+fn dump_state(buf: &[u8]) {
+    // `chunks_exact` yields whole records only, so a trailing partial record is ignored.
+    for (i, rec) in buf.chunks_exact(48).enumerate() {
+        let word = |k: usize| [rec[k * 4], rec[k * 4 + 1], rec[k * 4 + 2], rec[k * 4 + 3]];
+        let floats = (0..11).map(|k| f32::from_le_bytes(word(k))).collect::<Vec<_>>();
+        // Decode the whole 4-byte flags word rather than just its lowest byte.
+        let flags = u32::from_le_bytes(word(11));
+        println!("{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}",
+            i,
+            floats[0], floats[1], floats[2], floats[3], floats[4], floats[5],
+            floats[6], floats[7], floats[8], floats[9],
+            floats[10], flags);
+    }
+}
+
 fn main() -> Result<(), Error> {
    let (instance, _) = VkInstance::new(None)?;
    unsafe {
@ -23,7 +41,7 @@ fn main() -> Result<(), Error> {

        let fence = device.create_fence(false)?;
        let mut cmd_buf = device.create_cmd_buf()?;
-        let query_pool = device.create_query_pool(6)?;
+        let query_pool = device.create_query_pool(2)?;

        let mut ctx = PietGpuRenderContext::new();
        render_scene(&mut ctx);
@ -31,7 +49,8 @@ fn main() -> Result<(), Error> {
        //dump_scene(&scene);

        let renderer = Renderer::new(&device, scene)?;
-        let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
+        let image_buf =
+            device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;

        cmd_buf.begin();
        renderer.record(&mut cmd_buf, &query_pool);
@ -40,28 +59,12 @@ fn main() -> Result<(), Error> {
        device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
        device.wait_and_reset(&[fence])?;
        let timestamps = device.reap_query_pool(&query_pool).unwrap();
-        println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
-        println!(
-            "Kernel 2s time: {:.3}ms",
-            (timestamps[1] - timestamps[0]) * 1e3
-        );
-        println!(
-            "Kernel 2f time: {:.3}ms",
-            (timestamps[2] - timestamps[1]) * 1e3
-        );
-        println!(
-            "Kernel 3 time: {:.3}ms",
-            (timestamps[3] - timestamps[2]) * 1e3
-        );
-        println!(
-            "Render time: {:.3}ms",
-            (timestamps[4] - timestamps[3]) * 1e3
-        );
+        println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3);

        /*
-        let mut k1_data: Vec<u32> = Default::default();
-        device.read_buffer(&segment_buf, &mut k1_data).unwrap();
-        dump_k1_data(&k1_data);
+        let mut data: Vec<u8> = Default::default();
+        device.read_buffer(&renderer.state_buf, &mut data).unwrap();
+        dump_state(&data);
        */

        let mut img_data: Vec<u8> = Default::default();
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@ -1,7 +1,7 @@
 use piet_gpu_hal::vulkan::VkInstance;
 use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout};

-use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
+use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};

 use winit::{
    event::{Event, WindowEvent},
@ -69,7 +69,8 @@ fn main() -> Result<(), Error> {
                        device.wait_and_reset(&[frame_fences[frame_idx]]).unwrap();

                        let timestamps = device.reap_query_pool(query_pool).unwrap();
-                        window.set_title(&format!("k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms",
+                        window.set_title(&format!(
+                            "k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms",
                            timestamps[0] * 1e3,
                            (timestamps[1] - timestamps[0]) * 1e3,
                            (timestamps[2] - timestamps[1]) * 1e3,
@ -93,11 +94,7 @@ fn main() -> Result<(), Error> {
                        ImageLayout::BlitDst,
                    );
                    cmd_buf.blit_image(&renderer.image_dev, &swap_image);
-                    cmd_buf.image_barrier(
-                        &swap_image,
-                        ImageLayout::BlitDst,
-                        ImageLayout::Present,
-                    );
+                    cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
                    cmd_buf.finish();

                    device
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -18,3 +18,6 @@ build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
 build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h

 build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
+
+
+build elements.spv: glsl elements.comp | scene.h state.h
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@ -0,0 +1,173 @@
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#define N_ROWS 4
+#define WG_SIZE 32
+#define LG_WG_SIZE 5
+#define TILE_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(set = 0, binding = 0) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+// This will be used for inter-workgroup aggregates
+layout(set = 0, binding = 1) buffer StateBuf {
+    uint[] state;
+};
+
+#include "scene.h"
+#include "state.h"
+
+#define FLAG_SET_LINEWIDTH 1
+#define FLAG_RESET_BBOX 2
+
+// This is almost like a monoid (the interaction between transformation and
+// bounding boxes is approximate). Combines state `a` (earlier in the scene) with state `b` (what follows it).
+State combine_state(State a, State b) {
+    State c; // c.bbox starts as b.bbox mapped through a's transform; per-term min/max keeps it an axis-aligned box.
+    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
+    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
+    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
+    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
+    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { // a has no reset and b's bbox is empty
+        c.bbox = a.bbox; // Keep a's bbox unchanged.
+    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) { // a has no reset and a non-empty bbox
+        c.bbox.xy = min(a.bbox.xy, c.bbox.xy); // Union of a's bbox with the transformed b bbox.
+        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
+    } // Otherwise the transformed b bbox stands on its own.
+    // It would be more concise to cast to matrix types; ah well. (2x2 product a.mat * b.mat, with (x, y) and (z, w) as columns.)
+    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
+    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
+    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
+    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
+    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x; // b's translation mapped through a's transform.
+    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
+    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; // b's width wins only if b set one.
+    c.flags = a.flags | b.flags; // Flags accumulate across the combination.
+    return c;
+}
+
+State map_element(ElementRef ref) { // Builds the State contribution of the single scene element at `ref`.
+    // TODO: it would *probably* be more efficient to make the memory read patterns less
+    // divergent, though it would be more wasted memory.
+    uint tag = Element_tag(ref); // Element variant tag (one of the Element_* constants from scene.h).
+    State c; // Defaults: zero bbox, identity matrix, zero translation and line width, no flags.
+    c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
+    c.mat = vec4(1.0, 0.0, 0.0, 1.0);
+    c.translate = vec2(0.0, 0.0);
+    c.linewidth = 0.0;
+    c.flags = 0;
+    switch (tag) {
+    case Element_Line: // bbox spans the two endpoints.
+        LineSeg line = Element_Line_read(ref);
+        c.bbox.xy = min(line.p0, line.p1);
+        c.bbox.zw = max(line.p0, line.p1);
+        break;
+    case Element_Quad: // bbox spans the three points p0..p2.
+        QuadSeg quad = Element_Quad_read(ref);
+        c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
+        c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
+        break;
+    case Element_Cubic: // bbox spans the four points p0..p3.
+        CubicSeg cubic = Element_Cubic_read(ref);
+        c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
+        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
+        break;
+    case Element_Fill: // Falls through: fill and stroke are handled identically here.
+    case Element_Stroke: // Only the flag is set; the element's color payload is not read in this function.
+        c.flags = FLAG_RESET_BBOX;
+        break;
+    case Element_SetLineWidth: // Records the new width and marks it as explicitly set.
+        SetLineWidth lw = Element_SetLineWidth_read(ref);
+        c.linewidth = lw.width;
+        c.flags = FLAG_SET_LINEWIDTH;
+        break;
+    case Element_Transform: // Replaces the identity defaults with the element's matrix and translation.
+        Transform t = Element_Transform_read(ref);
+        c.mat = t.mat;
+        c.translate = t.translate;
+        break;
+    } // No default case: Element_Nop and any other tag keep the defaults set above.
+    return c;
+}
+
+// We should be able to use an array of structs but the NV shader compiler
+// doesn't seem to like it :/
+//shared State sh_state[WG_SIZE];
+shared vec4 sh_mat[WG_SIZE];
+shared vec2 sh_translate[WG_SIZE];
+shared vec4 sh_bbox[WG_SIZE];
+shared float sh_width[WG_SIZE];
+shared uint sh_flags[WG_SIZE];
+
+void main() { // Computes a per-element State prefix within one tile of TILE_SIZE elements and writes it to the state buffer.
+    State th_state[N_ROWS]; // Running (inclusive) states over this invocation's N_ROWS consecutive elements.
+    // this becomes an atomic counter
+    uint tile_ix = gl_WorkGroupID.x;
+
+    uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS; // Index of this invocation's first element.
+    ElementRef ref = ElementRef(ix * Element_size);
+
+    th_state[0] = map_element(ref);
+    for (uint i = 1; i < N_ROWS; i++) {
+        // discussion question: would it be faster to load using more coherent patterns
+        // into thread memory? This is kinda strided.
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
+    }
+    State agg = th_state[N_ROWS - 1]; // Aggregate over all of this invocation's elements.
+    sh_mat[gl_LocalInvocationID.x] = agg.mat; // Publish the aggregate to the shared arrays, one field per array.
+    sh_translate[gl_LocalInvocationID.x] = agg.translate;
+    sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
+    sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+    sh_flags[gl_LocalInvocationID.x] = agg.flags;
+    for (uint i = 0; i < LG_WG_SIZE; i++) { // Round i folds in the aggregate published by the invocation (1 << i) slots earlier.
+        barrier(); // Shared writes from the previous round (or the initial publish) must land before they are read.
+        if (gl_LocalInvocationID.x >= (1 << i)) {
+            State other;
+            uint ix = gl_LocalInvocationID.x - (1 << i); // Shadows the outer `ix` inside this block only.
+            other.mat = sh_mat[ix];
+            other.translate = sh_translate[ix];
+            other.bbox = sh_bbox[ix];
+            other.linewidth = sh_width[ix];
+            other.flags = sh_flags[ix];
+            agg = combine_state(other, agg); // The earlier partner is the left operand, so order is preserved.
+        }
+        barrier(); // Every invocation must finish reading before any slot is overwritten.
+        sh_mat[gl_LocalInvocationID.x] = agg.mat;
+        sh_translate[gl_LocalInvocationID.x] = agg.translate;
+        sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
+        sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+        sh_flags[gl_LocalInvocationID.x] = agg.flags;
+    }
+
+    // TODO: if last invocation in wg, publish agg.
+
+    barrier(); // Make the final round's shared writes visible before the neighbour read below.
+    State exclusive; // Stand-in for the state contributed by preceding tiles; currently the same defaults map_element starts from.
+    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
+    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
+    exclusive.translate = vec2(0.0, 0.0);
+    exclusive.linewidth = 0.0;
+    exclusive.flags = 0;
+    // TODO: do decoupled look-back (until then, results do not include state from earlier tiles)
+
+    State row = exclusive; // State in effect just before this invocation's first element.
+    if (gl_LocalInvocationID.x > 0) {
+        uint ix = gl_LocalInvocationID.x - 1; // Previous invocation's slot; shadows the outer `ix` inside this block only.
+        State other;
+        other.mat = sh_mat[ix];
+        other.translate = sh_translate[ix];
+        other.bbox = sh_bbox[ix];
+        other.linewidth = sh_width[ix];
+        other.flags = sh_flags[ix];
+        row = combine_state(row, other);
+    }
+    for (uint i = 0; i < N_ROWS; i++) {
+        State this_state = combine_state(row, th_state[i]); // State for element ix + i, including that element itself.
+        // We write the state now for development purposes, but the
+        // actual goal is to write transformed and annotated elements.
+        State_write(StateRef((ix + i) * State_size), this_state);
+    }
+}
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
--- a/piet-gpu/shader/scene.h
+++ b/piet-gpu/shader/scene.h
@ -32,6 +32,38 @@ struct PietItemRef {
    uint offset;
 };

+struct LineSegRef {
+    uint offset;
+};
+
+struct QuadSegRef {
+    uint offset;
+};
+
+struct CubicSegRef {
+    uint offset;
+};
+
+struct FillRef {
+    uint offset;
+};
+
+struct StrokeRef {
+    uint offset;
+};
+
+struct SetLineWidthRef {
+    uint offset;
+};
+
+struct TransformRef {
+    uint offset;
+};
+
+struct ElementRef {
+    uint offset;
+};
+
 struct Bbox {
    ivec4 bbox;
 };
@ -128,6 +160,97 @@ PietItemRef PietItem_index(PietItemRef ref, uint index) {
    return PietItemRef(ref.offset + index * PietItem_size);
 }

+struct LineSeg {
+    vec2 p0;
+    vec2 p1;
+};
+
+#define LineSeg_size 16
+
+LineSegRef LineSeg_index(LineSegRef ref, uint index) {
+    return LineSegRef(ref.offset + index * LineSeg_size);
+}
+
+struct QuadSeg {
+    vec2 p0;
+    vec2 p1;
+    vec2 p2;
+};
+
+#define QuadSeg_size 24
+
+QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
+    return QuadSegRef(ref.offset + index * QuadSeg_size);
+}
+
+struct CubicSeg {
+    vec2 p0;
+    vec2 p1;
+    vec2 p2;
+    vec2 p3;
+};
+
+#define CubicSeg_size 32
+
+CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
+    return CubicSegRef(ref.offset + index * CubicSeg_size);
+}
+
+struct Fill {
+    uint rgba_color;
+};
+
+#define Fill_size 4
+
+FillRef Fill_index(FillRef ref, uint index) {
+    return FillRef(ref.offset + index * Fill_size);
+}
+
+struct Stroke {
+    uint rgba_color;
+};
+
+#define Stroke_size 4
+
+StrokeRef Stroke_index(StrokeRef ref, uint index) {
+    return StrokeRef(ref.offset + index * Stroke_size);
+}
+
+struct SetLineWidth {
+    float width;
+};
+
+#define SetLineWidth_size 4
+
+SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
+    return SetLineWidthRef(ref.offset + index * SetLineWidth_size);
+}
+
+struct Transform {
+    vec4 mat;
+    vec2 translate;
+};
+
+#define Transform_size 24
+
+TransformRef Transform_index(TransformRef ref, uint index) {
+    return TransformRef(ref.offset + index * Transform_size);
+}
+
+#define Element_Nop 0
+#define Element_Line 1
+#define Element_Quad 2
+#define Element_Cubic 3
+#define Element_Stroke 4
+#define Element_Fill 5
+#define Element_SetLineWidth 6
+#define Element_Transform 7
+#define Element_size 36
+
+ElementRef Element_index(ElementRef ref, uint index) {
+    return ElementRef(ref.offset + index * Element_size);
+}
+
 Bbox Bbox_read(BboxRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = scene[ix + 0];
@ -236,3 +359,118 @@ PietStrokePolyLine PietItem_Poly_read(PietItemRef ref) {
    return PietStrokePolyLine_read(PietStrokePolyLineRef(ref.offset + 4));
 }

+LineSeg LineSeg_read(LineSegRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = scene[ix + 0];
+    uint raw1 = scene[ix + 1];
+    uint raw2 = scene[ix + 2];
+    uint raw3 = scene[ix + 3];
+    LineSeg s;
+    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+QuadSeg QuadSeg_read(QuadSegRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = scene[ix + 0];
+    uint raw1 = scene[ix + 1];
+    uint raw2 = scene[ix + 2];
+    uint raw3 = scene[ix + 3];
+    uint raw4 = scene[ix + 4];
+    uint raw5 = scene[ix + 5];
+    QuadSeg s;
+    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
+    return s;
+}
+
+CubicSeg CubicSeg_read(CubicSegRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = scene[ix + 0];
+    uint raw1 = scene[ix + 1];
+    uint raw2 = scene[ix + 2];
+    uint raw3 = scene[ix + 3];
+    uint raw4 = scene[ix + 4];
+    uint raw5 = scene[ix + 5];
+    uint raw6 = scene[ix + 6];
+    uint raw7 = scene[ix + 7];
+    CubicSeg s;
+    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
+    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
+    return s;
+}
+
+Fill Fill_read(FillRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = scene[ix + 0];
+    Fill s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+Stroke Stroke_read(StrokeRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = scene[ix + 0];
+    Stroke s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = scene[ix + 0];
+    SetLineWidth s;
+    s.width = uintBitsToFloat(raw0);
+    return s;
+}
+
+Transform Transform_read(TransformRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = scene[ix + 0];
+    uint raw1 = scene[ix + 1];
+    uint raw2 = scene[ix + 2];
+    uint raw3 = scene[ix + 3];
+    uint raw4 = scene[ix + 4];
+    uint raw5 = scene[ix + 5];
+    Transform s;
+    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
+    return s;
+}
+
+uint Element_tag(ElementRef ref) {
+    return scene[ref.offset >> 2];
+}
+
+LineSeg Element_Line_read(ElementRef ref) {
+    return LineSeg_read(LineSegRef(ref.offset + 4));
+}
+
+QuadSeg Element_Quad_read(ElementRef ref) {
+    return QuadSeg_read(QuadSegRef(ref.offset + 4));
+}
+
+CubicSeg Element_Cubic_read(ElementRef ref) {
+    return CubicSeg_read(CubicSegRef(ref.offset + 4));
+}
+
+Stroke Element_Stroke_read(ElementRef ref) {
+    return Stroke_read(StrokeRef(ref.offset + 4));
+}
+
+Fill Element_Fill_read(ElementRef ref) {
+    return Fill_read(FillRef(ref.offset + 4));
+}
+
+SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
+    return SetLineWidth_read(SetLineWidthRef(ref.offset + 4));
+}
+
+Transform Element_Transform_read(ElementRef ref) {
+    return Transform_read(TransformRef(ref.offset + 4));
+}
+
--- a/piet-gpu/shader/state.h
+++ b/piet-gpu/shader/state.h
@ -0,0 +1,59 @@
+// Code auto-generated by piet-gpu-derive
+
+struct StateRef {
+    uint offset;
+};
+
+struct State {
+    vec4 mat;
+    vec2 translate;
+    vec4 bbox;
+    float linewidth;
+    uint flags;
+};
+
+#define State_size 48
+
+StateRef State_index(StateRef ref, uint index) {
+    return StateRef(ref.offset + index * State_size);
+}
+
+State State_read(StateRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = state[ix + 0];
+    uint raw1 = state[ix + 1];
+    uint raw2 = state[ix + 2];
+    uint raw3 = state[ix + 3];
+    uint raw4 = state[ix + 4];
+    uint raw5 = state[ix + 5];
+    uint raw6 = state[ix + 6];
+    uint raw7 = state[ix + 7];
+    uint raw8 = state[ix + 8];
+    uint raw9 = state[ix + 9];
+    uint raw10 = state[ix + 10];
+    uint raw11 = state[ix + 11];
+    State s;
+    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
+    s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
+    s.linewidth = uintBitsToFloat(raw10);
+    s.flags = raw11;
+    return s;
+}
+
+void State_write(StateRef ref, State s) {
+    uint ix = ref.offset >> 2;
+    state[ix + 0] = floatBitsToUint(s.mat.x);
+    state[ix + 1] = floatBitsToUint(s.mat.y);
+    state[ix + 2] = floatBitsToUint(s.mat.z);
+    state[ix + 3] = floatBitsToUint(s.mat.w);
+    state[ix + 4] = floatBitsToUint(s.translate.x);
+    state[ix + 5] = floatBitsToUint(s.translate.y);
+    state[ix + 6] = floatBitsToUint(s.bbox.x);
+    state[ix + 7] = floatBitsToUint(s.bbox.y);
+    state[ix + 8] = floatBitsToUint(s.bbox.z);
+    state[ix + 9] = floatBitsToUint(s.bbox.w);
+    state[ix + 10] = floatBitsToUint(s.linewidth);
+    state[ix + 11] = s.flags;
+}
+
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -1,5 +1,5 @@
-mod render_ctx;
 mod pico_svg;
+mod render_ctx;

 pub use render_ctx::PietGpuRenderContext;

@ -8,6 +8,8 @@ use rand::{Rng, RngCore};
 use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
 use piet::{Color, RenderContext};

+use piet_gpu_types::encoder::Encode;
+
 use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout, MemFlags};

 use pico_svg::PicoSvg;
@ -110,6 +112,12 @@ pub struct Renderer<D: Device> {
    scene_buf: D::Buffer,
    scene_dev: D::Buffer,

+    pub state_buf: D::Buffer,
+
+    el_pipeline: D::Pipeline,
+    el_ds: D::DescriptorSet,
+
+    /*
    k1_alloc_buf_host: D::Buffer,
    k1_alloc_buf_dev: D::Buffer,
    k2s_alloc_buf_host: D::Buffer,
@ -131,6 +139,8 @@ pub struct Renderer<D: Device> {
    k3_ds: D::DescriptorSet,
    k4_pipeline: D::Pipeline,
    k4_ds: D::DescriptorSet,
+    */
+    n_elements: usize,
 }

 impl<D: Device> Renderer<D> {
@ -146,175 +156,123 @@ impl<D: Device> Renderer<D> {
            .unwrap();
        device.write_buffer(&scene_buf, &scene)?;

+        let state_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
+        let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
+
+        let el_code = include_bytes!("../shader/elements.spv");
+        let el_pipeline = device.create_simple_compute_pipeline(el_code, 2, 0)?;
+        let el_ds = device.create_descriptor_set(
+            &el_pipeline,
+            &[&scene_dev, &state_buf],
+            &[],
+        )?;
+
+        let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
+        println!("scene: {} elements", n_elements);
+
+        /*
        let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
        let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
        let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
-        let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;

        let k1_alloc_buf_host = device.create_buffer(4, host)?;
        let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
        let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
        device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
        let k1_code = include_bytes!("../shader/kernel1.spv");
-        let k1_pipeline = device
-            .create_simple_compute_pipeline(k1_code, 3, 0)?;
-        let k1_ds = device
-            .create_descriptor_set(
-                &k1_pipeline,
-                &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
-                &[],
-            )?;
+        let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?;
+        let k1_ds = device.create_descriptor_set(
+            &k1_pipeline,
+            &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
+            &[],
+        )?;

        let k2s_alloc_buf_host = device.create_buffer(4, host)?;
        let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
        let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
-        device
-            .write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
-            ?;
+        device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?;
        let k2s_code = include_bytes!("../shader/kernel2s.spv");
-        let k2s_pipeline = device
-            .create_simple_compute_pipeline(k2s_code, 4, 0)
-            ?;
-        let k2s_ds = device
-            .create_descriptor_set(
-                &k2s_pipeline,
-                &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
-                &[],
-            )
-            ?;
+        let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?;
+        let k2s_ds = device.create_descriptor_set(
+            &k2s_pipeline,
+            &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
+            &[],
+        )?;

        let k2f_alloc_buf_host = device.create_buffer(4, host)?;
        let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
        let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
-        device
-            .write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])
-            ?;
+        device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?;
        let k2f_code = include_bytes!("../shader/kernel2f.spv");
        let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
-        let k2f_ds = device
-            .create_descriptor_set(
-                &k2f_pipeline,
-                &[
-                    &scene_dev,
-                    &tilegroup_buf,
-                    &fill_seg_buf,
-                    &k2f_alloc_buf_dev,
-                ],
-                &[],
-            )
-            ?;
+        let k2f_ds = device.create_descriptor_set(
+            &k2f_pipeline,
+            &[
+                &scene_dev,
+                &tilegroup_buf,
+                &fill_seg_buf,
+                &k2f_alloc_buf_dev,
+            ],
+            &[],
+        )?;

        let k3_alloc_buf_host = device.create_buffer(4, host)?;
        let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
        let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
-        device
-            .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
-            ?;
+        device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?;
        let k3_code = include_bytes!("../shader/kernel3.spv");
        let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
-        let k3_ds = device
-            .create_descriptor_set(
-                &k3_pipeline,
-                &[
-                    &scene_dev,
-                    &tilegroup_buf,
-                    &segment_buf,
-                    &fill_seg_buf,
-                    &ptcl_buf,
-                    &k3_alloc_buf_dev,
-                ],
-                &[],
-            )
-            ?;
+        let k3_ds = device.create_descriptor_set(
+            &k3_pipeline,
+            &[
+                &scene_dev,
+                &tilegroup_buf,
+                &segment_buf,
+                &fill_seg_buf,
+                &ptcl_buf,
+                &k3_alloc_buf_dev,
+            ],
+            &[],
+        )?;

        let k4_code = include_bytes!("../shader/kernel4.spv");
        let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
-        let k4_ds = device
-            .create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev])
-            ?;
+        let k4_ds = device.create_descriptor_set(
+            &k4_pipeline,
+            &[&ptcl_buf, &segment_buf, &fill_seg_buf],
+            &[&image_dev],
+        )?;
+        */

        Ok(Renderer {
            scene_buf,
            scene_dev,
            image_dev,
-            k1_alloc_buf_host,
-            k1_alloc_buf_dev,
-            k2s_alloc_buf_host,
-            k2s_alloc_buf_dev,
-            k2f_alloc_buf_host,
-            k2f_alloc_buf_dev,
-            k3_alloc_buf_host,
-            k3_alloc_buf_dev,
-            tilegroup_buf,
-            ptcl_buf,
-            k1_pipeline,
-            k1_ds,
-            k2s_pipeline,
-            k2s_ds,
-            k2f_pipeline,
-            k2f_ds,
-            k3_pipeline,
-            k3_ds,
-            k4_pipeline,
-            k4_ds,
+            el_pipeline,
+            el_ds,
+            state_buf,
+            n_elements,
        })
    }

    pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
        cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
-        // Note: we could use one alloc buf and reuse it. But we'll stick with
-        // multiple ones for clarity.
-        cmd_buf.copy_buffer(&self.k1_alloc_buf_host, &self.k1_alloc_buf_dev);
-        cmd_buf.copy_buffer(&self.k2s_alloc_buf_host, &self.k2s_alloc_buf_dev);
-        cmd_buf.copy_buffer(&self.k2f_alloc_buf_host, &self.k2f_alloc_buf_dev);
-        cmd_buf.copy_buffer(&self.k3_alloc_buf_host, &self.k3_alloc_buf_dev);
-        // Note: these clears aren't necessary, and are here to make inspection
-        // of the buffers cleaner. Can likely be removed.
-        cmd_buf.clear_buffer(&self.tilegroup_buf);
-        cmd_buf.clear_buffer(&self.ptcl_buf);
        cmd_buf.memory_barrier();
-        cmd_buf.image_barrier(&self.image_dev, ImageLayout::Undefined, ImageLayout::General);
+        cmd_buf.image_barrier(
+            &self.image_dev,
+            ImageLayout::Undefined,
+            ImageLayout::General,
+        );
        cmd_buf.reset_query_pool(&query_pool);
        cmd_buf.write_timestamp(&query_pool, 0);
        cmd_buf.dispatch(
-            &self.k1_pipeline,
-            &self.k1_ds,
-            ((WIDTH / 512) as u32, (HEIGHT / 512) as u32, 1),
+            &self.el_pipeline,
+            &self.el_ds,
+            ((self.n_elements / 128) as u32, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 1);
        cmd_buf.memory_barrier();
-        cmd_buf.dispatch(
-            &self.k2s_pipeline,
-            &self.k2s_ds,
-            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
-        );
-        cmd_buf.write_timestamp(&query_pool, 2);
-        // Note: this barrier is not necessary (k2f does not depend on
-        // k2s output), but I'm keeping it here to increase transparency
-        // of performance.
-        cmd_buf.memory_barrier();
-        cmd_buf.dispatch(
-            &self.k2f_pipeline,
-            &self.k2f_ds,
-            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 2),
-        );
-        cmd_buf.write_timestamp(&query_pool, 3);
-        cmd_buf.memory_barrier();
-        cmd_buf.dispatch(
-            &self.k3_pipeline,
-            &self.k3_ds,
-            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 3),
-        );
-        cmd_buf.write_timestamp(&query_pool, 4);
-        cmd_buf.memory_barrier();
-        cmd_buf.dispatch(
-            &self.k4_pipeline,
-            &self.k4_ds,
-            ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
-        );
-        cmd_buf.write_timestamp(&query_pool, 5);
-        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
    }
 }
--- a/piet-gpu/src/pico_svg.rs
+++ b/piet-gpu/src/pico_svg.rs
@ -41,10 +41,14 @@ impl PicoSvg {
                let path = Affine::scale(scale) * bp;
                if let Some(fill_color) = el.attribute("fill") {
                    let color = parse_color(fill_color);
-                    items.push(Item::Fill(FillItem { color, path: path.clone() }));
+                    items.push(Item::Fill(FillItem {
+                        color,
+                        path: path.clone(),
+                    }));
                }
                if let Some(stroke_color) = el.attribute("stroke") {
-                    let width = f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?;
+                    let width =
+                        f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?;
                    let color = parse_color(stroke_color);
                    items.push(Item::Stroke(StrokeItem { width, color, path }));
                }
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@ -2,7 +2,11 @@ use std::borrow::Cow;

 use piet_gpu_types::encoder::{Encode, Encoder, Ref};
 use piet_gpu_types::scene;
-use piet_gpu_types::scene::{Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup};
+use piet_gpu_types::scene::{
+    Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup,
+};
+
+use piet_gpu_types::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke};

 use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};

@ -27,10 +31,10 @@ pub struct PietGpuText;

 pub struct PietGpuRenderContext {
    encoder: Encoder,
-    bboxes: Vec<Bbox>,
-    items: Vec<PietItem>,
+    elements: Vec<Element>,
    // Will probably need direct access to hal Device to create images etc.
    inner_text: PietGpuText,
+    stroke_width: f32,
 }

 #[derive(Clone)]
@ -43,47 +47,22 @@ const TOLERANCE: f64 = 0.25;

 impl PietGpuRenderContext {
+    /// Creates a render context with a fresh encoder, an empty element list,
+    /// and a tracked stroke width of 0.0.
     pub fn new() -> PietGpuRenderContext {
-        let mut encoder = Encoder::new();
-        let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32);
-        let bboxes = Vec::new();
-        let items = Vec::new();
+        let encoder = Encoder::new();
+        let elements = Vec::new();
         let inner_text = PietGpuText;
+        // `stroke` compares incoming widths against this value to decide
+        // whether a `SetLineWidth` element has to be pushed.
+        let stroke_width = 0.0;
         PietGpuRenderContext {
             encoder,
-            bboxes,
-            items,
+            elements,
             inner_text,
+            stroke_width,
         }
     }

+    /// Encodes the accumulated elements into the encoder and returns the
+    /// encoder's byte buffer.
+    ///
+    /// NOTE(review): `encode` runs on every call; presumably a second call
+    /// would encode the same elements again. Confirm against `Encoder`.
     pub fn get_scene_buf(&mut self) -> &[u8] {
-        let n_items = self.bboxes.len() as u32;
-        let bboxes = self.bboxes.encode(&mut self.encoder).transmute();
-        let items = self.items.encode(&mut self.encoder).transmute();
-        let offset = scene::Point { xy: [0.0, 0.0] };
-        let simple_group = SimpleGroup {
-            n_items,
-            bboxes,
-            items,
-            offset,
-        };
-        let root_item = PietItem::Group(simple_group);
-        root_item.encode_to(&mut self.encoder.buf_mut()[0..PietItem::fixed_size()]);
+        self.elements.encode(&mut self.encoder);
         self.encoder.buf()
     }
-
-    fn push_item(&mut self, item: PietItem, bbox: Rect) {
-        let scene_bbox = Bbox {
-            bbox: [
-                bbox.x0.floor() as i16,
-                bbox.y0.floor() as i16,
-                bbox.x1.ceil() as i16,
-                bbox.y1.ceil() as i16,
-            ],
-        };
-        self.items.push(item);
-        self.bboxes.push(scene_bbox);
-    }
 }

 impl RenderContext for PietGpuRenderContext {
@ -107,20 +86,19 @@ impl RenderContext for PietGpuRenderContext {
    fn clear(&mut self, _color: Color) {}

    fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
-        let bbox = shape.bounding_box();
-        let brush = brush.make_brush(self, || bbox).into_owned();
+        let width = width as f32;
+        if self.stroke_width != width {
+            self.elements
+                .push(Element::SetLineWidth(SetLineWidth { width }));
+            self.stroke_width = width;
+        }
+        let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
        let path = shape.to_bez_path(TOLERANCE);
-        let (n_points, points) = flatten_shape(&mut self.encoder, path);
+        self.encode_path(path);
        match brush {
            PietGpuBrush::Solid(rgba_color) => {
-                let poly_line = PietStrokePolyLine {
-                    rgba_color,
-                    width: width as f32,
-                    n_points,
-                    points,
-                };
-                let bbox = bbox.inset(-0.5 * width);
-                self.push_item(PietItem::Poly(poly_line), bbox);
+                let stroke = Stroke { rgba_color };
+                self.elements.push(Element::Stroke(stroke));
            }
            _ => (),
        }
@ -136,35 +114,13 @@ impl RenderContext for PietGpuRenderContext {
    }

    fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
-        let bbox = shape.bounding_box();
        let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
-
-        if let Some(circle) = shape.as_circle() {
-            match brush {
-                PietGpuBrush::Solid(rgba_color) => {
-                    let piet_circle = PietCircle {
-                        rgba_color,
-                        center: to_scene_point(circle.center),
-                        radius: circle.radius as f32,
-                    };
-                    let bbox = circle.bounding_box();
-                    self.push_item(PietItem::Circle(piet_circle), bbox);
-                }
-                _ => {}
-            }
-            return;
-        }
        let path = shape.to_bez_path(TOLERANCE);
-        let (n_points, points) = flatten_shape(&mut self.encoder, path);
+        self.encode_path(path);
        match brush {
            PietGpuBrush::Solid(rgba_color) => {
-                let fill = PietFill {
-                    flags: 0,
-                    rgba_color,
-                    n_points,
-                    points,
-                };
-                self.push_item(PietItem::Fill(fill), bbox);
+                let fill = Fill { rgba_color };
+                self.elements.push(Element::Fill(fill));
            }
            _ => (),
        }
@ -241,45 +197,96 @@ impl RenderContext for PietGpuRenderContext {
    }
 }

-fn flatten_shape(
-    encoder: &mut Encoder,
-    path: impl Iterator<Item = PathEl>,
-) -> (u32, Ref<scene::Point>) {
-    let mut points = Vec::new();
-    let mut start_pt = None;
-    let mut last_pt = None;
-    piet::kurbo::flatten(path, TOLERANCE, |el| {
-        match el {
-            PathEl::MoveTo(p) => {
-                let scene_pt = to_scene_point(p);
-                start_pt = Some(clone_scene_pt(&scene_pt));
-                if !points.is_empty() {
-                    points.push(scene::Point {
-                        xy: [std::f32::NAN, std::f32::NAN],
-                    });
+impl PietGpuRenderContext {
+    /// Appends the segments of `path` to `self.elements`.
+    ///
+    /// Lines, quadratics and cubics become `Element::Line`, `Element::Quad`
+    /// and `Element::Cubic`. `ClosePath` adds a line from the current point
+    /// back to the point of the most recent `MoveTo`, unless the two already
+    /// coincide.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a drawing element arrives while there is no current point,
+    /// i.e. before any `MoveTo`, or directly after a `ClosePath` (closing
+    /// clears the current point).
+    fn encode_path(&mut self, path: impl Iterator<Item = PathEl>) {
+        // Hard-wired switch: when true, curves are flattened to lines on the
+        // CPU through kurbo; when false, curves are emitted as-is.
+        let flatten = false;
+        if flatten {
+            let mut start_pt = None;
+            let mut last_pt = None;
+            piet::kurbo::flatten(path, TOLERANCE, |el| {
+                match el {
+                    PathEl::MoveTo(p) => {
+                        let scene_pt = to_f32_2(p);
+                        // Remember where the subpath begins so that
+                        // `ClosePath` can draw back to it.
+                        start_pt = Some(scene_pt);
+                        last_pt = Some(scene_pt);
+                    }
+                    PathEl::LineTo(p) => {
+                        let scene_pt = to_f32_2(p);
+                        let seg = LineSeg {
+                            p0: last_pt.unwrap(),
+                            p1: scene_pt,
+                        };
+                        self.elements.push(Element::Line(seg));
+                        last_pt = Some(scene_pt);
+                    }
+                    PathEl::ClosePath => {
+                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
+                            // No closing segment is needed when the subpath
+                            // already ends at its start point.
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.elements.push(Element::Line(seg));
+                            }
+                        }
+                    }
+                    _ => (),
                 }
-                last_pt = Some(clone_scene_pt(&scene_pt));
-                points.push(scene_pt);
-            }
-            PathEl::LineTo(p) => {
-                let scene_pt = to_scene_point(p);
-                last_pt = Some(clone_scene_pt(&scene_pt));
-                points.push(scene_pt);
-            }
-            PathEl::ClosePath => {
-                if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
-                    if start.xy != last.xy {
-                        points.push(start);
+                //println!("{:?}", el);
+            });
+        } else {
+            let mut start_pt = None;
+            let mut last_pt = None;
+            for el in path {
+                match el {
+                    PathEl::MoveTo(p) => {
+                        let scene_pt = to_f32_2(p);
+                        // Remember where the subpath begins so that
+                        // `ClosePath` can draw back to it.
+                        start_pt = Some(scene_pt);
+                        last_pt = Some(scene_pt);
+                    }
+                    PathEl::LineTo(p) => {
+                        let scene_pt = to_f32_2(p);
+                        let seg = LineSeg {
+                            p0: last_pt.unwrap(),
+                            p1: scene_pt,
+                        };
+                        self.elements.push(Element::Line(seg));
+                        last_pt = Some(scene_pt);
+                    }
+                    PathEl::QuadTo(p1, p2) => {
+                        let scene_p1 = to_f32_2(p1);
+                        let scene_p2 = to_f32_2(p2);
+                        let seg = QuadSeg {
+                            p0: last_pt.unwrap(),
+                            p1: scene_p1,
+                            p2: scene_p2,
+                        };
+                        self.elements.push(Element::Quad(seg));
+                        last_pt = Some(scene_p2);
+                    }
+                    PathEl::CurveTo(p1, p2, p3) => {
+                        let scene_p1 = to_f32_2(p1);
+                        let scene_p2 = to_f32_2(p2);
+                        let scene_p3 = to_f32_2(p3);
+                        let seg = CubicSeg {
+                            p0: last_pt.unwrap(),
+                            p1: scene_p1,
+                            p2: scene_p2,
+                            p3: scene_p3,
+                        };
+                        self.elements.push(Element::Cubic(seg));
+                        last_pt = Some(scene_p3);
+                    }
+                    PathEl::ClosePath => {
+                        if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
+                            // No closing segment is needed when the subpath
+                            // already ends at its start point.
+                            if last != start {
+                                let seg = LineSeg {
+                                    p0: last,
+                                    p1: start,
+                                };
+                                self.elements.push(Element::Line(seg));
+                            }
+                        }
                     }
                 }
+                //println!("{:?}", el);
             }
-            _ => (),
         }
-        //println!("{:?}", el);
-    });
-    let n_points = points.len() as u32;
-    let points_ref = points.encode(encoder).transmute();
-    (n_points, points_ref)
+    }
 }

 impl Text for PietGpuText {
@ -360,13 +367,6 @@ impl IntoBrush<PietGpuRenderContext> for PietGpuBrush {
    }
 }

-fn to_scene_point(point: Point) -> scene::Point {
-    scene::Point {
-        xy: [point.x as f32, point.y as f32],
-    }
-}
-
-// TODO: allow #[derive(Clone)] in piet-gpu-derive.
-fn clone_scene_pt(p: &scene::Point) -> scene::Point {
-    scene::Point { xy: p.xy }
+/// Converts a `Point` into an `[x, y]` array, narrowing each coordinate
+/// to `f32` with an `as` cast.
+fn to_f32_2(point: Point) -> [f32; 2] {
+    let Point { x, y } = point;
+    [x as f32, y as f32]
 }