Start work on new element pipeline

There's a bit of reorganizing as well. Shader stages are made available from piet-gpu to the test rig, config is now a proper structure (marshaled with bytemuck). This commit just has the transform stage, which is a simple monoid scan of affine transforms. Progress toward #119
2025-01-09 20:31:29 +11:00 · 2021-11-23 07:28:50 -08:00 · 2021-11-23 07:28:50 -08:00 · 47f8812e2f
parent a8103a4c20
commit 47f8812e2f
36 changed files with 1674 additions and 23 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -96,6 +96,20 @@ name = "bytemuck"
 version = "1.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b"
+dependencies = [
+ "bytemuck_derive",
+]
+
+[[package]]
+name = "bytemuck_derive"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e215f8c2f9f79cb53c8335e687ffd07d5bfcb6fe5fc80723762d0be46e7cc54"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]

 [[package]]
 name = "byteorder"
@ -872,6 +886,7 @@ dependencies = [
 name = "piet-gpu"
 version = "0.1.0"
 dependencies = [
+ "bytemuck",
 "clap",
 "ndk",
 "ndk-glue",
@ -920,7 +935,10 @@ version = "0.1.0"
 dependencies = [
 "bytemuck",
 "clap",
+ "kurbo",
+ "piet-gpu",
 "piet-gpu-hal",
+ "rand",
 ]

 [[package]]
--- a/piet-gpu/Cargo.toml
+++ b/piet-gpu/Cargo.toml
@ -33,6 +33,7 @@ roxmltree = "0.13"
 winit = "0.25"
 clap = "2.33"
 swash = "0.1.4"
+bytemuck = { version = "1.7.2", features = ["derive"] }

 [target.'cfg(target_os = "android")'.dependencies]
 ndk = "0.3"
--- a/piet-gpu/shader/backdrop.spv
+++ b/piet-gpu/shader/backdrop.spv
--- a/piet-gpu/shader/backdrop_lg.spv
+++ b/piet-gpu/shader/backdrop_lg.spv
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -1,12 +1,27 @@
 # Build file for shaders.

-# You must have glslangValidator in your path, or patch here.
+# You must have Vulkan tools in your path, or patch here.

 glslang_validator = glslangValidator
+spirv_cross = spirv-cross
+dxc = dxc
+
+# See https://github.com/KhronosGroup/SPIRV-Cross/issues/1248 for
+# why we set this.
+msl_flags = --msl-decoration-binding

 rule glsl
  command = $glslang_validator $flags -V -o $out $in

+rule hlsl
+  command = $spirv_cross --hlsl $in --output $out
+
+rule dxil
+  command = $dxc -T cs_6_0 $in -Fo $out
+
+rule msl
+  command = $spirv_cross --msl $in --output $out $msl_flags
+

 build elements.spv: glsl elements.comp | scene.h state.h annotated.h

@ -24,3 +39,21 @@ build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h

 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
+
+# New element pipeline follows
+
+build gen/transform_reduce.spv: glsl transform_reduce.comp | scene.h setup.h mem.h
+build gen/transform_reduce.hlsl: hlsl gen/transform_reduce.spv
+build gen/transform_reduce.dxil: dxil gen/transform_reduce.hlsl
+build gen/transform_reduce.msl: msl gen/transform_reduce.spv
+
+build gen/transform_root.spv: glsl transform_scan.comp
+  flags = -DROOT
+build gen/transform_root.hlsl: hlsl gen/transform_root.spv
+build gen/transform_root.dxil: dxil gen/transform_root.hlsl
+build gen/transform_root.msl: msl gen/transform_root.spv
+
+build gen/transform_leaf.spv: glsl transform_leaf.comp | scene.h tile.h setup.h mem.h
+build gen/transform_leaf.hlsl: hlsl gen/transform_leaf.spv
+build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl
+build gen/transform_leaf.msl: msl gen/transform_leaf.spv
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@ -0,0 +1,219 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct TransformSegRef
+{
+    uint offset;
+};
+
+struct TransformSeg
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
+
+RWByteAddressBuffer _71 : register(u0);
+ByteAddressBuffer _96 : register(t2);
+ByteAddressBuffer _278 : register(t1);
+ByteAddressBuffer _377 : register(t3);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Transform sh_scratch[512];
+
+Transform Transform_read(TransformRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _96.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _96.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _96.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _96.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _96.Load((ix + 4u) * 4 + 0);
+    uint raw5 = _96.Load((ix + 5u) * 4 + 0);
+    Transform s;
+    s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.translate = float2(asfloat(raw4), asfloat(raw5));
+    return s;
+}
+
+TransformRef Transform_index(TransformRef ref, uint index)
+{
+    TransformRef _85 = { ref.offset + (index * 24u) };
+    return _85;
+}
+
+Transform combine_monoid(Transform a, Transform b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+Transform monoid_identity()
+{
+    return _224;
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _71.Store(offset * 4 + 8, val);
+}
+
+void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.mat.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.mat.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.mat.z);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.mat.w);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.translate.x);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = asuint(s.translate.y);
+    write_mem(param_15, param_16, param_17);
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef _285 = { _278.Load(44) + (ix * 24u) };
+    TransformRef ref = _285;
+    TransformRef param = ref;
+    Transform agg = Transform_read(param);
+    Transform local[8];
+    local[0] = agg;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3);
+        agg = combine_monoid(param_4, param_5);
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_6 = other;
+            Transform param_7 = agg;
+            agg = combine_monoid(param_6, param_7);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    Transform row = monoid_identity();
+    if (gl_WorkGroupID.x > 0u)
+    {
+        Transform _383;
+        _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0));
+        _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16));
+        row.mat = _383.mat;
+        row.translate = _383.translate;
+    }
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        Transform param_8 = row;
+        Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_monoid(param_8, param_9);
+    }
+    Alloc param_12;
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_10 = row;
+        Transform param_11 = local[i_2];
+        Transform m = combine_monoid(param_10, param_11);
+        TransformSeg _423 = { m.mat, m.translate };
+        TransformSeg transform = _423;
+        TransformSegRef _433 = { _278.Load(36) + ((ix + i_2) * 24u) };
+        TransformSegRef trans_ref = _433;
+        Alloc _437;
+        _437.offset = _278.Load(36);
+        param_12.offset = _437.offset;
+        TransformSegRef param_13 = trans_ref;
+        TransformSeg param_14 = transform;
+        TransformSeg_write(param_12, param_13, param_14);
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
--- a/piet-gpu/shader/gen/transform_leaf.msl
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@ -0,0 +1,272 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct TransformSegRef
+{
+    uint offset;
+};
+
+struct TransformSeg
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct SceneBuf
+{
+    uint scene[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+struct Transform_1
+{
+    float4 mat;
+    float2 translate;
+    char _m0_final_padding[8];
+};
+
+struct ParentBuf
+{
+    Transform_1 parent[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_96.scene[ix + 0u];
+    uint raw1 = v_96.scene[ix + 1u];
+    uint raw2 = v_96.scene[ix + 2u];
+    uint raw3 = v_96.scene[ix + 3u];
+    uint raw4 = v_96.scene[ix + 4u];
+    uint raw5 = v_96.scene[ix + 5u];
+    Transform s;
+    s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.translate = float2(as_type<float>(raw4), as_type<float>(raw5));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index)
+{
+    return TransformRef{ ref.offset + (index * 24u) };
+}
+
+static inline __attribute__((always_inline))
+Transform combine_monoid(thread const Transform& a, thread const Transform& b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+static inline __attribute__((always_inline))
+Transform monoid_identity()
+{
+    return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) };
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_71)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_71.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref, thread const TransformSeg& s, device Memory& v_71)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.mat.x);
+    write_mem(param, param_1, param_2, v_71);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.mat.y);
+    write_mem(param_3, param_4, param_5, v_71);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.mat.z);
+    write_mem(param_6, param_7, param_8, v_71);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.mat.w);
+    write_mem(param_9, param_10, param_11, v_71);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.translate.x);
+    write_mem(param_12, param_13, param_14, v_71);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = as_type<uint>(s.translate.y);
+    write_mem(param_15, param_16, param_17, v_71);
+}
+
+kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+{
+    threadgroup Transform sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) };
+    TransformRef param = ref;
+    Transform agg = Transform_read(param, v_96);
+    spvUnsafeArray<Transform, 8> local;
+    local[0] = agg;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3, v_96);
+        agg = combine_monoid(param_4, param_5);
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_6 = other;
+            Transform param_7 = agg;
+            agg = combine_monoid(param_6, param_7);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    Transform row = monoid_identity();
+    if (gl_WorkGroupID.x > 0u)
+    {
+        uint _380 = gl_WorkGroupID.x - 1u;
+        row.mat = _377.parent[_380].mat;
+        row.translate = _377.parent[_380].translate;
+    }
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        Transform param_8 = row;
+        Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_monoid(param_8, param_9);
+    }
+    Alloc param_12;
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_10 = row;
+        Transform param_11 = local[i_2];
+        Transform m = combine_monoid(param_10, param_11);
+        TransformSeg transform = TransformSeg{ m.mat, m.translate };
+        TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) };
+        param_12.offset = _278.conf.trans_alloc.offset;
+        TransformSegRef param_13 = trans_ref;
+        TransformSeg param_14 = transform;
+        TransformSeg_write(param_12, param_13, param_14, v_71);
+    }
+}
+
--- a/piet-gpu/shader/gen/transform_leaf.spv
+++ b/piet-gpu/shader/gen/transform_leaf.spv
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@ -0,0 +1,125 @@
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+ByteAddressBuffer _49 : register(t2);
+ByteAddressBuffer _161 : register(t1);
+RWByteAddressBuffer _251 : register(u3);
+RWByteAddressBuffer _267 : register(u0);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Transform sh_scratch[512];
+
+Transform Transform_read(TransformRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _49.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _49.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _49.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _49.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _49.Load((ix + 4u) * 4 + 0);
+    uint raw5 = _49.Load((ix + 5u) * 4 + 0);
+    Transform s;
+    s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.translate = float2(asfloat(raw4), asfloat(raw5));
+    return s;
+}
+
+TransformRef Transform_index(TransformRef ref, uint index)
+{
+    TransformRef _37 = { ref.offset + (index * 24u) };
+    return _37;
+}
+
+Transform combine_monoid(Transform a, Transform b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef _168 = { _161.Load(44) + (ix * 24u) };
+    TransformRef ref = _168;
+    TransformRef param = ref;
+    Transform agg = Transform_read(param);
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3);
+        agg = combine_monoid(param_4, param_5);
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+            Transform param_6 = agg;
+            Transform param_7 = other;
+            agg = combine_monoid(param_6, param_7);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        _251.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat));
+        _251.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate));
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
--- a/piet-gpu/shader/gen/transform_reduce.msl
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@ -0,0 +1,138 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct SceneBuf
+{
+    uint scene[1];
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+struct Transform_1
+{
+    float4 mat;
+    float2 translate;
+    char _m0_final_padding[8];
+};
+
+struct OutBuf
+{
+    Transform_1 outbuf[1];
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_49.scene[ix + 0u];
+    uint raw1 = v_49.scene[ix + 1u];
+    uint raw2 = v_49.scene[ix + 2u];
+    uint raw3 = v_49.scene[ix + 3u];
+    uint raw4 = v_49.scene[ix + 4u];
+    uint raw5 = v_49.scene[ix + 5u];
+    Transform s;
+    s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.translate = float2(as_type<float>(raw4), as_type<float>(raw5));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index)
+{
+    return TransformRef{ ref.offset + (index * 24u) };
+}
+
+static inline __attribute__((always_inline))
+Transform combine_monoid(thread const Transform& a, thread const Transform& b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _251 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+{
+    threadgroup Transform sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) };
+    TransformRef param = ref;
+    Transform agg = Transform_read(param, v_49);
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3, v_49);
+        agg = combine_monoid(param_4, param_5);
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+            Transform param_6 = agg;
+            Transform param_7 = other;
+            agg = combine_monoid(param_6, param_7);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        _251.outbuf[gl_WorkGroupID.x].mat = agg.mat;
+        _251.outbuf[gl_WorkGroupID.x].translate = agg.translate;
+    }
+}
+
--- a/piet-gpu/shader/gen/transform_reduce.spv
+++ b/piet-gpu/shader/gen/transform_reduce.spv
--- a/piet-gpu/shader/gen/transform_root.dxil
+++ b/piet-gpu/shader/gen/transform_root.dxil
--- a/piet-gpu/shader/gen/transform_root.hlsl
+++ b/piet-gpu/shader/gen/transform_root.hlsl
@ -0,0 +1,94 @@
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
+
+RWByteAddressBuffer _89 : register(u0);
+
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Transform sh_scratch[512];
+
+Transform combine_monoid(Transform a, Transform b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+Transform monoid_identity()
+{
+    return _23;
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    Transform _93;
+    _93.mat = asfloat(_89.Load4(ix * 32 + 0));
+    _93.translate = asfloat(_89.Load2(ix * 32 + 16));
+    Transform local[8];
+    local[0].mat = _93.mat;
+    local[0].translate = _93.translate;
+    Transform param_1;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        Transform param = local[i - 1u];
+        Transform _119;
+        _119.mat = asfloat(_89.Load4((ix + i) * 32 + 0));
+        _119.translate = asfloat(_89.Load2((ix + i) * 32 + 16));
+        param_1.mat = _119.mat;
+        param_1.translate = _119.translate;
+        local[i] = combine_monoid(param, param_1);
+    }
+    Transform agg = local[7];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_2 = other;
+            Transform param_3 = agg;
+            agg = combine_monoid(param_2, param_3);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    Transform row = monoid_identity();
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        row = sh_scratch[gl_LocalInvocationID.x - 1u];
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_4 = row;
+        Transform param_5 = local[i_2];
+        Transform m = combine_monoid(param_4, param_5);
+        uint _209 = ix + i_2;
+        _89.Store4(_209 * 32 + 0, asuint(m.mat));
+        _89.Store2(_209 * 32 + 16, asuint(m.translate));
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
--- a/piet-gpu/shader/gen/transform_root.msl
+++ b/piet-gpu/shader/gen/transform_root.msl
@ -0,0 +1,129 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Transform_1
+{
+    float4 mat;
+    float2 translate;
+    char _m0_final_padding[8];
+};
+
+struct DataBuf
+{
+    Transform_1 data[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+Transform combine_monoid(thread const Transform& a, thread const Transform& b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+static inline __attribute__((always_inline))
+Transform monoid_identity()
+{
+    return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) };
+}
+
+kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    threadgroup Transform sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    spvUnsafeArray<Transform, 8> local;
+    local[0].mat = _89.data[ix].mat;
+    local[0].translate = _89.data[ix].translate;
+    Transform param_1;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        uint _113 = ix + i;
+        Transform param = local[i - 1u];
+        param_1.mat = _89.data[_113].mat;
+        param_1.translate = _89.data[_113].translate;
+        local[i] = combine_monoid(param, param_1);
+    }
+    Transform agg = local[7];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_2 = other;
+            Transform param_3 = agg;
+            agg = combine_monoid(param_2, param_3);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    Transform row = monoid_identity();
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        row = sh_scratch[gl_LocalInvocationID.x - 1u];
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_4 = row;
+        Transform param_5 = local[i_2];
+        Transform m = combine_monoid(param_4, param_5);
+        uint _209 = ix + i_2;
+        _89.data[_209].mat = m.mat;
+        _89.data[_209].translate = m.translate;
+    }
+}
+
--- a/piet-gpu/shader/gen/transform_root.spv
+++ b/piet-gpu/shader/gen/transform_root.spv
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
--- a/piet-gpu/shader/path_coarse.spv
+++ b/piet-gpu/shader/path_coarse.spv
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@ -38,6 +38,12 @@ struct Config {
    Alloc pathseg_alloc;
    Alloc anno_alloc;
    Alloc trans_alloc;
+    // new element pipeline stuff follows
+
+    // Number of transforms in scene
+    uint n_trans;
+    // Offset (in bytes) of transform stream in scene buffer
+    uint trans_offset;
 };

 // Fill modes.
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/tile_alloc.spv
--- a/piet-gpu/shader/transform_leaf.comp
+++ b/piet-gpu/shader/transform_leaf.comp
@ -0,0 +1,86 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// A scan for a tree reduction prefix scan that outputs the final result.
+// Output is written into memory at trans_alloc.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "scene.h"
+#include "tile.h"
+
+#define Monoid Transform
+
+layout(set = 0, binding = 3) readonly buffer ParentBuf {
+    Monoid[] parent;
+};
+
+Monoid monoid_identity() {
+    return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
+}
+
+Monoid combine_monoid(Monoid a, Monoid b) {
+    Monoid c;
+    c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
+    c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
+    return c;
+}
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    Monoid local[N_ROWS];
+
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+    TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size);
+
+    Monoid agg = Transform_read(ref);
+    local[0] = agg;
+    for (uint i = 1; i < N_ROWS; i++) {
+        agg = combine_monoid(agg, Transform_read(Transform_index(ref, i)));
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1u << i)) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
+            agg = combine_monoid(other, agg);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    
+    barrier();
+    Monoid row = monoid_identity();
+    if (gl_WorkGroupID.x > 0) {
+        row = parent[gl_WorkGroupID.x - 1];
+    }
+    if (gl_LocalInvocationID.x > 0) {
+        row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+    }
+    for (uint i = 0; i < N_ROWS; i++) {
+        Monoid m = combine_monoid(row, local[i]);
+        TransformSeg transform = TransformSeg(m.mat, m.translate);
+        TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (ix + i) * TransformSeg_size);
+        TransformSeg_write(conf.trans_alloc, trans_ref, transform);
+    }
+}
--- a/piet-gpu/shader/transform_reduce.comp
+++ b/piet-gpu/shader/transform_reduce.comp
@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The reduction phase for transform scan implemented as a tree reduction.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "scene.h"
+
+#define Monoid Transform
+
+layout(set = 0, binding = 3) buffer OutBuf {
+    Monoid[] outbuf;
+};
+
+Monoid monoid_identity() {
+    return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
+}
+
+Monoid combine_monoid(Monoid a, Monoid b) {
+    Monoid c;
+    c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
+    c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
+    return c;
+}
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+    TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size);
+
+    Monoid agg = Transform_read(ref);
+    for (uint i = 1; i < N_ROWS; i++) {
+        agg = combine_monoid(agg, Transform_read(Transform_index(ref, i)));
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        // We could make this predicate tighter, but would it help?
+        if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
+            agg = combine_monoid(agg, other);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0) {
+        outbuf[gl_WorkGroupID.x] = agg;
+    }
+}
--- a/piet-gpu/shader/transform_scan.comp
+++ b/piet-gpu/shader/transform_scan.comp
@ -0,0 +1,89 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// A scan for a tree reduction prefix scan (either root or not, by ifdef).
+
+#version 450
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+// This is copy-pasted from scene.h. It might be better for DRY
+// to include it, but that pulls in more stuff we don't need.
+struct Transform {
+    vec4 mat;
+    vec2 translate;
+};
+
+#define Monoid Transform
+
+layout(binding = 0) buffer DataBuf {
+    Monoid[] data;
+};
+
+#ifndef ROOT
+layout(binding = 1) readonly buffer ParentBuf {
+    Monoid[] parent;
+};
+#endif
+
+Monoid monoid_identity() {
+    return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
+}
+
+Monoid combine_monoid(Monoid a, Monoid b) {
+    Monoid c;
+    c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
+    c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
+    return c;
+}
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    Monoid local[N_ROWS];
+
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+
+    // TODO: gate buffer read
+    local[0] = data[ix];
+    for (uint i = 1; i < N_ROWS; i++) {
+        local[i] = combine_monoid(local[i - 1], data[ix + i]);
+    }
+    Monoid agg = local[N_ROWS - 1];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1u << i)) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
+            agg = combine_monoid(other, agg);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    
+    barrier();
+    // This could be a semigroup instead of a monoid if we reworked the
+    // conditional logic, but that might impact performance.
+    Monoid row = monoid_identity();
+#ifdef ROOT
+    if (gl_LocalInvocationID.x > 0) {
+        row = sh_scratch[gl_LocalInvocationID.x - 1];
+    }
+#else
+    if (gl_WorkGroupID.x > 0) {
+        row = parent[gl_WorkGroupID.x - 1];
+    }
+    if (gl_LocalInvocationID.x > 0) {
+        row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+    }
+#endif
+    for (uint i = 0; i < N_ROWS; i++) {
+        Monoid m = combine_monoid(row, local[i]);
+        // TODO: gate buffer write
+        data[ix + i] = m;
+    }
+}
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -1,6 +1,7 @@
 mod gradient;
 mod pico_svg;
 mod render_ctx;
+pub mod stages;
 pub mod test_scenes;
 mod text;

@ -20,6 +21,8 @@ use piet_gpu_hal::{

 use pico_svg::PicoSvg;

+use crate::stages::Config;
+
 const TILE_W: usize = 16;
 const TILE_H: usize = 16;

@ -123,7 +126,7 @@ impl Renderer {
        let image_dev = session.create_image2d(width as u32, height as u32)?;

        // Note: this must be updated when the config struct size changes.
-        const CONFIG_BUFFER_SIZE: u64 = 40;
+        const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::<Config>() as u64;
        let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap();
        // TODO: separate staging buffer (if needed)
        let config_bufs = (0..n_bufs)
@ -295,25 +298,28 @@ impl Renderer {
        alloc += (n_paths * ANNO_SIZE + 3) & !3;
        let trans_base = alloc;
        alloc += (n_trans * TRANS_SIZE + 3) & !3;
-        let config = &[
-            n_paths as u32,
-            n_pathseg as u32,
-            width_in_tiles as u32,
-            height_in_tiles as u32,
-            tile_base as u32,
-            bin_base as u32,
-            ptcl_base as u32,
-            pathseg_base as u32,
-            anno_base as u32,
-            trans_base as u32,
-        ];
+        let trans_offset = 0; // For new element pipeline, not yet used
+        let config = Config {
+            n_elements: n_paths as u32,
+            n_pathseg: n_pathseg as u32,
+            width_in_tiles: width_in_tiles as u32,
+            height_in_tiles: height_in_tiles as u32,
+            tile_alloc: tile_base as u32,
+            bin_alloc: bin_base as u32,
+            ptcl_alloc: ptcl_base as u32,
+            pathseg_alloc: pathseg_base as u32,
+            anno_alloc: anno_base as u32,
+            trans_alloc: trans_base as u32,
+            n_trans: n_trans as u32,
+            trans_offset: trans_offset as u32,
+        };
        unsafe {
            let scene = render_ctx.get_scene_buf();
            self.n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
            // TODO: reallocate scene buffer if size is inadequate
            assert!(self.scene_bufs[buf_ix].size() as usize >= scene.len());
            self.scene_bufs[buf_ix].write(scene)?;
-            self.config_bufs[buf_ix].write(config)?;
+            self.config_bufs[buf_ix].write(&[config])?;
            self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;

            // Upload gradient data.
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@ -0,0 +1,209 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Stages for new element pipeline, exposed for testing.
+
+use bytemuck::{Pod, Zeroable};
+
+use piet::kurbo::Affine;
+use piet_gpu_hal::{
+    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+};
+
+/// The configuration block passed to piet-gpu shaders.
+///
+/// Note: this should be kept in sync with the version in setup.h.
+#[repr(C)]
+#[derive(Clone, Copy, Default, Zeroable, Pod)]
+pub struct Config {
+    pub n_elements: u32, // paths
+    pub n_pathseg: u32,
+    pub width_in_tiles: u32,
+    pub height_in_tiles: u32,
+    pub tile_alloc: u32,
+    pub bin_alloc: u32,
+    pub ptcl_alloc: u32,
+    pub pathseg_alloc: u32,
+    pub anno_alloc: u32,
+    pub trans_alloc: u32,
+    pub n_trans: u32,
+    pub trans_offset: u32,
+}
+
+// The individual stages will probably be separate files but for now, all in one.
+
+// This is equivalent to the version in piet-gpu-types, but the bytemuck
+// representation will likely be faster.
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct Transform {
+    pub mat: [f32; 4],
+    pub translate: [f32; 2],
+}
+
+const TRANSFORM_WG: u64 = 512;
+const TRANSFORM_N_ROWS: u64 = 8;
+const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS;
+
+pub struct TransformCode {
+    reduce_pipeline: Pipeline,
+    root_pipeline: Pipeline,
+    leaf_pipeline: Pipeline,
+}
+
+pub struct TransformStage {
+    // Right now we're limited to partition^2 (~16M) elements. This can be
+    // expanded but is tedious.
+    root_buf: Buffer,
+    root_ds: DescriptorSet,
+}
+
+pub struct TransformBinding {
+    reduce_ds: DescriptorSet,
+    leaf_ds: DescriptorSet,
+}
+
+impl TransformCode {
+    pub unsafe fn new(session: &Session) -> TransformCode {
+        let reduce_code = include_shader!(session, "../shader/gen/transform_reduce");
+        let reduce_pipeline = session
+            .create_compute_pipeline(
+                reduce_code,
+                &[
+                    BindType::Buffer,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::Buffer,
+                ],
+            )
+            .unwrap();
+        let root_code = include_shader!(session, "../shader/gen/transform_root");
+        let root_pipeline = session
+            .create_compute_pipeline(root_code, &[BindType::Buffer])
+            .unwrap();
+        let leaf_code = include_shader!(session, "../shader/gen/transform_leaf");
+        let leaf_pipeline = session
+            .create_compute_pipeline(
+                leaf_code,
+                &[
+                    BindType::Buffer,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                ],
+            )
+            .unwrap();
+        TransformCode {
+            reduce_pipeline,
+            root_pipeline,
+            leaf_pipeline,
+        }
+    }
+}
+
+impl TransformStage {
+    pub unsafe fn new(session: &Session, code: &TransformCode) -> TransformStage {
+        // We're limited to TRANSFORM_PART_SIZE^2
+        // Also note: size here allows padding
+        let root_buf_size = TRANSFORM_PART_SIZE * 32;
+        let root_buf = session
+            .create_buffer(root_buf_size, BufferUsage::STORAGE)
+            .unwrap();
+        let root_ds = session
+            .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf])
+            .unwrap();
+        TransformStage { root_buf, root_ds }
+    }
+
+    pub unsafe fn bind(
+        &self,
+        session: &Session,
+        code: &TransformCode,
+        config_buf: &Buffer,
+        scene_buf: &Buffer,
+        memory_buf: &Buffer,
+    ) -> TransformBinding {
+        let reduce_ds = session
+            .create_simple_descriptor_set(
+                &code.reduce_pipeline,
+                &[memory_buf, config_buf, scene_buf, &self.root_buf],
+            )
+            .unwrap();
+        let leaf_ds = session
+            .create_simple_descriptor_set(
+                &code.leaf_pipeline,
+                &[memory_buf, config_buf, scene_buf, &self.root_buf],
+            )
+            .unwrap();
+        TransformBinding { reduce_ds, leaf_ds }
+    }
+
+    pub unsafe fn record(
+        &self,
+        cmd_buf: &mut CmdBuf,
+        code: &TransformCode,
+        binding: &TransformBinding,
+        size: u64,
+    ) {
+        if size > TRANSFORM_PART_SIZE.pow(2) {
+            panic!("very large scan not yet implemented");
+        }
+        let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
+        if n_workgroups > 1 {
+            cmd_buf.dispatch(
+                &code.reduce_pipeline,
+                &binding.reduce_ds,
+                (n_workgroups as u32, 1, 1),
+                (TRANSFORM_WG as u32, 1, 1),
+            );
+            cmd_buf.memory_barrier();
+            cmd_buf.dispatch(
+                &code.root_pipeline,
+                &self.root_ds,
+                (1, 1, 1),
+                (TRANSFORM_WG as u32, 1, 1),
+            );
+            cmd_buf.memory_barrier();
+        }
+        cmd_buf.dispatch(
+            &code.leaf_pipeline,
+            &binding.leaf_ds,
+            (n_workgroups as u32, 1, 1),
+            (TRANSFORM_WG as u32, 1, 1),
+        );
+    }
+}
+
+impl Transform {
+    pub fn from_kurbo(a: Affine) -> Transform {
+        let c = a.as_coeffs();
+        Transform {
+            mat: [c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32],
+            translate: [c[4] as f32, c[5] as f32],
+        }
+    }
+
+    pub fn to_kurbo(self) -> Affine {
+        Affine::new([
+            self.mat[0] as f64,
+            self.mat[1] as f64,
+            self.mat[2] as f64,
+            self.mat[3] as f64,
+            self.translate[0] as f64,
+            self.translate[1] as f64,
+        ])
+    }
+}
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@ -6,9 +6,18 @@ description = "Tests for piet-gpu shaders and generic GPU capabilities."
 license = "MIT/Apache-2.0"
 edition = "2021"

+[features]
+default = ["piet-gpu"]
+
 [dependencies]
 clap = "2.33"
 bytemuck = "1.7.2"
+kurbo = "0.7.1"
+rand = "0.7.3"

 [dependencies.piet-gpu-hal]
 path = "../piet-gpu-hal"
+
+[dependencies.piet-gpu]
+path = "../piet-gpu"
+optional = true
--- a/tests/src/clear.rs
+++ b/tests/src/clear.rs
@ -55,12 +55,12 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
        commands.write_timestamp(0);
        stage.record(&mut commands, &code, &binding);
        commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            commands.cmd_buf.memory_barrier();
            commands.download(&out_buf);
        }
        total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            let mut dst: Vec<u32> = Default::default();
            out_buf.read(&mut dst);
            if let Some(failure) = verify(&dst) {
--- a/tests/src/config.rs
+++ b/tests/src/config.rs
@ -22,6 +22,7 @@ pub struct Config {
    pub groups: Groups,
    pub size: Size,
    pub n_iter: u64,
+    pub verify_all: bool,
 }

 pub struct Groups(String);
@ -40,10 +41,12 @@ impl Config {
            .value_of("n_iter")
            .and_then(|s| s.parse().ok())
            .unwrap_or(1000);
+        let verify_all = matches.is_present("verify_all");
        Config {
            groups,
            size,
            n_iter,
+            verify_all,
        }
    }
 }
--- a/tests/src/linkedlist.rs
+++ b/tests/src/linkedlist.rs
@ -48,12 +48,12 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
        commands.write_timestamp(0);
        stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
        commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            commands.cmd_buf.memory_barrier();
            commands.download(&mem_buf);
        }
        total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            let mut dst: Vec<u32> = Default::default();
            mem_buf.read(&mut dst);
            if !verify(&dst) {
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@ -25,6 +25,9 @@ mod prefix_tree;
 mod runner;
 mod test_result;

+#[cfg(feature = "piet-gpu")]
+mod transform;
+
 use clap::{App, Arg};
 use piet_gpu_hal::InstanceFlags;

@ -62,6 +65,11 @@ fn main() {
                .help("Number of iterations")
                .takes_value(true),
        )
+        .arg(
+            Arg::with_name("verify_all")
+                .long("verify_all")
+                .help("Verify all iterations"),
+        )
        .arg(
            Arg::with_name("dx12")
                .long("dx12")
@ -123,5 +131,9 @@ fn main() {
            }
            report(&linkedlist::run_linkedlist_test(&mut runner, &config));
        }
+        #[cfg(feature = "piet-gpu")]
+        if config.groups.matches("piet") {
+            report(&transform::transform_test(&mut runner, &config));
+        }
    }
 }
--- a/tests/src/prefix.rs
+++ b/tests/src/prefix.rs
@ -85,12 +85,12 @@ pub unsafe fn run_prefix_test(
        commands.write_timestamp(0);
        stage.record(&mut commands, &code, &binding);
        commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            commands.cmd_buf.memory_barrier();
            commands.download(&out_buf);
        }
        total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            let mut dst: Vec<u32> = Default::default();
            out_buf.read(&mut dst);
            if let Some(failure) = verify(&dst) {
--- a/tests/src/prefix_tree.rs
+++ b/tests/src/prefix_tree.rs
@ -66,12 +66,12 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
        commands.write_timestamp(0);
        stage.record(&mut commands, &code, &binding);
        commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            commands.cmd_buf.memory_barrier();
            commands.download(&out_buf);
        }
        total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
            let mut dst: Vec<u32> = Default::default();
            out_buf.read(&mut dst);
            if let Some(failure) = verify(&dst) {
--- a/tests/src/transform.rs
+++ b/tests/src/transform.rs
@ -0,0 +1,133 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Tests for piet-gpu shaders.
+
+use crate::{Config, Runner, TestResult};
+
+use kurbo::Affine;
+use piet_gpu::stages::{self, Transform, TransformCode, TransformStage};
+use piet_gpu_hal::BufferUsage;
+use rand::Rng;
+
+struct AffineTestData {
+    input_data: Vec<Transform>,
+    expected: Vec<Affine>,
+}
+
+pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult {
+    let mut result = TestResult::new("transform");
+    let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24);
+    // TODO: would be nice to validate with real transform.
+    let data = AffineTestData::new(n_elements as usize);
+    let data_buf = runner
+        .session
+        .create_buffer_init(&data.input_data, BufferUsage::STORAGE)
+        .unwrap();
+    let memory = runner.buf_down(data_buf.size() + 24, BufferUsage::empty());
+    let stage_config = stages::Config {
+        n_trans: n_elements as u32,
+        // This is a hack to get elements aligned.
+        trans_alloc: 16,
+        ..Default::default()
+    };
+    let config_buf = runner
+        .session
+        .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE)
+        .unwrap();
+
+    let code = TransformCode::new(&runner.session);
+    let stage = TransformStage::new(&runner.session, &code);
+    let binding = stage.bind(
+        &runner.session,
+        &code,
+        &config_buf,
+        &data_buf,
+        &memory.dev_buf,
+    );
+    let mut total_elapsed = 0.0;
+    let n_iter = config.n_iter;
+    for i in 0..n_iter {
+        let mut commands = runner.commands();
+        commands.write_timestamp(0);
+        stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
+        commands.write_timestamp(1);
+        if i == 0 || config.verify_all {
+            commands.cmd_buf.memory_barrier();
+            commands.download(&memory);
+        }
+        total_elapsed += runner.submit(commands);
+        if i == 0 || config.verify_all {
+            let mut dst: Vec<Transform> = Default::default();
+            memory.read(&mut dst);
+            if let Some(failure) = data.verify(&dst[1..]) {
+                result.fail(failure);
+            }
+        }
+    }
+    result.timing(total_elapsed, n_elements * n_iter);
+    result
+}
+
+impl AffineTestData {
+    fn new(n: usize) -> AffineTestData {
+        let mut rng = rand::thread_rng();
+        let mut a = Affine::default();
+        let mut b;
+        let mut input_data = Vec::with_capacity(n);
+        let mut expected = Vec::with_capacity(n);
+        for _ in 0..n {
+            loop {
+                b = Affine::new([
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                ]);
+                if b.determinant() >= 1.0 {
+                    break;
+                }
+            }
+            expected.push(b);
+            let c = a.inverse() * b;
+            input_data.push(Transform::from_kurbo(c));
+            a = b;
+        }
+        AffineTestData {
+            input_data,
+            expected,
+        }
+    }
+
+    fn verify(&self, actual: &[Transform]) -> Option<String> {
+        for (i, (actual, expected)) in actual.iter().zip(&self.expected).enumerate() {
+            let error: f64 = actual
+                .to_kurbo()
+                .as_coeffs()
+                .iter()
+                .zip(expected.as_coeffs())
+                .map(|(actual, expected)| (actual - expected).powi(2))
+                .sum();
+            let tolerance = 1e-6 * (i + 1) as f64;
+            if error > tolerance {
+                return Some(format!("{}: {} {}", i, error, tolerance));
+            }
+        }
+        None
+    }
+}