From 47f8812e2fbfc2b449ef8226647888b3da2981de Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Tue, 23 Nov 2021 07:28:50 -0800
Subject: [PATCH 1/2] Start work on new element pipeline

There's a bit of reorganizing as well. Shader stages are made available
from piet-gpu to the test rig, config is now a proper structure
(marshaled with bytemuck).

This commit just has the transform stage, which is a simple monoid scan
of affine transforms.

Progress toward #119
---
 Cargo.lock                                |  18 ++
 piet-gpu/Cargo.toml                       |   1 +
 piet-gpu/shader/backdrop.spv              | Bin 12232 -> 12328 bytes
 piet-gpu/shader/backdrop_lg.spv           | Bin 12264 -> 12360 bytes
 piet-gpu/shader/binning.spv               | Bin 16012 -> 16108 bytes
 piet-gpu/shader/build.ninja               |  35 ++-
 piet-gpu/shader/coarse.spv                | Bin 63796 -> 63892 bytes
 piet-gpu/shader/gen/transform_leaf.dxil   | Bin 0 -> 5664 bytes
 piet-gpu/shader/gen/transform_leaf.hlsl   | 219 +++++++++++++++++
 piet-gpu/shader/gen/transform_leaf.msl    | 272 ++++++++++++++++++++++
 piet-gpu/shader/gen/transform_leaf.spv    | Bin 0 -> 12216 bytes
 piet-gpu/shader/gen/transform_reduce.dxil | Bin 0 -> 4696 bytes
 piet-gpu/shader/gen/transform_reduce.hlsl | 125 ++++++++++
 piet-gpu/shader/gen/transform_reduce.msl  | 138 +++++++++++
 piet-gpu/shader/gen/transform_reduce.spv  | Bin 0 -> 7568 bytes
 piet-gpu/shader/gen/transform_root.dxil   | Bin 0 -> 4824 bytes
 piet-gpu/shader/gen/transform_root.hlsl   |  94 ++++++++
 piet-gpu/shader/gen/transform_root.msl    | 129 ++++++++++
 piet-gpu/shader/gen/transform_root.spv    | Bin 0 -> 5280 bytes
 piet-gpu/shader/kernel4.spv               | Bin 38684 -> 38780 bytes
 piet-gpu/shader/path_coarse.spv           | Bin 43040 -> 43136 bytes
 piet-gpu/shader/setup.h                   |   6 +
 piet-gpu/shader/tile_alloc.spv            | Bin 14788 -> 14884 bytes
 piet-gpu/shader/transform_leaf.comp       |  86 +++++++
 piet-gpu/shader/transform_reduce.comp     |  69 ++++++
 piet-gpu/shader/transform_scan.comp       |  89 +++++++
 piet-gpu/src/lib.rs                       |  34 +--
 piet-gpu/src/stages.rs                    | 209 +++++++++++++++++
 tests/Cargo.toml                          |   9 +
 tests/src/clear.rs                        |   4 +-
 tests/src/config.rs                       |   3 +
 tests/src/linkedlist.rs                   |   4 +-
 tests/src/main.rs                         |  12 +
 tests/src/prefix.rs                       |   4 +-
 tests/src/prefix_tree.rs                  |   4 +-
 tests/src/transform.rs                    | 133 +++++++++++
 36 files changed, 1674 insertions(+), 23 deletions(-)
 create mode 100644 piet-gpu/shader/gen/transform_leaf.dxil
 create mode 100644 piet-gpu/shader/gen/transform_leaf.hlsl
 create mode 100644 piet-gpu/shader/gen/transform_leaf.msl
 create mode 100644 piet-gpu/shader/gen/transform_leaf.spv
 create mode 100644 piet-gpu/shader/gen/transform_reduce.dxil
 create mode 100644 piet-gpu/shader/gen/transform_reduce.hlsl
 create mode 100644 piet-gpu/shader/gen/transform_reduce.msl
 create mode 100644 piet-gpu/shader/gen/transform_reduce.spv
 create mode 100644 piet-gpu/shader/gen/transform_root.dxil
 create mode 100644 piet-gpu/shader/gen/transform_root.hlsl
 create mode 100644 piet-gpu/shader/gen/transform_root.msl
 create mode 100644 piet-gpu/shader/gen/transform_root.spv
 create mode 100644 piet-gpu/shader/transform_leaf.comp
 create mode 100644 piet-gpu/shader/transform_reduce.comp
 create mode 100644 piet-gpu/shader/transform_scan.comp
 create mode 100644 piet-gpu/src/stages.rs
 create mode 100644 tests/src/transform.rs

diff --git a/Cargo.lock b/Cargo.lock
index e65ac2f..737c033 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -96,6 +96,20 @@ name = "bytemuck"
 version = "1.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b"
+dependencies = [
+ "bytemuck_derive",
+]
+
+[[package]]
+name = "bytemuck_derive"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e215f8c2f9f79cb53c8335e687ffd07d5bfcb6fe5fc80723762d0be46e7cc54"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
 
 [[package]]
 name = "byteorder"
@@ -872,6 +886,7 @@ dependencies = [
 name = "piet-gpu"
 version = "0.1.0"
 dependencies = [
+ "bytemuck",
  "clap",
  "ndk",
  "ndk-glue",
@@ -920,7 +935,10 @@ version = "0.1.0"
 dependencies = [
  "bytemuck",
  "clap",
+ "kurbo",
+ "piet-gpu",
  "piet-gpu-hal",
+ "rand",
 ]
 
 [[package]]
diff --git a/piet-gpu/Cargo.toml b/piet-gpu/Cargo.toml
index f8f5c0a..faaffbd 100644
--- a/piet-gpu/Cargo.toml
+++ b/piet-gpu/Cargo.toml
@@ -33,6 +33,7 @@ roxmltree = "0.13"
 winit = "0.25"
 clap = "2.33"
 swash = "0.1.4"
+bytemuck = { version = "1.7.2", features = ["derive"] }
 
 [target.'cfg(target_os = "android")'.dependencies]
 ndk = "0.3"
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 450c91681a11baa22490f4e841aaac7a1de8ba29..870abe46716f3ff7fd830656bd0edcdd8713d629 100644
GIT binary patch
delta 118
zcmX>Rzan9S0lPjM11rOJ1_lN$AkK>~DN4*MW?*Ar2a9n7#lRx*`DtmzsU<+}W_@;L
r7G;DMWuTY_5PLw40cin=>1_7pj$~%!o4lM)mz51Dz_$4!-**K7htn1-

delta 28
kcmZ3Ha3X$#0sH0zb|se0tGGj$8F?oE=hNM6!v9490GPW8`Tzg`

diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv
index 859e3e7b2daea79ac7851017389d2f9ae42e91b1..a8b1fd9ee71a1ce02aa2b9ba68112df551511a47 100644
GIT binary patch
delta 118
zcmaD6e<ERn0lPjM11rOJ1_lN$AkK>~DN4*MW?*Ar2a9n7#lRx*`DtmzsU<+}W_@;L
r7G;DMWuTY_5PLw40cin=>1_7pj$~%!o4lM)mz51Dz_$4!pSU6bqf`})

delta 28
kcmX?+@FISL0sH0zb|se0tGGj$8F?oE=hNM6!Y`r-0G+%ETL1t6

diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 516cc1d1428f71e86ae8529f9f726ccc040cf99a..669585af39609a604db0e9144ba0a37db4d1c4b9 100644
GIT binary patch
delta 117
zcmeCFeN(%DQ=g52mEjiy0|OTj=f#&4CFT_~uraWM#khfDV3GLzw6x;X5+HZ;e~$Ak
q$_OpWKrsy<_JA4#(gG6G*?f|BCo?18<Yqx#RyLpj+vc@`_jLd|_Zdk5

delta 27
jcmaD;+f%!MbF%^GIhM_8d|Q|qc_!Z$)ZHu~bVmmOn{NsZ

diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index c1fcc92..777a77f 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -1,12 +1,27 @@
 # Build file for shaders.
 
-# You must have glslangValidator in your path, or patch here.
+# You must have Vulkan tools in your path, or patch here.
 
 glslang_validator = glslangValidator
+spirv_cross = spirv-cross
+dxc = dxc
+
+# See https://github.com/KhronosGroup/SPIRV-Cross/issues/1248 for
+# why we set this.
+msl_flags = --msl-decoration-binding
 
 rule glsl
   command = $glslang_validator $flags -V -o $out $in
 
+rule hlsl
+  command = $spirv_cross --hlsl $in --output $out
+
+rule dxil
+  command = $dxc -T cs_6_0 $in -Fo $out
+
+rule msl
+  command = $spirv_cross --msl $in --output $out $msl_flags
+
 
 build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 
@@ -24,3 +39,21 @@ build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
+
+# New element pipeline follows
+
+build gen/transform_reduce.spv: glsl transform_reduce.comp | scene.h setup.h mem.h
+build gen/transform_reduce.hlsl: hlsl gen/transform_reduce.spv
+build gen/transform_reduce.dxil: dxil gen/transform_reduce.hlsl
+build gen/transform_reduce.msl: msl gen/transform_reduce.spv
+
+build gen/transform_root.spv: glsl transform_scan.comp
+  flags = -DROOT
+build gen/transform_root.hlsl: hlsl gen/transform_root.spv
+build gen/transform_root.dxil: dxil gen/transform_root.hlsl
+build gen/transform_root.msl: msl gen/transform_root.spv
+
+build gen/transform_leaf.spv: glsl transform_leaf.comp | scene.h tile.h setup.h mem.h
+build gen/transform_leaf.hlsl: hlsl gen/transform_leaf.spv
+build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl
+build gen/transform_leaf.msl: msl gen/transform_leaf.spv
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index eabb301f48187cb340e790909ffbfc9ca9ed54f2..4167197dcb99a9c41b8676d486a5596374177471 100644
GIT binary patch
delta 121
zcmdn;iFwLr<_!<j_1PF$8Sb$#FmN$2FyzIT6eZ>rGq5qRgT=UkVqlT@{Is;<)Dj?f
w^L=%FK4pX!WuTY_5PLw40cin=>1^gSv*lvso9ynO%f<#2U}M-^<Zx&Q0Mr^8YybcN

delta 30
mcmbR8nR&}6<_!<jHw$R!@oi2sv*2RnnLOV?ck>m813Li5;0%rc

diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
new file mode 100644
index 0000000000000000000000000000000000000000..bc4f9410cb194343136a58f29bf2f62a21114b42
GIT binary patch
literal 5664
zcmeHLdsI_by5HxKlM_h735gmA!U+f>q4tC~f=phZD7H~iu{w8xh(!w;kuo5zxhEkc
zDaINQEovXZN5{6#h_6x9ItgG!MT<&jxNS$F;-g(^xs2DVz23QZpFmuW_pWtkt$Y8t
zf6QKMpZ)#5@7v$moA2BEcY>ivr!W8f)y~WF-Zn=+s9hXZ)4db{0MHWw;KMN+_5#?K
z!(I-11T?0Q02pCkn3vDM;d#qnI7=PC-1z!S*tl>pa{%g@=QI9V_z&24FRD-gxV3N{
zhh08C!T~^&zXx*wHqOqqfd9*S4BEIJ&~tlxu?6&jIGk-^p|+5MjwZkZ-7v7BD8GPv
z0R?SO2mo|??+v98fYVSHY^D9{kwP2a64RvZ!Se|!DYF@|31fL=SR}rA1uM%%b<M%L
zeN>VxgOtw=N?PEY?UvE{Jm-E8mZTe=Ep|6WcN?|yEv}*Z+l~0NJYqiFG`BW|Ymoc%
z@HOZGk$r>{F)z4Alsw@?#Lce3w9hyZ%myGI&f)edwVHHv0cjT?2zZAE;1#G#ik>Z?
zS2p0=qW9V0<Cd+>EekDZbOqb?6;L(FKtLjd#HvIjUb>TUffkQ@XAG!MqHXcp+i1gE
z?}E1KFU?P-kE!M;^m02DZ{C1w)sbsRZDYFVy`L3_D4(j7zv&*1Ik6?kM(Osq-cuTm
z#q(xD`9RZTGNWKu^qkgbqTOVLJX+VB_G!G6<T*!IUz5%696mH+pJz)GN|gDEMKXg8
zHPpQeWN^tqDy#H5s!WcwGDmG024&G9qqw3^yb(S@$VP@=>EUmn`4w*Q`hZXq5Z0Uc
zoI@Y+{R?i@8vaHTzp9V8L-DJ}JWXO4Va)RmG<o>Yr(#eH<BUh}Yi5KP+{&0-1GvK?
ztyz<6dP@8S$1M_g-BS^i_KN_p%GXg`GY)|ML!4v-z@8mU`AdaU2y3Hh087c=<fBFn
zfc4~W%<y3r(Bps(=IT-cds1YpJWNvHb_wEUeSzzL;OWdbYZSPEOfPB44U_4RXxX5q
zNk*GBK3SQjX_9QE#?se{&@(a3CW7tc!q>w3aKJGfD{$wbQbVe|BzQv>c38D-_uRS-
z*v|DGh_)>_Yj<5!!it2TT?s$!>af(R=WYltnfp%^F2!v-Y5FozOsA2!wzzz)mCaDt
zF=ZV2#`z)D`BhdnTZ-dfyQzyhMz04`&aPZ*PMcCcN_F*7;GIJ|ckdRUp%t^`vC;e5
zl&q*8n<vGu+!}dXnyh7ah=S)Y#nmZ)d>c`q<(Re<Fxm{8$ZNL@o)l?M6wA}L8kQC9
z`UY=X7)Lr&w3?4aU2<U;VI?s&T`xLE5bR>Y)j2%eIeK>B#uuFfXS+x5UAz-VlDY8(
zuT4+<ar1+UMAw7OTgu;s%PzEydEUEn=j(u@^W4-i$A{&=4frk3?uL%nE)F}L7aBUY
zv~9n9l_{wU-n)A@P0Kf@oX9-YAwZj(W^eFo!X>AY&b<1&6U-3-dPLo(>iV}`d+WBT
zO6I?JCola3l7*$r#!`#0v@$Hc1`~+g8SeT_PyLMc`YdmKw&oG|r1Ohaf4tM2w-~y|
zlApSVItI31xV-=Kj(GxS+{&8i)y3u2rIib<d&FTMUW#}_zI5mE>x!je-?V^fdi&kz
zC29Y<cxw9MwDj1Xgr=d6zT3X-ypE2~uQptTAui7ooS6Y*l&-C+tTnD%y|!w@LTl1J
zo&9do<HdLLmc*r}$M$hohrdSV!!em;Bf5toR>_c=PJM>0%cW!awKe<=t?Pw%K*$41
z{D>7N3w4p5zR6eGB33axzgxYc7n|9mpXtJ8_SU$n%=|i%Upw1jHuE={vFaL!nRbNr
zO2qfw;vptvEfaEUL_A^?CsW;%Zzv-Ml#@G)BUdr+WSyzc>eFW)!?MrRXWOu>-Vs@~
ziC<U7PcL&+!YAu6yB%f|zdD{@{e%a@{*rb4f^@8xJ1ScpwXK%Kha=*LgW`YjitjVn
zik`NJ;kJlg|MT18h>OaIZp-A0#l#=n`ph1EW~={svMZxSpW(!^TDo3^FMVds>c_I$
zyM*0J{w6cFG2T(lI>N5%#ZLx_VOo55ka$dqN4?@9cgT;%kVj0tr3Z=nR>Qrhvv@Km
zKxzQ%X#mu4<e3S}&4_>hVC(JTpX4I^ipQ6*D6;y?%j;?tr0|q!`e{|(YbR8LLiG2U
zKV?N->UyLOC*RlBFZwk5{)fA7j;2xas&mV$m$=?vyIY&Q{Oq5fAXF|X2}4u;GQ2Yj
zXLtu=PE-iKGC#d?i(fdOl4NB{hM<Ojjnf26kOBjl(xn^Fq;OO6hK*Y@p7c~Kv-sh4
z3B2^hIhEh!xkc~&nEjV31I{?>2iPY4OsI_0FyI>81Z=al3upT;W+*=<Z2QGcGpb9m
z2R&<5RX=bX*&`a`%t}~EFLGu()CV|oHMIRroGBmU%<ku$x#GWYrX=PC&Q$;3aweCz
zq-dv~Go>+*Go>-(oaus`*<C=neL@YY1NEfO&5{Aqq)Xlf!iy|T#+JrYwjG(MZC9QB
z(nswM_cN-@-k`B0&!S|vwBfiV{?zJ|&jfYkg<l9($Hql<R20&rmEPa#=4+h^b1c~q
zkk{O)l({t}w}pACNRxcc7;IB2xa0;=7hl`<u`@L89~;ukqmLCk$^5Sm?4hG&^s&nl
zRu>J1C7K?YmqwZYpB!p3JuDnlZsm^1czcN`1=MqT71oB1ePD5(^3g;M;Jz@q#tgs1
z`acahVCvB$2;8o}dEMtxlpP0bA_dEj<4@U({265PNPupR3HQhwJ&6}RR1jschk<LW
zfO9wEG5M*P<I7w)_kYWmkg|?y<-(wn4GN;0hJ2Z8dWkF*&Dy2T;xZEI4T{jbrDPeD
zy9Tqb;+CUvWZ#iu<;M-Q-M%N@q&HU3Q-VPp>Co(XCGm2FwQrgfH|(P6>voI6F2dU?
z{$xilv6nvk+t>u%`#-Ts`wx(bS~{*?xUKzsf5%nGB-~Gfzw^Z3dE&q2iDDXnAK}TC
z;`0(ISSKN{_ru>4K$#30<i<vXH@`A=zMx?~-LQb+g&WTx`xV9_UWV~3vfrqkumEc)
z%)ng|c!hp<lRN1#gaBAg)M4%Zot7|<&#6UOT5hN?tuJMW)yxVvph==hvHa)Z`z+xu
z`4m7}wU3C__lP&!i5P-_Q{8>qeF3podyGhyM0UD53v3Q%FC!gN_Qv-bYRKfE7u74^
z2RKRePj*`>xBJR;pwOBxKnfXR#+X4Cwa+3wz!~~Ad}iD^s$DBP#_3vo#JH{{{tPm~
z>kn`wpevl#<D1aS>15XfI$1)$W#WK8-k$@y{)9p8q+w1Mf#1*cMq>A;c_Y#LVczNM
z0Jg;3a9d;=2|Cn%1`(Q&i96C+ZDM!lD3R>uEiJlXB91b_Z`!iTHTa*v`;v$4h@>1d
z=bh{#3~rG(Y)Gu2e5qi};j3|n0*skU2@64lQS8(3yvSV+lz}rk#)x>3kszpK39kyr
zCV+5o2pZ`<$U(iylF;L^rm;aL?JOY$a34E|zigm>aAKYm_1Mj<1jB)r@eC*@J)`>?
zuG&C+%p_AcKs1!_z%K_IjO51R6eURT@#Ja^pm1zX<gY9;)GVJzvMZ{Pys?m<^4DiH
zm?F54k^c|~InowUsAL>4WCl5~ft^VVEM#Zm1IcXat!-N;f3&HW#GVdFh|$ASgu^9s
z<UQw`)$Xm+O6aH<>WSRqJx}5}dGv_-P0?`4m*G9<KTLIR)xHI--+`Qo!jZ!|@n8vp
zd~{nhG!nY5{fsiggo<~h+w=w9zEQi`EyygoVHPK=`Ii(|EOGZt11z%-k*Cp(Wxy^X
z(QvjHTTw$mHPy_ittH@~xEv+AM3#Kkmsn5X<E6<q<a}#5j?G&xNKx*WA_q;H%^#3<
zk&{-i32Sv1r=mr^_K7(XAxu~rXt-x<c^z-J`d;ab=7O#TYrYqJ@)2p`>6g268JO*x
zpe1?Yf>jc4xV<A|GkiUH5;@S5xUWB&O%+zRP4vM^sDg=RU~Yd!AJeOi$cUv+d``pD
z1naV7Ft@Le4~)=wY|Pka^&6w7L*v+r+lL!oV{<R(n!7~HHN;UNPf|G5z%wgLC&9(j
zTTYYlAa6pI@;C=ITR!JtY>o092Tfak&%xA9%G(@dw_N35?p|eSIC8MKb<1@QF8a0d
zI0sK|`3ge&A`>08ty7azZLw@}1z}H%QixHt17`F(%qX4Y-YOwCxio_%ILhU<NPg0?
zskaU${xNhDiib;3BuYv?X*K)K<3Tw`ac-m3eQx)-PuvB|T;c~2F9W9qftO6;!+#Dx
zt}fTKqiO(;PJ$4Ff8$mw13`B@&H>iyw?xS>Q#BjO$k&qPm~d5?ODR_~u$=38IV*0Z
z*%g0@p6NC1Kr*eU=np4@im>szZVb5@zhy8T*0m``OXf+5;uMSgUGSJRZRH+seIgn^
z6>EOhcPrKUh&vTmab*TFAZ0&@)es5b2eYEYgL=d#)Er>Pz$p&g@o>N-KCQMZ`jrOw
zdTqY-4aW!@^i1DqfMrF;#D*GyE5Tq$$c4}pGmV3D3Jh~N=!toSgKG*6jT{_|QFE|y
zg`qJIE$X7;QaSkP8biV?2#s->5DGu@YI0V=Jl&UIPvukK$(sZ7v;*cT66Q&}@}-k^
z*a=Twr~+EkptT%Y2gXj`9+Cg#Ek#Q`s5i8V+h(Gczw#wu9<#(j%MMf60Zo5?rlYhL
zYaGajS7Po%L(vU3x+W=cou(;Uxg-6g^F!Gs#qyP}3zJwa!Snzk%u8ePm7vR&pHp2h
zs@~hYb@KL2iJ!o$A~tI7^rx+r7Xmi~YT=5k_bI$5rbgA}Je5DXaNZf>za&thf2re_
zTs3dex3<?7l)?+GT&ThD3C_lTO2a#_XR30U?lZ4bi@};I`ffC#CbYYeQXLLe9jTNa
z$vqh4lBYn{vgA%byl1NKAbB)NbyTOCrCdflpL8clwJEf}(WpAUK=r%I2}c@^1tv92
zKmOfR-*3nhm8ug~)g<Mz#^;lMs8o4Eha26hj%L-VO39IP?!csT(>tOQe7_}6A5op|
XRHgC1N9%{|^b%MkmZfOm|DgW~%X?MG

literal 0
HcmV?d00001

diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl
new file mode 100644
index 0000000..80b5434
--- /dev/null
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@@ -0,0 +1,219 @@
+struct Alloc
+{
+    uint offset;
+};
+
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct TransformSegRef
+{
+    uint offset;
+};
+
+struct TransformSeg
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
+
+RWByteAddressBuffer _71 : register(u0);
+ByteAddressBuffer _96 : register(t2);
+ByteAddressBuffer _278 : register(t1);
+ByteAddressBuffer _377 : register(t3);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Transform sh_scratch[512];
+
+Transform Transform_read(TransformRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _96.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _96.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _96.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _96.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _96.Load((ix + 4u) * 4 + 0);
+    uint raw5 = _96.Load((ix + 5u) * 4 + 0);
+    Transform s;
+    s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.translate = float2(asfloat(raw4), asfloat(raw5));
+    return s;
+}
+
+TransformRef Transform_index(TransformRef ref, uint index)
+{
+    TransformRef _85 = { ref.offset + (index * 24u) };
+    return _85;
+}
+
+Transform combine_monoid(Transform a, Transform b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+Transform monoid_identity()
+{
+    return _224;
+}
+
+bool touch_mem(Alloc alloc, uint offset)
+{
+    return true;
+}
+
+void write_mem(Alloc alloc, uint offset, uint val)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    _71.Store(offset * 4 + 8, val);
+}
+
+void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = asuint(s.mat.x);
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.mat.y);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.mat.z);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.mat.w);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.translate.x);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = asuint(s.translate.y);
+    write_mem(param_15, param_16, param_17);
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef _285 = { _278.Load(44) + (ix * 24u) };
+    TransformRef ref = _285;
+    TransformRef param = ref;
+    Transform agg = Transform_read(param);
+    Transform local[8];
+    local[0] = agg;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3);
+        agg = combine_monoid(param_4, param_5);
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_6 = other;
+            Transform param_7 = agg;
+            agg = combine_monoid(param_6, param_7);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    Transform row = monoid_identity();
+    if (gl_WorkGroupID.x > 0u)
+    {
+        Transform _383;
+        _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0));
+        _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16));
+        row.mat = _383.mat;
+        row.translate = _383.translate;
+    }
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        Transform param_8 = row;
+        Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_monoid(param_8, param_9);
+    }
+    Alloc param_12;
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_10 = row;
+        Transform param_11 = local[i_2];
+        Transform m = combine_monoid(param_10, param_11);
+        TransformSeg _423 = { m.mat, m.translate };
+        TransformSeg transform = _423;
+        TransformSegRef _433 = { _278.Load(36) + ((ix + i_2) * 24u) };
+        TransformSegRef trans_ref = _433;
+        Alloc _437;
+        _437.offset = _278.Load(36);
+        param_12.offset = _437.offset;
+        TransformSegRef param_13 = trans_ref;
+        TransformSeg param_14 = transform;
+        TransformSeg_write(param_12, param_13, param_14);
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl
new file mode 100644
index 0000000..6229b25
--- /dev/null
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@@ -0,0 +1,272 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct TransformSegRef
+{
+    uint offset;
+};
+
+struct TransformSeg
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+struct SceneBuf
+{
+    uint scene[1];
+};
+
+struct Alloc_1
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc_1 tile_alloc;
+    Alloc_1 bin_alloc;
+    Alloc_1 ptcl_alloc;
+    Alloc_1 pathseg_alloc;
+    Alloc_1 anno_alloc;
+    Alloc_1 trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+struct Transform_1
+{
+    float4 mat;
+    float2 translate;
+    char _m0_final_padding[8];
+};
+
+struct ParentBuf
+{
+    Transform_1 parent[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_96.scene[ix + 0u];
+    uint raw1 = v_96.scene[ix + 1u];
+    uint raw2 = v_96.scene[ix + 2u];
+    uint raw3 = v_96.scene[ix + 3u];
+    uint raw4 = v_96.scene[ix + 4u];
+    uint raw5 = v_96.scene[ix + 5u];
+    Transform s;
+    s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.translate = float2(as_type<float>(raw4), as_type<float>(raw5));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index)
+{
+    return TransformRef{ ref.offset + (index * 24u) };
+}
+
+static inline __attribute__((always_inline))
+Transform combine_monoid(thread const Transform& a, thread const Transform& b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+static inline __attribute__((always_inline))
+Transform monoid_identity()
+{
+    return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) };
+}
+
+static inline __attribute__((always_inline))
+bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
+{
+    return true;
+}
+
+static inline __attribute__((always_inline))
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_71)
+{
+    Alloc param = alloc;
+    uint param_1 = offset;
+    if (!touch_mem(param, param_1))
+    {
+        return;
+    }
+    v_71.memory[offset] = val;
+}
+
+static inline __attribute__((always_inline))
+void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref, thread const TransformSeg& s, device Memory& v_71)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = as_type<uint>(s.mat.x);
+    write_mem(param, param_1, param_2, v_71);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = as_type<uint>(s.mat.y);
+    write_mem(param_3, param_4, param_5, v_71);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.mat.z);
+    write_mem(param_6, param_7, param_8, v_71);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.mat.w);
+    write_mem(param_9, param_10, param_11, v_71);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.translate.x);
+    write_mem(param_12, param_13, param_14, v_71);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = as_type<uint>(s.translate.y);
+    write_mem(param_15, param_16, param_17, v_71);
+}
+
+kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+{
+    threadgroup Transform sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) };
+    TransformRef param = ref;
+    Transform agg = Transform_read(param, v_96);
+    spvUnsafeArray<Transform, 8> local;
+    local[0] = agg;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3, v_96);
+        agg = combine_monoid(param_4, param_5);
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_6 = other;
+            Transform param_7 = agg;
+            agg = combine_monoid(param_6, param_7);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    Transform row = monoid_identity();
+    if (gl_WorkGroupID.x > 0u)
+    {
+        uint _380 = gl_WorkGroupID.x - 1u;
+        row.mat = _377.parent[_380].mat;
+        row.translate = _377.parent[_380].translate;
+    }
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        Transform param_8 = row;
+        Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_monoid(param_8, param_9);
+    }
+    Alloc param_12;
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_10 = row;
+        Transform param_11 = local[i_2];
+        Transform m = combine_monoid(param_10, param_11);
+        TransformSeg transform = TransformSeg{ m.mat, m.translate };
+        TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) };
+        param_12.offset = _278.conf.trans_alloc.offset;
+        TransformSegRef param_13 = trans_ref;
+        TransformSeg param_14 = transform;
+        TransformSeg_write(param_12, param_13, param_14, v_71);
+    }
+}
+
diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv
new file mode 100644
index 0000000000000000000000000000000000000000..ec47a9f94f1f0e8034426f89715ba91137d43fb6
GIT binary patch
literal 12216
zcmZ{p37A|}mB(MwOPYNN1Z7F82}^(kNC<&UWJv^@XdofM5fQDfbXU?P-CeP|Itu~@
z46YavH9CR_C=wJ|f(x6=pfanYvpPB(-{<W6>NcY@zu&8Ox^KUcC+EBW|NoqO?zzjm
z^*VOVUN|SqW@QIwec9J$W%+YdHX9|&4#|4+c3|D6b!ShshtFPq{<#Jmo^|DM#vDPv
z9@=hftkxLUaR`oAVomHwa_Bos$81OHSQj~<+4Bh9j6SfgI<S8Iz`Dz;L)*4hC)&G4
z>($Z5c)dE@Xw`?>jqP>g`jWWD_|WL&a2n7<L|TWO$z`iYN1H>my;LgqX>QpvQEz7`
z2Q!~6Yd0r{Myg}=u|*T@*5puoN#1G6<kF?9CYP=<t~)!FxLT1*e%OpWhh)dVceEPq
z`b*}?Q{}l_k2KfOGjerhbK%=-qr~=R^T5}%YU2}Inys;o^)1kRF6uthb2%(K0X`L5
zZHy1scgBpSWKv8`tcQJgIel8Bbxy~F^Io(){I5bg)e-D&s`-r7+VI|-vyEBX8b@nw
zwu+vIW(#Sj`c_-@+VD%3LVQ<tGVz@{db3m0zMJY>owoa@?lV2--t2Vv)HuZIw@!7R
zs8jy`>pVVNf}ZxT=9?I`P9KEgWTc_7l)u2WWc!xo*fLL#>(P~+Nn0!MS+t$Gc4dpf
z6S+Po^937gjyD^_)y8mryxnN;T7*RZx!DTZq2}0TUyDvm)Hqer^q}R6ragHr%gGhB
zoDbfdkI|i3?6z8~HdgS9O8j#8mG!Y^YnN->o5_86bE>f?p}W@V^;WCd0=s|4``)AH
zIb4?kxR3Z++Qv?CcXmCvRoju@IiAHg!8Z-n$Lnh*r``cIp9ytZa||l*n8QGFEDb)|
z;md+s7yk2tGql|0$@|~Z;Vx00dk<}Aue!3k!BOjj;B?&{D)Id#{#c1WQQ`+m{OJ;Z
zw#1(+@voQoH%t7568~0-f4juLQ{vw(@$Z%R_h;~~><926%>C)(V06~4FaNFBI$9kV
zZEmiOt{var9ICY&&GEID6|qMV+j&pAv!lWJ+lbGzB=?DP=YG#cdu4NcOJi&2e9C>s
ztM$?P7%vo?=lN9EHD29TYmZFimqstY4(guI9gX4k2=`~T-59OseLB7_d?WS7){*wK
zGxGV{V|cZ=*hB9@=cNsKYW!aGZSA4aaK!j?)H_v$YpPElzS{VBGaNC-LFjGo-@@Vg
zIG2ObnS>s4bmnl#RBWuA`_hvgLwslLIxm<x)O?0mQ(O0MUZIKSgKJy2qIYKt!09`1
z3Y=eFe#JWdPKP&g+?}0Mid|ab%S(JkiC;K__hf6}6C>4$p;oOuG~#%D`8Nn>Yn>&O
zj#fU8we*R4t^+sQBX#cs=dyW5{_bpe2Jgzo;H~BkVte!N9L9NFvvtEjt2w#N!|QrZ
z5;Ju(lh@xQHJ|uRk}KsqH1*rrlkGs<P-}5ur}sk5hc!m;tm&>ov*tU%>H6=T!Mn1%
z;GOw)Wgi2_9e>Y^xbE!U8N4TZ0G`+9_Xp!UxBa2<yY9ho`~x%kb!ShN_%jI~Al`S}
zcfj}N3Nl<y>$9-aOlfq>wKUFm)HvUV0nVX)eK*bP@5#xSPw@T@&G{R_)8lsMzbm?5
zhFM4arbHd)SBw0E;55J7{MO`ne*gN8orsw)?y1P97WYzFE6+9VBjcQF^tBc>zez)L
zebkOjH18fYzfB{~xvRx|9m6|EEoxSCJhciIeZ6PYPDwQH7_~)-=KR(Cwz+~eiF+i*
z3~r2%Ypm9l<L>;9N!Qprryn0pzQ+E2;TR_r+`l#Ce!p64yl0U=H2bT$M()*i1lOa2
zS!cXsQD^YTW1VVIr|(}{r@WtB;r}Y!y8OnT%L(y}^yAepK9(_Kd}uZwk9*P4{EMfv
zH)<7PUH|YOLk8ErpZPk5*mz_5$>1CvBQ$mM`*%a+KMrg@HS;)cvAXf|h(C$8f*r^B
z^J(RtHTi;qJI;J~C2zA_?}zup)y(JEr_#pQr-2<?&AzASZKtnu^*rfUcRpv(#(eCn
z-?co4wvt<S=7*hq<j%(rJF&WU2Jx=bh1f~p%V@1<KC#{*Volu|UPbGe`c@TupwQgA
zE5H?O7Is#_*MjY1|8)g6XZWuI`^eQdVy?UO#NFgtdFRN@Q>8VhnsvwrF}29E8SE2$
z1RQlaj(?n|@%)4^%5?mVVD~lp?*{i^j=z`Ib#?#sJMTL&^XuP7tKah=zrWy~72|FH
z6w9*RmjwT_oO?&i$@Bg>+!%FDyqF5i`4@$zu2!La5wnjz&*DL|I(z4wtl?ld|K+-C
zCiiYu(|>4zV{dGaK<lG*{k^x<=JPKE-iPAU^#3A@7;|{1+1K%orFH$)tViy>7JK5F
z9tZbbJ`w-P;0oB9?7xuKHCB&3x&&+j<1YmpuV(yeTK7}k@#Jd?E!M+(l>aj02MV7#
zBL4<7?}6YO;eNkIfA3xXV~slBHl7dn%d_m=>OJf_cxTETXD6+5Rx^*>ds9vS%>}lv
z<GL;ngRMjVlZD^4U(AlU@7}Y{dxRKm1!u&3&fmu~@naOvvtydedAOc(?<fD(2<~0s
zw{39!3li>L_<b9G^Z9)n-1u`6uHSFm)W1C8j<=%3{l<-W`}@5b+<3oPQ@**x{bo)5
zezT_B@5Pk+y%^l_?n=1h`OO%9x!;W`_q#FpBK!{|{4}`VjN$KtA4s@<zZb)Af4>*y
ztJyW*)x$8~-_2+jv1h(#hZp)DV&q4{)qH0zX4l<gwbgjNi(dxzo%0U%U5h*~2kQ%d
zJlMXG_XM!oM*8h8#ySyhKYbgh$KTD9!0L|ed6Jt`Uj?kD&$A?7Nv;aMe(*wCb^V?_
zHT&v66|Cl3xb~-kJ%7%_c)2lR&!O+YPdM+3z@B^CACl`#+Osep{fi5~ZOnZM*c^{D
zmcNx}gVkKqpOQm<24*gEi(`)Gf@6-Je|h9x1~#uX`Yy;<vM;X9d0@Zg?4!>Z-w*Xz
zqw~S8k$SAl1z^|3c;6qn*muWx^H=Dz61;-e7=6arPd)1UC9w5H?p5G)&0hg`&5g6a
z+!)u<HFYg)Bi}_}^Tm0&SRdx^rv3A^fT#Q9n#Os*4D9`!p7+b)>QUpXz)|A>_zG+d
zW}IWljS+i)n9K2A4Ypoe&&4|0D={DaR~3HSn9q8!IpSPg4OZKLc`oFy!OUfD@r&qw
zcW4{oF}F?N^4zXLQ;+dp3wFG36z|(FgFV;kj}~Y1bzuAH`w??|J*|)R=(`qEvmSAb
z{|0b*{Oi%wWBfOQ)A8Q~cYO8Rikhlm`{^^cdoYN(2l`^4YK87TTnCniZwRcWZ!@ht
ze08w<t8bWA9%p(B*fVV(ea0B8?jFXxZC`31w&E+V!w8ys)X)H%H|F<du=6v{G3CZM
zrf0~twza=^@(p0uG2Vkwuv)wa`qa{M=@}~5ypnts@{Qv=m)5bz!1CDFZQv%BzB6xu
zt3OhlnHE^h7|%=_?BkiyH-V`+4{_wa5nRswRy6fE4>y6;JSYC<+zh@2+l6^o>vvry
zY2SwFyPCTEZrWXFz<kWTw6~%;NBcToxpDUKJL@*EdH65yvyb-e5c|aX%8he<V_vo~
zuRY-M8MqxyJ=W|UV6|AYcY%FeGkx#G)Lb)h<bF4}ocld!>ak|;1*^Gcapv9!cMi_^
z4qCZ62Wa1qt-<UU@pl%w@q5AYcuzh6_B}C=zBo7Pu1VBwtM4w_xF<gdc22h5lOLwt
zhxyp|BZc2K&ihBf=7@XpSHNmVV6K<^LzubDEsnl-gY6sh`Zze&;U2Kub?_VD6W~)Y
z`$YVGV0GtlFReV*;eN2|U><$Oe2iAzoSwH&g4H~4=JZ^C3a)O9=j*{D#`9&2=kU{L
z>KlkT9sUs5e(H|%0Ik|;zUcf8`wVy<<~R=*t+~7}J`2|${1I?p!aoOgjp9Ce6l{!o
ze6KwQR!_gz9*3J-pWkbzGS(Adb!*&DEBC#9lJ@ghTBG$GjrN7Y=R0V<Uo13WzD9mK
ze+lgX7VoXxICD7e(_qJQ+^1;e`dqtb!LFCSXK3XsS-08vo&ztTHMjlrdk*)}dX{Ws
z-M$P?*Xb*8wYV4bso6iyxo7fc+$I-d{te~#-{F{#f0tfBO?|Y77TCEShB@{Tv_Gfy
z-&Uf(HT!RmK1X8F-~azxZ2zMZJQr-PV`=@n%&}Hsa|&+!MFn@B>aW1e=Xx0{k2Sa$
zY<_+Allwk6uS+oJW*>d~$)|oPrqADoHMBmyx0lkc#?-7)>{^M{V$4^A9W(fAz|JfF
z2DuVEfUU)hwKg^5(zTxl_nX+qe;;za{JYYBON_Pi-#Z=uL|WfZ*JnLujAt)u*ih)f
zuZG(<-j_{awe)?t25vun@%PPZ!Rqd_IQF>$_S@X2AB$Q~23y}LwDEWMwcvC=uP^Z}
zB|ci>trFi^;<qGx8F){@V|`u^x2|-3u7j&TU%b<A0IRu2?vFLA#roC2YH<e*fz>u+
zvCqTcLCkUVN8dVFpLw0TJbYWhY2F50ExsqbJKu~AV&>H!eQyBkGp}nX58oI#&D(^l
zjbl;oHt-;3Uj5PcEnt1-buZ-Mn*gVIC*f-T+hXM14j#nJt3Ueg0P8cadnym#E^wOn
zt#Gv)vB-N9cn~wM{^)x%Sf6=!(#pg4HgKBvR=C=3Eb`t49>mP6Kl;8Mtk1ljF?slI
z2fLRyaZj8?`wmPk{O<(2C*gk=SS|eT2D`T5e-BtK{O<+3M&Z8$thRwSX+G`yFvm2n
zef7t0`uo9qFy~|Ka^oZS2f(gV<h~267XA-{op1O*1XlC6egW-=G5e^;H^N82j(I2M
zm~!LHS)u(Xrr&!+9QVj<@Dj}D49t7P-+(g<d>ZXp1ztqE81sHPn>Oy3yTNk`Zr=wJ
zeh~Pf65pS2;~z`7@lTZafrK0Xbi$2)w#1)Hxba_4xbfdC@fQ+q{I?Qr{I^T|I|(=b
zy9qb`dnNw;g2#P$58O5Ne8irA9IPJq;k{tBwV3<q8mmS8C%|fPAKnL6i~I0?@B^6p
z<v8m0^<DWSrqA=$OZ%xp)9*fh8gu_0*S(aB-N*Y1yr0&7@tyQA_!!K1<Mc<Zp8=<}
zeip7af_Vp6>myjysvfm|uJA>zkHXcW*2lq7tLI2=t)3yr5qo~@7qvbKwpQbf(;u~d
z9-P+t1-RNc7PWp69JQ)PtzQD`i&~!ot3|EPfTLE=u-y8drF9&!XV-pF>zBdSYP@m!
zqt>s0(^|g@S8HQY>({_ht9sP>t6+Um>+@i>sP!A*sMYsJZhgKxjw617)_zgzuYs-A
zc;oa(t-lUVYyAzlTKv9v27VK>R`0O%y#7|<i&}pht`@ca4mfJ{-IZIP@2%sAeP`_#
zwf-*HT8%eOf7JSW;I!7?hpX+zVqO0L9JQwB^$*eYMXi4XR*PEy7#y{FZ^*6BJHv6r
z-WT?ZTK@!Wt;QRtKmBg4z|X~e&cVE&m(qIIzepQ*eFeM%^EnUmu3tvGJi+IK?SBES
z_q+ZdU~{neefvwWeXhcs*L>Q)!qnIEMJ)g8LR0sd)oJoJ^85{&<E4527Ooz7{tm3B
z#WNeA-Olf!WBonmc*aKVe*l+j{6{qP$o)@XHO+ioNgH|o8LeF7zo4l{o___aY0gtK
zmliet8`$xTy|t+O-@(?dFUI>1u<PxbcoyWwh5tXn<$3)Vn)-TnX#wpIF*VJ{dg9#v
z2<&|B#L{={$8ddni*JmdfX$;GWB(Lf9{a!1)MM=b0jp`TrZM)<&|>WLo%uPsz8L#|
z!RAqSt(=pSuqJ)hvrh@ldYp^r(74%X>d~(YY+v>0*9~@!?dKZHjWb6Dt*7w&{)l~l
F{ttZo+gktt

literal 0
HcmV?d00001

diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
new file mode 100644
index 0000000000000000000000000000000000000000..65ff9449e2f060c1c5caf5569899c4fc0656f15c
GIT binary patch
literal 4696
zcmeHKZB!Fi8orZECIf^p3BeBE6Q~q~>JXI=0h=!{V6;%9YhBLiK-6Ld8^y9Bp54iZ
zBx=y0SW((8wASOcZcA5Mi*;=Ru~LePl<lF^TBs?dE>+4}k9O^L?*y}T?f%$vcK4k9
z(R<E&@B2Q_z3<$)_s)G1?Mk)gx*#mF<k@YEp|Zd5d|Y)f0svSk1waVfTxfb|rO+y&
zp^zMc#0_Z63JPf)Vu4b)Y*Bh3=GsT^i;I`7pj{BB0|V}Be}F}BFUA9an+s=rXe2Zg
z8do6Yw-A5yBOd4<z`1we3OAm!03OK0amx%ULmvhJ0|ekWOrTv^sOKI^K&}JgYK{A{
zTm--?5DT$T{^y26PH0YSRP~^R7@^4Ci`Yan1$bl}x~rU(<@3~gL)3=}g)9pv7Y8dI
zbLd<$N>ku?8AK}7KAqUrnBXl|m6)BQ)-QIVDFs*wd^v7z(asTW8H7sS(a`!y&gHwo
z0A{?!xrmE>im$rExj+X%AsoZKQ@DjuQ+m8U2tmLO3V??prby5QQLDG3`w|Y@V0FdT
z=8{SEJDnl6!+K)NTo8mKn8YGSByOsUc7mo>*TF<!RZzC%{cV(X{cg}!bHQ{w{3fwC
zo_DUDNH$fYs`R+0aMjMt&;yT@g~@NH%75?mC7x&qwh`)=o5$taj$}a|^p7^i;#vAb
z2}R9!LR;`EGC{pJ<CSCwE^th(8<3TB`Cgx_Ut-G;N#upm%Vk;{Pg}Pe$l#PgxNMW#
zzL~LSRM=}P_|Ol1y;xk;FWw4k5Vn;TZfX@)Q^G2jcw@lX5^y#$LaswU_SHSzYLvpQ
zjBs;5Hbw|*rh77Cm|?o-7^GT-uus*980NW~!H+Bm)4Jq|`C2fxjk#tTk?%o%JTICT
zP@Lta%a!`vAi{q>xj<Aaojx2~Ku)D8LEvkkeLGnH@#Pq%M}r`NZ@}$3%j2;e3=5k^
zaA!^t*DQ;a=v~gBq($$cnyUh}31y3eoIs|LH04Li)JTGiGASEnhR4NCE7mpf8Y02K
zT;P2YB=>P?YT>!q0^2CQ-c`U0*Ul%6A=NqjH&XYtEUv5OAKci9sM<nuTIw2SmCp)3
zH0!%Vo#xu~#nmCk#UB%BIBGjcQRlESDg#GVWtAH&Y*utVU!H_VogLkk^rVH&4M)*o
z7tyPxH5%~X>GI0*j0de#M0Y>o#ysXEymZm$i;pC&wXoW7wBqA-VnAE0S++c)UEZES
zToizCn`Q`9lK7HOh|1`C`4wHq_Ln=C5DV5=SY0@}dYHC(=t{~FCh{_x##)ep*}8nb
zuBp@G=f{Rmd#A>G$C7aT%tQL6v(v8cx>otx*Si`jcf(m4a=ozQK(Ei?cz=6mL)-py
z7inW%$V)9P6g6Ylyc5|cJA-(e8g<ovgp{0AoSO623HlE~yuImdsoj4)^itivRAb43
zv4YG@m9it$5ZUaBY*j_kPXTPb8><?@o(95=l(5>}uZlI48IP~8E=){?*`(<5p2^aQ
zUf<Zj<x59UZMY)$fp_SPcNCo|suH{C!W!|z&SU!5zG^t;tSVxg<lWVcsV#M{?u$-s
zX_1!hhhxX5MtUumE?pk)8iy&yj-aKnirfQ=%mtmfM^DVlJlUDzAveQuk#(<DzXZ3j
zMLG!YB5t2c;6BgA*%<F=%vu?e=g?%?x}9oPSi417-MmpW2Ex83#5Y*+T#-7i%M*LP
zEoLn(@OQID_wn<3G<i;bUf&kyW|OcE7uM?RCX;ZhiC?qDZldgweG>76OFT-4ZJ@)h
zOo}Iq#d8U7>}7e(kUX}lEN(5mBj=Pgr(cuZ!OuNq&9(7!`X*&HjIgdkm|0=p1Z&oA
za@kFcuqIhpb4viz{))9ff!jBd_D#+9+GcawwMp@{5%ITf@dV8;?`eziwZ-)L>q})Z
zz492ZIkvY9`^crq?$Km7`|G#6vzjzn4t`Ek_Z;}pJC>Y5eolM0$SW6aGx4`3+iO^R
z<VB76)(GaK#9xkJHwp2STRiFtyIvf2gSMJ`koZeV?m=B;v77^!5^SUZNQbS!gqc5!
z`TqGmUzB}EhN9<NJ+qd_)m%9DbZs;)I?2p_C$(VdiPRAh@7?V0a^gSezL6e<Kc})T
ze<gR~g_h5zG6-_>nbMkHIG@|lqMBQJ`Y*Q-A|IDT^5*+}c;6WoxJMFCR0aLXbo=}j
zp`nD3<YY@mAtD^$kPr!?*W&ZK)kDg8+>ioa@t&+(JypLn`@;_<@WK`^-1JF-EA+ti
z+`nzsqO`+0#5QX3Aef}2!2o;-SahmoI{&2#{d}|ctyG^Za_ZC5?Q8yc?0Zh%N`t5M
z?f?|hd-YBW@qpg#hTOlUcjUC*dH<nz<v-CoN#Z?vm;T??JMJ!3Bslzf7oG_9E<EwB
z-Z`P(dG!SA5h;0UV8uNy77qv>W!w!!y)1>#T$4=Lp3Uaj4%O9v@Y{B~>kd&-zg=mb
zyNHloQAMHV<df^(z7tf3zyCzgx|vDwomB=3w@@!PyM!votRizR4DbQhe8SW`FTaVt
z9i1T=rnR;QtGLY#CVH3dJK~Tg{bqY+WkN@p11}st`XU7{@Q!m5woB%wc--{=FSU<h
zt{FJ(<K{PA{<qo}3LkR6_66HoB|u%opsnQ2*0kPMBA77OL%CZE94$yI1Am=O--Bw{
z{-=7EggcI_NKvq{IvVp*N%*RK=035D-m7}tQC5L7))6Rz=MPcTMHE?;k*4h+2Kx0M
zqLuj3P}G39(NW?js#e+EZl)9spA?sp%rnb}h9-tf2YPxhjUPc(5u9KsIy*ZuPoy9h
zZK4ZjW;@Tlrr%J+48lwQpe7_Sb>I4}iK)gGDYXX5g|Bz)@>Sn^y-+BY-!Bv@;m^A8
zvo8EgT_~gA&r<j`hogmQ1pHpO{~NYGAS+kCAE7ljX=lm`+GG{V6ioKC@+(ZX0ujsf
zx3MPP{6sLbIBpF{#Dai@A)ZI(+f<p@-Kb5q-02}*9*2s@Qlxer6KN!{HFP-xC>+}n
zowfR|TNs@MQI<moqbh928<J3E=$<A*DLA;BqyTO~1j2YJ>#0zKK^6=2jB$9RP7Z65
z2e78tCUbZ*hZ&>+%n;YAk{#wSS#<!D#UDc=+Wq;A0c<e-ZBN804vRr2vfXj~i4=Dn
zZz9rtUW)P~0KZR77bCPg?We(wX=V;&9D49Bv2qeq9Hm69yu5x19|bD9wH%QRQ26$X
zHgHrP!FU0pBt*%pNI?d!E1M?=y`+FlR{{d%U!9>J4Ye#Im*8xX7AX)B0QIl~+YK(O
zGNK@y*Gd}LyD7JtBd$^v{*+(J0x3;f1i3iFr4;jS#xQ%3n=&R0xhZ1K;r!b}5^O5n
zDDoN4l09elrn~l}7^(Qhy~>40n#|jr$`K>Vi%%qOk!JVV=Q3P-@~Wu#x?$x)>`lm4
z@Ncc8-DiU{**yYu+eGLjf^~<Q7qFf*D}g3r7~Ff8!#2mK5qwL%8HOBVbV0wy>q8yN
zC^lhIbv2&1(&Lu?K!~CVWIlLi!aDc}+HM)0sZAg=DBgNd^o=mZg);)HzDY}0D$yZe
z&`2M9NaBvd4k>pqLoEIZl05$4;S~w&d{O2bQcpV~S;?mt(;@?Sk?PQ-7b6Z!zxa$&
zKn1IFWc&y=8UMN%l3S+9A+w)UCqZ(0w(aBIN1WP=T2n+NeTg`QM)oO){9?6U_o)`7
zlD)i#c)@H*!4lm!TsUCyheb<u-*VwFFO6fDKC1hk3qQB`L+RtX=kvjOa-8=t$Ie)#
zdxHxnEnHZ(AFg>CT=TruRJ^rExo}0Jd0Srs+!Q0+lpeS#KQ2CwE}YS6-lpFK*()3y
z>NDD+-DfSx!WE~GZE4iV+{N6xksfikQou?kxK;S4K<sDS#02dmN%;AArlK~RALz!=
zRFV&u_pmx0bYr#@f(x8oGz`#WoC|KKXs}lJJH$F#Pt9>N&w5D9X#L9KP`Y_dmTN@1
zdXDNQSH1O7`RoneR_+w14G--)ffv0*Ve;8aRX1g;*OLvg=@r~r{&U71^=#fCi4H0o
NI-6%IxogE={tk-I{6YW#

literal 0
HcmV?d00001

diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl
new file mode 100644
index 0000000..09504f6
--- /dev/null
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@@ -0,0 +1,125 @@
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+ByteAddressBuffer _49 : register(t2);
+ByteAddressBuffer _161 : register(t1);
+RWByteAddressBuffer _251 : register(u3);
+RWByteAddressBuffer _267 : register(u0);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Transform sh_scratch[512];
+
+Transform Transform_read(TransformRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = _49.Load((ix + 0u) * 4 + 0);
+    uint raw1 = _49.Load((ix + 1u) * 4 + 0);
+    uint raw2 = _49.Load((ix + 2u) * 4 + 0);
+    uint raw3 = _49.Load((ix + 3u) * 4 + 0);
+    uint raw4 = _49.Load((ix + 4u) * 4 + 0);
+    uint raw5 = _49.Load((ix + 5u) * 4 + 0);
+    Transform s;
+    s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
+    s.translate = float2(asfloat(raw4), asfloat(raw5));
+    return s;
+}
+
+TransformRef Transform_index(TransformRef ref, uint index)
+{
+    TransformRef _37 = { ref.offset + (index * 24u) };
+    return _37;
+}
+
+Transform combine_monoid(Transform a, Transform b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef _168 = { _161.Load(44) + (ix * 24u) };
+    TransformRef ref = _168;
+    TransformRef param = ref;
+    Transform agg = Transform_read(param);
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3);
+        agg = combine_monoid(param_4, param_5);
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+            Transform param_6 = agg;
+            Transform param_7 = other;
+            agg = combine_monoid(param_6, param_7);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        _251.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat));
+        _251.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate));
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl
new file mode 100644
index 0000000..71e9935
--- /dev/null
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@@ -0,0 +1,138 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct TransformRef
+{
+    uint offset;
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct SceneBuf
+{
+    uint scene[1];
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    uint n_trans;
+    uint trans_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+struct Transform_1
+{
+    float4 mat;
+    float2 translate;
+    char _m0_final_padding[8];
+};
+
+struct OutBuf
+{
+    Transform_1 outbuf[1];
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49)
+{
+    uint ix = ref.offset >> uint(2);
+    uint raw0 = v_49.scene[ix + 0u];
+    uint raw1 = v_49.scene[ix + 1u];
+    uint raw2 = v_49.scene[ix + 2u];
+    uint raw3 = v_49.scene[ix + 3u];
+    uint raw4 = v_49.scene[ix + 4u];
+    uint raw5 = v_49.scene[ix + 5u];
+    Transform s;
+    s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
+    s.translate = float2(as_type<float>(raw4), as_type<float>(raw5));
+    return s;
+}
+
+static inline __attribute__((always_inline))
+TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index)
+{
+    return TransformRef{ ref.offset + (index * 24u) };
+}
+
+static inline __attribute__((always_inline))
+Transform combine_monoid(thread const Transform& a, thread const Transform& b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _251 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+{
+    threadgroup Transform sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) };
+    TransformRef param = ref;
+    Transform agg = Transform_read(param, v_49);
+    for (uint i = 1u; i < 8u; i++)
+    {
+        TransformRef param_1 = ref;
+        uint param_2 = i;
+        TransformRef param_3 = Transform_index(param_1, param_2);
+        Transform param_4 = agg;
+        Transform param_5 = Transform_read(param_3, v_49);
+        agg = combine_monoid(param_4, param_5);
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+            Transform param_6 = agg;
+            Transform param_7 = other;
+            agg = combine_monoid(param_6, param_7);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        _251.outbuf[gl_WorkGroupID.x].mat = agg.mat;
+        _251.outbuf[gl_WorkGroupID.x].translate = agg.translate;
+    }
+}
+
diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv
new file mode 100644
index 0000000000000000000000000000000000000000..d6f84a21400b2026e7c3fe77e601295dfd814514
GIT binary patch
literal 7568
zcmZ{m33Oa#6~|wirRfGOrIgYV6Tow-v`81Ai&V;%g29%8;66+ylV&uTiIYhIMWqPt
zqUWHZh)Zz=H{4M~5Rtm!F7ASBt^3Lj`un|kH+>!CwfEfr{oniD?|%1NChgM>pPnRb
z$;_lXnbVe}-@IfROp?q>x>DUgaMr+46OEyxRv&Y;0sAHGX`C_pGp-Zefs7R^RUL0Z
zb|DWTbIGA^9zxnQpX1v3hnd`p(ZTHf1BL$0oBIb&DU`;?3lohCN6UrLO0`@Vs?^J+
zMrC{1xb9qBrCJ)D9LfiD5|OW?let-3a(2B~ofxjw$F`P-F=0DXUR)a<o+vjGm^u8*
z#TF{nq4EVwCmQw1Qe$~O#q!A&E7ni0SkIVtdXyw}(mMBSa5@Uz$-e{g`M`EU&Tp*P
z!0t+I`_Zt*(PD#TGIDmZ06jHnp<XTyJ!UDyw<iY@->hSPvKU;djSbqqv0AlOdGy%j
z+lN;pD_5*<UEp3%ns*VoiimR@f*#E9Qgr14`Ho~cxL!OzJsYn3N!VwV%GL6TlT-bc
z^P7-In{P$RG;^#0lOv_fxuI#VG(`L4roAfc)0_6{u(verW14n*at>euah@@BDZ^Kw
zn{{`m=XBd>p?|bCSR6gAy1iB^HY&C1Y0l-itI@Hy?*MNc9j%$i^P}yz_xyAu@5DZ-
zRvoTvGo~xGjj0yOqvf%3wLw9_yXV!yc(E}uQ6?OIdK&MYIKMK~7~%XC8kNy<I;QFK
z){T@a+eR9DosrLTZ+M~DEHZfSCI=ztsrkF$#~Y>5aK!xHm#nIAO^xZsSFBcR;fOhA
zz^CUobJ!nyF%w}Cp6sdhd0(2b(K&nFnY;_W*}LYUF^8OAiJq$6lbr%S@wMpUwr${!
z<T~(pv0ikq^?d|gNpVN=nSFd6$qg;`jV<;~E%wd(*qzDk*b^g#iBi4TD2+H@clzFO
zmIgROtw$-X=MKh1J$Hd?jgfL4ZSA|!du#7V_UvPKCih}by#-+35jnqjN3<XPj&vmV
z!*8B!q)oM_+J3dk#-O*;UOm{Pz+LH^%{tDl)nCwGuT7478LV*z$<HW{)#?{E&#<=N
zSb3}vJL!3l*Ck)B*K4lPdF4IZ@M#X`ot5S2C;w4tT=REi6L!282g7e-eAtI$`}H$k
z&O70FACAKizp(qj&Z+Hhl4oxTV!pV)kx$Mu>(|Ozrkr;y;>;`Ood~Wy<NO^6&N0T#
z&2ipIIp6Gv^FGSOdacE~C>J%$IiFlF5@Tm%+z~m>yJy_e9OwGwmZe<B)cuM%<GvZ=
zXOHFDQ{0i>y?l>%!rNo}V~=;Cox|TBYl|})`GPZE&Yn2#KD6uaMU06v8sn@(F6!{S
z=IhY5j_~_;hjsXSW&Um^G=J=0oZZOpoN;dD@^SfnwhrTB9?x&S4(&e1`#JAyZ0lHr
z%;zMUtB=u+7cU?t){U9;;Bn8JoPX^!&wyMnvF>yD`>5GH);3PxL5$xY-HR+F-u`O4
z9&P^y({}D2Z2uN9M!pYwK3dLv&bJV4emUp!oQUNW>pv9Ti?ks}W_AzQPusQFBgcmS
zNU${=O1*LKUAy;L+qsX<V&v9mws+idk#`l?&v<|M^*LYo*MXxh`4f=*_*1dni-;cp
z+pDm*g1xKeKM!p$?3aGmJ&2fJ{|H*Yz0}^3+4kGszX@#(&Zqy)=r%;&^=bP?$?3l`
z!};}G1?S$_E8j1DJ^VH9yCBYs{{JTtV-D}OW1a8aXnP}PJ=!~w=$Grg2Jt)}2>U5?
z^y$lB1D7!FHlSxj&b(hi+f#Y-YTur5vF=^iZHV#T$b9CA{67GDzQeu~?49-u-wTd2
z?0NDnk#l{%8QSK30PR}joLAd-LoVv@-H18_`po6{_RMxJ?XJvrznwSUn4at{i9X94
zKQoK>{ct{Q-wWqh#?E;Td@t;q=T<qET>9;wpJ(Sjm`=1g?8y~~=T6(RadpmK2);IF
z>-X=Zh<ChyCxxxwzm>w)e?!jJe`C%z|4l9S%{klg{@oM#jK8bJ-rZvF$?T1+@=EYq
z5$}a}{dm@BU*CqvUrLPj+cWMe^ai?QPvkb@^<8)mvJ>%Lx(10n?*;1%`+Z=?M&9>>
z<qktG&gS|6*m3%{P>*Z-AR_PF_Fvna`aXon>9hCRYsuyL{xGrtk=K81#);h*&%pic
z_eat0yZRpTUXT7b;-~);X#MI~%O}C+xSqMZtDi>X?9l_{(Eb==F2{<a=FftoX3v9m
z<oz7jyw>R1&|XWQUF+wOIf!HQ8RL18k2QY*?3(4xX&=6bxOU?`PugP7lJVv@?iOS*
zVvIgx9Cr(1AMA;1QAfRB0$XqF->qP|LB#R;<nn#CCvpD13f_gRLL%qa!17W5*TL4|
z7~ijNBDW*PIfu3};ycjha=ve&tyk^7eh2+s#83bC(E8Oe_xHi(h<*JbSk5=nebxRp
zVlKyu?}k4Ct=x&k+I|FXUE7c0<YT^{fSvEQ?EL=}ybh6%p8gE%IDJvq&%yG(XZN!1
zU!wi2QQt2RIcpS0O?QJ^Yx)(OeAM)7aK5JBU|W-X)bv}h<MhS8{0{8C=!?DkJy_no
z@omx$-ygv4tG?Z6?eP5(Y`nfbXze(se*$|>9iz_}^T@lev2OKa?aiO@?LzYB?l0K#
zQNv%s=8g6J4ea`ib53nzoYQk<uhow4LjN7HkI}P#WZWRy@%rTQXVr5Rd-YFnYmNVc
zlfN!|cm55Qa~+<8d(eKK1AYHN<g85`xgQ2Ugyh%%2)2Bz|G(ho`dLjI|B(mbdJtpJ
zq=vcZY4FDSncv^?jx63-*Q#xtW9Gqif*pfO$M}0c9dVB6kG664C)S{Tm{DsGf4`?=
z&q4fVBF8aTH+oivXP{>zu75wY^~pUBF~<60zk0#)VfTR@8^2eFgXIoGVo#3%_age@
z_v-Or`A3*j9JL+*UV!+`N21mP!Pa*WI{xlh3U+N_FK@BewAdS3?2}vU(_8E<IeQ`a
zoXn2?JOSIf^8I-tw*0O1(DS_vkvkG`kF8lQ`nM7+7rj~ymh;TT-wwxuS0K)#KgJ#h
z)@NSVt{uL$;C$Zo*mAy6k@v~q$g4lb9uL-MUVEq=zNdimc{gIqoq$B%6Ty*Je~di|
ztk1meg?9K(0q67fW6Pb2MBYu{$g4lbo(9%uUiVZxd`|`E^PYh%H-JRmr-37{{usL%
ztk1lj4ejuq3C`y|3tMh05_!)CM_&Cg_UT}K=JkwehwogldwCIW)BfmZAadb<CfGd*
z|Fghy;eR&R-iH4<V7c%=7i^Eh|2(kV7H*R7Q~_~L^Ey_)=lwi%5%CW6Al9yJeB>^H
z?Nj6)0?UQJ40gTY9|kwM=6kmd+cEOaIfAYr&N+xUr?zqC^sRV4qThE!oPS3aWBVP9
z_>L?@FUsLVz`iL<(7qx1$B^krthowyjB9pIdsIW@H}gtrk7t~`Ut3e9I`X^_&iV3r
z>e%v;X96sz#5WC~Tt+{ftARM5v5|Wc+*;#yIQhtZK3GmMUwckRo(tewYrGInKJvT>
zET_0m#av3%_+qg08M`B^`y#Nl>x=nb0=D0Ap0te%|4YHG>v|cS{APA(9{OTLPVuuI
z@81seC5Y=8L{=g0pLgozh`!6I%^a5^7bEg9_hsPLxnBV%A9KGFET=?IWA0bM#oX(_
zG54#%`eN?afXyRsudL^_h&Ac69`C{H5bJR*<}&W}h<uE@9PC*6821LSJ$9V;N831a
Pcz@oA==c7Jy+8j0#a<mO

literal 0
HcmV?d00001

diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil
new file mode 100644
index 0000000000000000000000000000000000000000..0d16d04a7a85d55f33415f8b69781288a088677c
GIT binary patch
literal 4824
zcmeHKeN<D|mOnT5Cb@wm+>nTgeBD4%1hhARfCg#4#311{R`lUhR})Z&wm`$j@IY|A
ze33+n5)}I)SRd4nr9S7q*!I0jwNCg@BVt90eQ2o#OL=w%EkkXWI=-1XH_D8wv(~Ix
zYu+ET*34P!oU?y>?{n_HXYZWfjiyMYzI5@CXxo!-uK)gMg6tgn^dbTP7!3!24d-mw
zb+B)Sy$tpd*h`_o4*R;?JPL<+uH~&;Et|vi{OV7ze%%Jj0QJ<|f}hO)3LEo99RmP5
zmbos5G9A;4E%Z6SUQn(pR2I5f02lxh9D$29MR_`U69W3wK)p)sza!-W&<=H>RuaI+
z03qybOMI&`faS4>)aCmSJ2!!eOJcC*VzX!!L$xndb(BaIW#Qudkkr>)T91fS=emvo
zNvdj6%lEX#_3M>uE$#{1H@mULxvaG?SUR>y_c;ApYz5;4&oM)bs2>coN*>W7;xWI8
zDxcCK&;pPL*PwJPR)dPv;SLUhfL$a2i=ZwwPRk*;HevhYj@seywY|$D3fJv+huV+o
z2vZv1;0Q}#l_CN^*-N=WTc_t>Jg}vb_QV5Sq~@)?psV(}>9O!fVqYxdnvX~{Rbxt7
z%$vA!cP8)GuNcFmkCUZ;*FPEm(Vh@Hp*rThC)IqI$Xp5K6RpvBRzZ8*8t>D2cYM2s
z9MZU#-Z-+`>e8DNuM5oZ@Pwi&zq7{R%qVr%m!eSS{a(*6AL8$TBM93;v8y`S)g-&z
z!>^nZ>gI$>1DkdjVtxC9Tdjh<!@#Z?Voee3+Gm~yKHTA%=M*$`vY}7;I3Mowe21WB
zMVQ7TjbEh!Q+0+1rg3Qt@*j1fK8)dJc#^^9$mS5ix}p)cK7s^uj^bI30>I)3HfplV
z4Cu0>n;}qwAc~*}fFc49)>CSXS0oYWJZ{dy)t4~!ZKleEne`ku5UDqcR7jj?T+t&U
zO$teyYGU3prDdK{Ipjsim8il9D-nQtxO+XE7oa*%E<>nUCN2rBmZN8q_jlwsRHFwg
zyAfqqsJx@0HK{l$q&?|Ld$*-tmR}uOl7EZ9gxJ=DBss`3k{LLrG?rCZ&FPU2RJssf
zaCsth(cf6jSwc*E%SQ~T2=udbo=<AZAp9s0+SQ|=OpNE<CUlVw>3iA_n~rrC5P#fi
zHD?Pk>^8Nv&u1Z*>b}rxB0#yjcXG0K_QL3$OTD8P`e*MAOfAH<nnc~tmZp5ye7`)!
zeZP56*<Sc;t^Jwjy-%lZ&p8JA7d>-)Q1-i^-(F{t_t55F92~=tE|=>{Q}>>(1J?=&
z&u)k0i%sG$q54eG(e7(EXkD|WJ(AY+xP}f)>^$>HQ>&~ix$mFbk2LI0?jT7?^T+9#
zA9r&Y@0I&ke23a7!X^V~Y9gAJg{EuKWkqO4DVpiO1Ne6p{8PR&xBk$5wDmyujRKA}
zX=~lm+HF<UZ@gWxj`1Exa=3YmxMRz~v9`_=YR8{boe{Yk7XImB?uJVl35o&FM0eo8
zwNGwd*#jfx99{$0*3_G}DJv@8t})iYEvnZsPVps&^AAS8yE$fL6CHG$;U86SqaQc?
zIrrxaGcyxBXfW3WFxP3gIjVmmYP$$o=~8Fe``jusyS|QH?XBcafw21o|Dl<m##P1i
z_C$Z$6}6pW2Hk9tgXqeDdZim(IaueeF|iwPcD>eVGO>4<(AqkuiF8T^1^j6be}W3D
zpu)bM;m_##X+(eY9ck34G`iOqvz^){KW~!{sh59<W}mlZ+fn)8jHuSYZYX7EmO888
zXm*-BPLqLMo5-$x#Dv@a((HU4cUFp>RbFSk*OKyJhW}uk|AU`DO`*ksuBgebsKMYk
zW{et;M)g~w2aK%GJ?iBF^>S};JlmJmrp|Jq^0vO0;X|KV<s+!v*XIsM*^MT&if~q&
zQ2JdAPE)}7%M<J!-k$79b65q*pVjjxe0*8}l_UU5;Mp-vEX!9>e|fj%8{-o(5Bs#U
zCux05?e%MK*GJ;qj}1%DCFlO^qvUZe<L{S0k;h)`dnj9gA5z-ZpUR#--0{_H1|hDw
zxViS{?n4!ulxdqUeE$d`R^ftM$g-fkUZg2p;UACxs2sm(di?3v?83E#K)zfs0X3{)
zS`#WjbQ(OpPbIdb(@Q=Rz#EsJQ}soLhxhC6vj3$<gHbNqsJT_W5-JxeC@>7820*J^
zrxgVy0C-35Y`lX+wM+6pNlW$pwua^>9V82yuReqK>5F`o4)t?<H2`h@Exr;z<E#FE
z=d0rX#8-m&7x+r{*X64e_@1e8t{`6t;~`%O<Dc`D8}e1Zj&SvG6$}-y;XNKRK1Wz&
z$zH%6Fq3$~wnW1I&T@vmy}@zyH$JE5DN*WZQdrVf6QZw`3ouLK$8Vf{%4xu_yv}(e
zVPR}{c_E2g$zxs*Tj@$#W66d9ANDLGOzr8b+Nj6L=Z(THo?4?F&Yi~;bb+q2y+9O>
zkb8mZIA96P^ykI@N)FQ-9%yL}dtNqh`s;HT{f+Wd4hyk&3V>>j0qYbGyhnr(7Ka23
zZwHB97zxc+a=7My%2$v<K2VCeAtlw3tbTGK-mWqH7i2}=h<IlzLpi#S!<f|(L}{O%
z*T=GY6W>w`@<7dcW61|^Rp-SgS0CvJ$JPgCHct;sP7U7~JC13DsoC?-Q5ok_yBAy5
zI=R=k@t3$y>wfp`p40B~HRd*HUv+D8N5gOTM<#c4glGRPTsts!Z*sQt&hXg1spHsu
z`t0d<{{MUX|K7cAB;kJ^So1<GKZXA9Kz|jEv?=R}*RVA3jh@vc)`v6KlFSAAkYi$f
z)v-wZ1!S3`fJ0EZz)HG)6%YO6SsY+B5W9M0rbC&<20-GhJW14-I?S1Jgc9~huO<#^
zOo7ypG(pt+?%o&FBb8gEH}{B1fLoD2u)N`AM>q)8PGUzmHmY60-Bv0uHq*NmM%C#F
z)^lTX<l)RZ+eKbjl)vY$V-f$Zp|UTYU1jS3sE?%?;`#+&@goUk8d%*ZFH>OGK-53o
z_aTCG)OS{T&L%G!^GqNyhi0DXOuf$+t@Xz+r-MVD|1dV4qNMZ)?$5%|d1mCH3W7v`
zG7ZqvG|>7N(}3xQz~aX|p%&&=U=5ib&*c&T>oM<O=9p~bVYxzvAzg{*3R%W<i+C^i
z5jV8ZL%yIXTBZO5h9=F|Az{`+aRF|=fl0HFQK5Oun1|g>E~yBhTf7mOlRZoI&5^U_
z`KK=Oel-;O`CZ2=Gfh`JbvY5`v$E3tEvw#bEPFCpQYa2wK2CXB)|Qa5YW$H{<-bfY
zava2r>=bvh<fbHW`Bb{6W#e09tnfC-i4e`4LHOe(7UVZZR8_(<jG6=QAmR*?+6Ejv
zSF*%xM0*#|>JwIk%IIW^xtZ7`V12+<5pJ;<W=nZD?1Db5IkK!LX2e)&TQb(SBBjE|
zY&1A8^8V-vyYD+|ng6gSIwEtXH|IPm@sp@{E3k159)z1}UVH5o?H&zC78|TDS&e9%
zmeonNdRFWL8>qqlJPc;h$5j+0NPh@JP8)oj12~MSQ(}H!ulk2rMp2JPxJ1B>Bzn@o
zfmZuSKIU^67a8JU^wv_aTh2Lo5I68_uMTP`nElVdU^o|@2NvM>_1r>GN`9!8>5(DJ
z5dWfrsbH(*B4~_$W~>PsEknDY(Pcr*q&TIxH9Tc)wqVTEC&9K-SM`Ll%R<bXwvW7u
z`K*>D{<uycUjNiC+6W=CG(f`x?xw<^b?FcdZysuDqapJ0C=KVeHjQkeBYt^2xVEcF
zv<1S6m+#SVgT>OQ3qc=`GFh{-B$&7xFs)Iko|XcbR$D-klS|IX7?u_R-so10ml(&P
z+jqRlk}o0ym#HjI3kPe|k>e%3%vfRKqrq_c-ITsQSg!;or%z#Z5SV@n;#n;C^Yx86
zUbkQJAw41e03Pp2itG0w#*khb93g0cgj}M*PCeIrOIhYOoRtzh%Datn82bG{L2%z0
z1AA_WEABCG0oz8_Bt9_zM@Ev1ck(uFVA|?Na}@}6>I84`mo%)uN5iBmypy+RxZ_dK
z`Yms9(7D02nzjz}PR_z)svEsDJT=Q(`~wYlo}l4&j>i(B;PKy=v64V)R^XEO1UTPU
z=C*_Y`3Mb;s>>lTFCV?+P0+HEh*{;E(v4$O<oAM<3NP;o5s^8VyOY}-xxFvPb&=Oy
z8GG@rW6U2mSEWi86#y)WpDNkc61~51@Wf=v>V(+*q{rUZu3X-ipL2Zk><hWT7_rqk
zk4L}1a{2J9Ijo(tvb(kxCfax!ppS|a{>#1ND-|Z*`+p)E=wZ{UU1KgK3M;H=<!f<&
z5uDX3xeWJa>$EiRd1suwc!<99BD|fr3zMbdvheuy;x~ulDRBk<3yiy6()N-9Ub?XT
E8=u8t*#H0l

literal 0
HcmV?d00001

diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl
new file mode 100644
index 0000000..42bbd38
--- /dev/null
+++ b/piet-gpu/shader/gen/transform_root.hlsl
@@ -0,0 +1,94 @@
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
+
+RWByteAddressBuffer _89 : register(u0);
+
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared Transform sh_scratch[512];
+
+Transform combine_monoid(Transform a, Transform b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+Transform monoid_identity()
+{
+    return _23;
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    Transform _93;
+    _93.mat = asfloat(_89.Load4(ix * 32 + 0));
+    _93.translate = asfloat(_89.Load2(ix * 32 + 16));
+    Transform local[8];
+    local[0].mat = _93.mat;
+    local[0].translate = _93.translate;
+    Transform param_1;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        Transform param = local[i - 1u];
+        Transform _119;
+        _119.mat = asfloat(_89.Load4((ix + i) * 32 + 0));
+        _119.translate = asfloat(_89.Load2((ix + i) * 32 + 16));
+        param_1.mat = _119.mat;
+        param_1.translate = _119.translate;
+        local[i] = combine_monoid(param, param_1);
+    }
+    Transform agg = local[7];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_2 = other;
+            Transform param_3 = agg;
+            agg = combine_monoid(param_2, param_3);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    Transform row = monoid_identity();
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        row = sh_scratch[gl_LocalInvocationID.x - 1u];
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_4 = row;
+        Transform param_5 = local[i_2];
+        Transform m = combine_monoid(param_4, param_5);
+        uint _209 = ix + i_2;
+        _89.Store4(_209 * 32 + 0, asuint(m.mat));
+        _89.Store2(_209 * 32 + 16, asuint(m.translate));
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/transform_root.msl b/piet-gpu/shader/gen/transform_root.msl
new file mode 100644
index 0000000..2c58c06
--- /dev/null
+++ b/piet-gpu/shader/gen/transform_root.msl
@@ -0,0 +1,129 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct Transform
+{
+    float4 mat;
+    float2 translate;
+};
+
+struct Transform_1
+{
+    float4 mat;
+    float2 translate;
+    char _m0_final_padding[8];
+};
+
+struct DataBuf
+{
+    Transform_1 data[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+Transform combine_monoid(thread const Transform& a, thread const Transform& b)
+{
+    Transform c;
+    c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
+    c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
+    return c;
+}
+
+static inline __attribute__((always_inline))
+Transform monoid_identity()
+{
+    return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) };
+}
+
+kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    threadgroup Transform sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    spvUnsafeArray<Transform, 8> local;
+    local[0].mat = _89.data[ix].mat;
+    local[0].translate = _89.data[ix].translate;
+    Transform param_1;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        uint _113 = ix + i;
+        Transform param = local[i - 1u];
+        param_1.mat = _89.data[_113].mat;
+        param_1.translate = _89.data[_113].translate;
+        local[i] = combine_monoid(param, param_1);
+    }
+    Transform agg = local[7];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            Transform param_2 = other;
+            Transform param_3 = agg;
+            agg = combine_monoid(param_2, param_3);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    Transform row = monoid_identity();
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        row = sh_scratch[gl_LocalInvocationID.x - 1u];
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        Transform param_4 = row;
+        Transform param_5 = local[i_2];
+        Transform m = combine_monoid(param_4, param_5);
+        uint _209 = ix + i_2;
+        _89.data[_209].mat = m.mat;
+        _89.data[_209].translate = m.translate;
+    }
+}
+
diff --git a/piet-gpu/shader/gen/transform_root.spv b/piet-gpu/shader/gen/transform_root.spv
new file mode 100644
index 0000000000000000000000000000000000000000..7824d0914594eb18dfe4cd7dac5988c2b784e2d9
GIT binary patch
literal 5280
zcmZ{mhm%xQ7Kb173=E(sqEfnmgBVZ&6)_+NR4g1tF<=g3Gc!QDGt)ZLVAjS2Cd@fu
zM$BT)88OFQcUP<YD_pfo%kO*r4)u7JmpXNR=XcV*=ib*tY2f(5NirZAnzST8CXF>B
z83>ak!;(R%o;CZJ*^~Qgos*~TzKapVlTsRJ%tp!Jq#50W^pq>TI%Xrwkd4WgAAwLZ
ztz}M$f0)U;7|lfFbDLp?;G5f5?(Lsn?dt(sMb6bzu3-;KZEMwx>n_*2z|HJsm@(Df
zYNfNi(%IEptJKb&0EKU4x{r=(&%8=+S9>F7LVvAqQAcgkx^9ye&!37+*>(Egd#F|7
z+(}YS>y|iYlFZBSmgtTQx1lTN*t;d2$%5|oS>4rn<?goL#np~-ty1l6o0-NoC3Df;
zklm@>oE(c@w=1wSkaNWhwEsGTrsR0|ndMsfz(w=PYff$RI^opr|3m>x?0zdLZc6&l
zXO{cQ&fU9Nw4Pl`>e%H43-C23OTqmM+xt8E%C(M#jdObMoO8BEDI7EHwH$x+S^=)s
z7IxXwUMtc0-mBKLOUY{NzUtX&owvY^I;CU{IQF}Cy_}}xlbk(^9==WA$Y|scD#c94
zW4nTLu5#Y3dB>p#A+3lp&e#=svzVh?2{GRJZjHFY9@j8a+c~+9e`Nk2GJ6a3NWKog
zk8xD9o_KRm^dCiiKHAjw+tF^xY-7h_4?!CvZ=IoNxmJ7|qV;Ky<1E87EU)ZDjJ-x8
zNmAcG>`hZUNhXqQ{$_|V@``vO_3XKM#>vasYZPLRzRl?2*;|nTh&{Bu7nl1SgSL+n
z{>fnD&7Xp9MO^x|cT25C{#3BJo+bR#!RA|k23o)S)jlM%t>a7&MT@-y{fD8=m7kv3
zz9%{ThiBNF=yxR8`lIn5jdl)l#vg;Wzr1(mTa5jTp{s9BZY<chrET5`Xzy9heQ5jM
z<n*7MVRJWyJp&!{TLQL6?C&zLiSF-mp!<`v&J}3im%M$nmu6g?e;M`w#Q1A6pEaWX
zE#SO;8@Brl|Lx!w3Yh<0=5v4A-)FY-o5(FXcfS+&=RGQW@V2byK3(3`FEIX1a*y6=
z8)8juXFfM)>+i_fzW=_0y{KT<bGG^ZrbT`8R}}1(1$$M&UY)b8=WkoS-kO5FwqW}k
zXZ!)2+BbLv;=A-)*q1Z=9_JwPml30VRL1!h_hV|#NG{?}0?Qr1+<i}{AigobPkqMu
zzD`B-+1s<9hB#m2j)HSv?TB$M<NU5qM~pSj{@TWw(}FG|=I}JqoEhkOh&lcSn4@i+
z*gNz+{L0;TqJ0DEpV{Yp^a8}Ce<51GI?h`GTjLq}`@Q@Vk#j!3QA4{6v6i{w$n6H3
z8+K1-UqUTs-HXV%zlf=Vop0D@f~`@{a?S!9BOkNv2RmEqnKuMoL*$)-?^ZkRY%w_Q
z%zEZ{XJ;e&;(pG_IMllPIT!4^kNeR!F7D?%uyykHb3V2?aX;F|P2&x@*9(vli2F1y
z&UGQ!b2&%Pp>3>p<-Iv?b<FA_aDG-7W6MQfeRBD?;Jrn^OThM9g2Wsz1<TvNj@CBb
z@99cp8Iu2=uFCkxzZ$!anB(u@wa8M$IBRMfBVLZSmixO7?fKNc)f><^A};+mq4lfd
z{5ONG5pVTYu-q+(Z&mwx#9HQxe}eb7SMfJD&UOd5c(yy?<fGqRVEetFz0JG9zGeBR
z;QU_hLCn+lf6jOx+U0)q-HXV%A93`*A6)GJ0GxdEe-NDS{}8tQ<>MQB7;K)txPwQ)
z-hsZjr$@o^-opyCcK99x%jsK*)(+p};5wpj6<Ryq^%G#<wK@8XsiWn+!#KD4@9yDA
ze8o9D1t%YScp6+s;(X75J)d#*)HcSRz9DC=Hh&8GS;RTUZ_INUHyLfdKDqq6^bHmF
zJdJvO&(9;n5OY^&cHHX=*v0d_2q*tU_GVrJ%lR$%W?n(Nd^7r9M&vw)IBLHNF4lex
zPCnkl>tH$GNzCgFu=COvGkp^*A7{|^{Lxpf?=7_Z)b}>pJ*oX>-$lQNxb(k|)~}A)
ze*m^de6t^d<&H!=ul75LwagVq?nhvAW6vLhowxg1i`Mpx8=^l&{B1KQ;y=rH<39mw
z$J_lJ?Ax`DK4aFP<)i->;I^FoCD{GMJNgQ2jC}O`8f=~TTl5Y1L}WB#eEjYC7Hq7`
zGkk~s7h=4z?nm3W_}lYuusH(|bG}Fa2Qeq!vbJ&Fhj-&l)p4f(g7atiAGTcN>yyjh
zw|Db9UH3x#f2ZGo@82~PvBw7JKhUGl`8a3n8jkGAnY^owa(H8~`J15qf2#g1k->;(
z9D^Q<nB!bLhrX>4`8n)HyA_dBT+YZ9v$D=O#D4iYTW5UK8ILWe#2N@tE@waX+6HMw
zjEy~R3oh<)J2?5My**e?v0f>sqs|1l;vRQ^laD$(g5?y?r&vpgJ?;dypRr4_eNP0t
zcYV=s64-gio6|Nf{5yk-=h_8Meh$-ULGOylDK7Wp`=5&59q~NAuiX&uZxDJ9MBj4u
zW{o`&>&Qp%y}-rZ)8OQz_ugPRC1x7E_koMv{vJl}eZl&o_kLjO$U7_dvp?dV^tqoI
rsc!7YvslZx12aDI4#GCK!8h_|Vmssb#<Y#IhTqb`h<?8xvER?1w|4b1

literal 0
HcmV?d00001

diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 322a0474a261faee6cde5517d4289877d32c61d6..072910ba768cbef1a69b0c78b0bee54cfb67b86e 100644
GIT binary patch
delta 121
zcmbQUj_J=jrVSHB^w}6#87?p}FmN$2FyzIT6eZ>rGq5qRgT=UkVqlT@{Is;<)Dj?f
wbH9i(w=zPDGEhtdh&`ajfV6<bbT+S*Pvl|bo6N7H%f<#2U}M;9rSm8Y0BA}W2><{9

delta 30
mcmeyfj%m(1rVSHBHXjgC;@-@q5W~aBGdW#Hck>*b2U!5S-wSI1

diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index d86ff9bde3010d811c803dd9f2ec5c2d8fbbd7c8..54814b7315f5608784dc15dfe2a1feb724b9332d 100644
GIT binary patch
delta 121
zcmZ2*fvMpl(}pKP`fLoW3<Asy3|tHh40-V-MTvRE3~UVSU@>l>7+54eKP{~|wFJoB
w{8&hyOBta>87QU!#2!#%Kw3azI-CE>ykukKoBUi^myHc5z{apyNQJKj0L{4;EdT%j

delta 30
mcmZp;$h6=B(}pKPn?;1>xHd=0K4D|znQX73ySYS#rv(7E8VZd7

diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index cb41a4b..52ea6e4 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -38,6 +38,12 @@ struct Config {
     Alloc pathseg_alloc;
     Alloc anno_alloc;
     Alloc trans_alloc;
+    // new element pipeline stuff follows
+
+    // Number of transforms in scene
+    uint n_trans;
+    // Offset (in bytes) of transform stream in scene buffer
+    uint trans_offset;
 };
 
 // Fill modes.
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv
index 2d7363d729fb4f83c0d7040a9acc673ecdfe5662..b123f1890fa44c5595ef70e8cd5ffb1d2900ced1 100644
GIT binary patch
delta 119
zcmX?7yrg7<4VOL}11p0bBLf2$0|P@|d`VGaUNHk313OrZ8z=@AiO)|<D^4u|ayMIZ
too7}?Xi)}=X#lYY)EJN!keJS9Rz5`*M!w1FLb_~hKmj&}%|Sv%>HsjM69fPN

delta 28
kcmZ2da-?{J4cF!zu5-+rllWv<7<new3+Zm&B2=Ib0HDJO@c;k-

diff --git a/piet-gpu/shader/transform_leaf.comp b/piet-gpu/shader/transform_leaf.comp
new file mode 100644
index 0000000..e158c50
--- /dev/null
+++ b/piet-gpu/shader/transform_leaf.comp
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// A scan for a tree reduction prefix scan that outputs the final result.
+// Output is written into memory at trans_alloc.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "scene.h"
+#include "tile.h"
+
+#define Monoid Transform
+
+layout(set = 0, binding = 3) readonly buffer ParentBuf {
+    Monoid[] parent;
+};
+
+Monoid monoid_identity() {
+    return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
+}
+
+Monoid combine_monoid(Monoid a, Monoid b) {
+    Monoid c;
+    c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
+    c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
+    return c;
+}
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    Monoid local[N_ROWS];
+
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+    TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size);
+
+    Monoid agg = Transform_read(ref);
+    local[0] = agg;
+    for (uint i = 1; i < N_ROWS; i++) {
+        agg = combine_monoid(agg, Transform_read(Transform_index(ref, i)));
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1u << i)) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
+            agg = combine_monoid(other, agg);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    
+    barrier();
+    Monoid row = monoid_identity();
+    if (gl_WorkGroupID.x > 0) {
+        row = parent[gl_WorkGroupID.x - 1];
+    }
+    if (gl_LocalInvocationID.x > 0) {
+        row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+    }
+    for (uint i = 0; i < N_ROWS; i++) {
+        Monoid m = combine_monoid(row, local[i]);
+        TransformSeg transform = TransformSeg(m.mat, m.translate);
+        TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (ix + i) * TransformSeg_size);
+        TransformSeg_write(conf.trans_alloc, trans_ref, transform);
+    }
+}
diff --git a/piet-gpu/shader/transform_reduce.comp b/piet-gpu/shader/transform_reduce.comp
new file mode 100644
index 0000000..4b72b11
--- /dev/null
+++ b/piet-gpu/shader/transform_reduce.comp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The reduction phase for transform scan implemented as a tree reduction.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "scene.h"
+
+#define Monoid Transform
+
+layout(set = 0, binding = 3) buffer OutBuf {
+    Monoid[] outbuf;
+};
+
+Monoid monoid_identity() {
+    return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
+}
+
+Monoid combine_monoid(Monoid a, Monoid b) {
+    Monoid c;
+    c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
+    c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
+    return c;
+}
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+    TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size);
+
+    Monoid agg = Transform_read(ref);
+    for (uint i = 1; i < N_ROWS; i++) {
+        agg = combine_monoid(agg, Transform_read(Transform_index(ref, i)));
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        // We could make this predicate tighter, but would it help?
+        if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
+            agg = combine_monoid(agg, other);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0) {
+        outbuf[gl_WorkGroupID.x] = agg;
+    }
+}
diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp
new file mode 100644
index 0000000..e8e0019
--- /dev/null
+++ b/piet-gpu/shader/transform_scan.comp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// A scan for a tree reduction prefix scan (either root or not, by ifdef).
+
+#version 450
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+// This is copy-pasted from scene.h. It might be better for DRY
+// to include it, but that pulls in more stuff we don't need.
+struct Transform {
+    vec4 mat;
+    vec2 translate;
+};
+
+#define Monoid Transform
+
+layout(binding = 0) buffer DataBuf {
+    Monoid[] data;
+};
+
+#ifndef ROOT
+layout(binding = 1) readonly buffer ParentBuf {
+    Monoid[] parent;
+};
+#endif
+
+Monoid monoid_identity() {
+    return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
+}
+
+Monoid combine_monoid(Monoid a, Monoid b) {
+    Monoid c;
+    c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
+    c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
+    return c;
+}
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    Monoid local[N_ROWS];
+
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+
+    // TODO: gate buffer read
+    local[0] = data[ix];
+    for (uint i = 1; i < N_ROWS; i++) {
+        local[i] = combine_monoid(local[i - 1], data[ix + i]);
+    }
+    Monoid agg = local[N_ROWS - 1];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1u << i)) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
+            agg = combine_monoid(other, agg);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    
+    barrier();
+    // This could be a semigroup instead of a monoid if we reworked the
+    // conditional logic, but that might impact performance.
+    Monoid row = monoid_identity();
+#ifdef ROOT
+    if (gl_LocalInvocationID.x > 0) {
+        row = sh_scratch[gl_LocalInvocationID.x - 1];
+    }
+#else
+    if (gl_WorkGroupID.x > 0) {
+        row = parent[gl_WorkGroupID.x - 1];
+    }
+    if (gl_LocalInvocationID.x > 0) {
+        row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+    }
+#endif
+    for (uint i = 0; i < N_ROWS; i++) {
+        Monoid m = combine_monoid(row, local[i]);
+        // TODO: gate buffer write
+        data[ix + i] = m;
+    }
+}
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index bee07aa..8d21fe6 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -1,6 +1,7 @@
 mod gradient;
 mod pico_svg;
 mod render_ctx;
+pub mod stages;
 pub mod test_scenes;
 mod text;
 
@@ -20,6 +21,8 @@ use piet_gpu_hal::{
 
 use pico_svg::PicoSvg;
 
+use crate::stages::Config;
+
 const TILE_W: usize = 16;
 const TILE_H: usize = 16;
 
@@ -123,7 +126,7 @@ impl Renderer {
         let image_dev = session.create_image2d(width as u32, height as u32)?;
 
         // Note: this must be updated when the config struct size changes.
-        const CONFIG_BUFFER_SIZE: u64 = 40;
+        const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::<Config>() as u64;
         let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap();
         // TODO: separate staging buffer (if needed)
         let config_bufs = (0..n_bufs)
@@ -295,25 +298,28 @@ impl Renderer {
         alloc += (n_paths * ANNO_SIZE + 3) & !3;
         let trans_base = alloc;
         alloc += (n_trans * TRANS_SIZE + 3) & !3;
-        let config = &[
-            n_paths as u32,
-            n_pathseg as u32,
-            width_in_tiles as u32,
-            height_in_tiles as u32,
-            tile_base as u32,
-            bin_base as u32,
-            ptcl_base as u32,
-            pathseg_base as u32,
-            anno_base as u32,
-            trans_base as u32,
-        ];
+        let trans_offset = 0; // For new element pipeline, not yet used
+        let config = Config {
+            n_elements: n_paths as u32,
+            n_pathseg: n_pathseg as u32,
+            width_in_tiles: width_in_tiles as u32,
+            height_in_tiles: height_in_tiles as u32,
+            tile_alloc: tile_base as u32,
+            bin_alloc: bin_base as u32,
+            ptcl_alloc: ptcl_base as u32,
+            pathseg_alloc: pathseg_base as u32,
+            anno_alloc: anno_base as u32,
+            trans_alloc: trans_base as u32,
+            n_trans: n_trans as u32,
+            trans_offset: trans_offset as u32,
+        };
         unsafe {
             let scene = render_ctx.get_scene_buf();
             self.n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
             // TODO: reallocate scene buffer if size is inadequate
             assert!(self.scene_bufs[buf_ix].size() as usize >= scene.len());
             self.scene_bufs[buf_ix].write(scene)?;
-            self.config_bufs[buf_ix].write(config)?;
+            self.config_bufs[buf_ix].write(&[config])?;
             self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
 
             // Upload gradient data.
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
new file mode 100644
index 0000000..0613585
--- /dev/null
+++ b/piet-gpu/src/stages.rs
@@ -0,0 +1,209 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Stages for new element pipeline, exposed for testing.
+
+use bytemuck::{Pod, Zeroable};
+
+use piet::kurbo::Affine;
+use piet_gpu_hal::{
+    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+};
+
+/// The configuration block passed to piet-gpu shaders.
+///
+/// Note: this should be kept in sync with the version in setup.h.
+#[repr(C)]
+#[derive(Clone, Copy, Default, Zeroable, Pod)]
+pub struct Config {
+    pub n_elements: u32, // paths
+    pub n_pathseg: u32,
+    pub width_in_tiles: u32,
+    pub height_in_tiles: u32,
+    pub tile_alloc: u32,
+    pub bin_alloc: u32,
+    pub ptcl_alloc: u32,
+    pub pathseg_alloc: u32,
+    pub anno_alloc: u32,
+    pub trans_alloc: u32,
+    pub n_trans: u32,
+    pub trans_offset: u32,
+}
+
+// The individual stages will probably be separate files but for now, all in one.
+
+// This is equivalent to the version in piet-gpu-types, but the bytemuck
+// representation will likely be faster.
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct Transform {
+    pub mat: [f32; 4],
+    pub translate: [f32; 2],
+}
+
+const TRANSFORM_WG: u64 = 512;
+const TRANSFORM_N_ROWS: u64 = 8;
+const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS;
+
+pub struct TransformCode {
+    reduce_pipeline: Pipeline,
+    root_pipeline: Pipeline,
+    leaf_pipeline: Pipeline,
+}
+
+pub struct TransformStage {
+    // Right now we're limited to partition^2 (~16M) elements. This can be
+    // expanded but is tedious.
+    root_buf: Buffer,
+    root_ds: DescriptorSet,
+}
+
+pub struct TransformBinding {
+    reduce_ds: DescriptorSet,
+    leaf_ds: DescriptorSet,
+}
+
+impl TransformCode {
+    pub unsafe fn new(session: &Session) -> TransformCode {
+        let reduce_code = include_shader!(session, "../shader/gen/transform_reduce");
+        let reduce_pipeline = session
+            .create_compute_pipeline(
+                reduce_code,
+                &[
+                    BindType::Buffer,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::Buffer,
+                ],
+            )
+            .unwrap();
+        let root_code = include_shader!(session, "../shader/gen/transform_root");
+        let root_pipeline = session
+            .create_compute_pipeline(root_code, &[BindType::Buffer])
+            .unwrap();
+        let leaf_code = include_shader!(session, "../shader/gen/transform_leaf");
+        let leaf_pipeline = session
+            .create_compute_pipeline(
+                leaf_code,
+                &[
+                    BindType::Buffer,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                ],
+            )
+            .unwrap();
+        TransformCode {
+            reduce_pipeline,
+            root_pipeline,
+            leaf_pipeline,
+        }
+    }
+}
+
+impl TransformStage {
+    pub unsafe fn new(session: &Session, code: &TransformCode) -> TransformStage {
+        // We're limited to TRANSFORM_PART_SIZE^2
+        // Also note: size here allows padding
+        let root_buf_size = TRANSFORM_PART_SIZE * 32;
+        let root_buf = session
+            .create_buffer(root_buf_size, BufferUsage::STORAGE)
+            .unwrap();
+        let root_ds = session
+            .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf])
+            .unwrap();
+        TransformStage { root_buf, root_ds }
+    }
+
+    pub unsafe fn bind(
+        &self,
+        session: &Session,
+        code: &TransformCode,
+        config_buf: &Buffer,
+        scene_buf: &Buffer,
+        memory_buf: &Buffer,
+    ) -> TransformBinding {
+        let reduce_ds = session
+            .create_simple_descriptor_set(
+                &code.reduce_pipeline,
+                &[memory_buf, config_buf, scene_buf, &self.root_buf],
+            )
+            .unwrap();
+        let leaf_ds = session
+            .create_simple_descriptor_set(
+                &code.leaf_pipeline,
+                &[memory_buf, config_buf, scene_buf, &self.root_buf],
+            )
+            .unwrap();
+        TransformBinding { reduce_ds, leaf_ds }
+    }
+
+    pub unsafe fn record(
+        &self,
+        cmd_buf: &mut CmdBuf,
+        code: &TransformCode,
+        binding: &TransformBinding,
+        size: u64,
+    ) {
+        if size > TRANSFORM_PART_SIZE.pow(2) {
+            panic!("very large scan not yet implemented");
+        }
+        let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
+        if n_workgroups > 1 {
+            cmd_buf.dispatch(
+                &code.reduce_pipeline,
+                &binding.reduce_ds,
+                (n_workgroups as u32, 1, 1),
+                (TRANSFORM_WG as u32, 1, 1),
+            );
+            cmd_buf.memory_barrier();
+            cmd_buf.dispatch(
+                &code.root_pipeline,
+                &self.root_ds,
+                (1, 1, 1),
+                (TRANSFORM_WG as u32, 1, 1),
+            );
+            cmd_buf.memory_barrier();
+        }
+        cmd_buf.dispatch(
+            &code.leaf_pipeline,
+            &binding.leaf_ds,
+            (n_workgroups as u32, 1, 1),
+            (TRANSFORM_WG as u32, 1, 1),
+        );
+    }
+}
+
+impl Transform {
+    pub fn from_kurbo(a: Affine) -> Transform {
+        let c = a.as_coeffs();
+        Transform {
+            mat: [c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32],
+            translate: [c[4] as f32, c[5] as f32],
+        }
+    }
+
+    pub fn to_kurbo(self) -> Affine {
+        Affine::new([
+            self.mat[0] as f64,
+            self.mat[1] as f64,
+            self.mat[2] as f64,
+            self.mat[3] as f64,
+            self.translate[0] as f64,
+            self.translate[1] as f64,
+        ])
+    }
+}
diff --git a/tests/Cargo.toml b/tests/Cargo.toml
index a987c9e..1f0760a 100644
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -6,9 +6,18 @@ description = "Tests for piet-gpu shaders and generic GPU capabilities."
 license = "MIT/Apache-2.0"
 edition = "2021"
 
+[features]
+default = ["piet-gpu"]
+
 [dependencies]
 clap = "2.33"
 bytemuck = "1.7.2"
+kurbo = "0.7.1"
+rand = "0.7.3"
 
 [dependencies.piet-gpu-hal]
 path = "../piet-gpu-hal"
+
+[dependencies.piet-gpu]
+path = "../piet-gpu"
+optional = true
diff --git a/tests/src/clear.rs b/tests/src/clear.rs
index 7d8bee0..009360b 100644
--- a/tests/src/clear.rs
+++ b/tests/src/clear.rs
@@ -55,12 +55,12 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
         commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
         commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
         }
         total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             let mut dst: Vec<u32> = Default::default();
             out_buf.read(&mut dst);
             if let Some(failure) = verify(&dst) {
diff --git a/tests/src/config.rs b/tests/src/config.rs
index edc1140..2593ed9 100644
--- a/tests/src/config.rs
+++ b/tests/src/config.rs
@@ -22,6 +22,7 @@ pub struct Config {
     pub groups: Groups,
     pub size: Size,
     pub n_iter: u64,
+    pub verify_all: bool,
 }
 
 pub struct Groups(String);
@@ -40,10 +41,12 @@ impl Config {
             .value_of("n_iter")
             .and_then(|s| s.parse().ok())
             .unwrap_or(1000);
+        let verify_all = matches.is_present("verify_all");
         Config {
             groups,
             size,
             n_iter,
+            verify_all,
         }
     }
 }
diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs
index b3d03ed..3102a73 100644
--- a/tests/src/linkedlist.rs
+++ b/tests/src/linkedlist.rs
@@ -48,12 +48,12 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
         commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
         commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&mem_buf);
         }
         total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             let mut dst: Vec<u32> = Default::default();
             mem_buf.read(&mut dst);
             if !verify(&dst) {
diff --git a/tests/src/main.rs b/tests/src/main.rs
index dd6f4bd..0ab9340 100644
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@@ -25,6 +25,9 @@ mod prefix_tree;
 mod runner;
 mod test_result;
 
+#[cfg(feature = "piet-gpu")]
+mod transform;
+
 use clap::{App, Arg};
 use piet_gpu_hal::InstanceFlags;
 
@@ -62,6 +65,11 @@ fn main() {
                 .help("Number of iterations")
                 .takes_value(true),
         )
+        .arg(
+            Arg::with_name("verify_all")
+                .long("verify_all")
+                .help("Verify all iterations"),
+        )
         .arg(
             Arg::with_name("dx12")
                 .long("dx12")
@@ -123,5 +131,9 @@ fn main() {
             }
             report(&linkedlist::run_linkedlist_test(&mut runner, &config));
         }
+        #[cfg(feature = "piet-gpu")]
+        if config.groups.matches("piet") {
+            report(&transform::transform_test(&mut runner, &config));
+        }
     }
 }
diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs
index 71be865..00a69ca 100644
--- a/tests/src/prefix.rs
+++ b/tests/src/prefix.rs
@@ -85,12 +85,12 @@ pub unsafe fn run_prefix_test(
         commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
         commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
         }
         total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             let mut dst: Vec<u32> = Default::default();
             out_buf.read(&mut dst);
             if let Some(failure) = verify(&dst) {
diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs
index 9603385..5957e88 100644
--- a/tests/src/prefix_tree.rs
+++ b/tests/src/prefix_tree.rs
@@ -66,12 +66,12 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
         commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
         commands.write_timestamp(1);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
         }
         total_elapsed += runner.submit(commands);
-        if i == 0 {
+        if i == 0 || config.verify_all {
             let mut dst: Vec<u32> = Default::default();
             out_buf.read(&mut dst);
             if let Some(failure) = verify(&dst) {
diff --git a/tests/src/transform.rs b/tests/src/transform.rs
new file mode 100644
index 0000000..8cc485b
--- /dev/null
+++ b/tests/src/transform.rs
@@ -0,0 +1,133 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Tests for piet-gpu shaders.
+
+use crate::{Config, Runner, TestResult};
+
+use kurbo::Affine;
+use piet_gpu::stages::{self, Transform, TransformCode, TransformStage};
+use piet_gpu_hal::BufferUsage;
+use rand::Rng;
+
+struct AffineTestData {
+    input_data: Vec<Transform>,
+    expected: Vec<Affine>,
+}
+
+pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult {
+    let mut result = TestResult::new("transform");
+    let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24);
+    // TODO: would be nice to validate with real transform.
+    let data = AffineTestData::new(n_elements as usize);
+    let data_buf = runner
+        .session
+        .create_buffer_init(&data.input_data, BufferUsage::STORAGE)
+        .unwrap();
+    let memory = runner.buf_down(data_buf.size() + 24, BufferUsage::empty());
+    let stage_config = stages::Config {
+        n_trans: n_elements as u32,
+        // This is a hack to get elements aligned.
+        trans_alloc: 16,
+        ..Default::default()
+    };
+    let config_buf = runner
+        .session
+        .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE)
+        .unwrap();
+
+    let code = TransformCode::new(&runner.session);
+    let stage = TransformStage::new(&runner.session, &code);
+    let binding = stage.bind(
+        &runner.session,
+        &code,
+        &config_buf,
+        &data_buf,
+        &memory.dev_buf,
+    );
+    let mut total_elapsed = 0.0;
+    let n_iter = config.n_iter;
+    for i in 0..n_iter {
+        let mut commands = runner.commands();
+        commands.write_timestamp(0);
+        stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
+        commands.write_timestamp(1);
+        if i == 0 || config.verify_all {
+            commands.cmd_buf.memory_barrier();
+            commands.download(&memory);
+        }
+        total_elapsed += runner.submit(commands);
+        if i == 0 || config.verify_all {
+            let mut dst: Vec<Transform> = Default::default();
+            memory.read(&mut dst);
+            if let Some(failure) = data.verify(&dst[1..]) {
+                result.fail(failure);
+            }
+        }
+    }
+    result.timing(total_elapsed, n_elements * n_iter);
+    result
+}
+
+impl AffineTestData {
+    fn new(n: usize) -> AffineTestData {
+        let mut rng = rand::thread_rng();
+        let mut a = Affine::default();
+        let mut b;
+        let mut input_data = Vec::with_capacity(n);
+        let mut expected = Vec::with_capacity(n);
+        for _ in 0..n {
+            loop {
+                b = Affine::new([
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                    rng.gen_range(-10.0, 10.0),
+                ]);
+                if b.determinant() >= 1.0 {
+                    break;
+                }
+            }
+            expected.push(b);
+            let c = a.inverse() * b;
+            input_data.push(Transform::from_kurbo(c));
+            a = b;
+        }
+        AffineTestData {
+            input_data,
+            expected,
+        }
+    }
+
+    fn verify(&self, actual: &[Transform]) -> Option<String> {
+        for (i, (actual, expected)) in actual.iter().zip(&self.expected).enumerate() {
+            let error: f64 = actual
+                .to_kurbo()
+                .as_coeffs()
+                .iter()
+                .zip(expected.as_coeffs())
+                .map(|(actual, expected)| (actual - expected).powi(2))
+                .sum();
+            let tolerance = 1e-6 * (i + 1) as f64;
+            if error > tolerance {
+                return Some(format!("{}: {} {}", i, error, tolerance));
+            }
+        }
+        None
+    }
+}

From 8f7ed161a660f860ad8988388394b39039e58464 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Wed, 24 Nov 2021 09:10:30 -0800
Subject: [PATCH 2/2] Tune transform test parameters

Previous threshold was seeing occasional failures, and also fairly wide
variance in the error. This seems to be reliable, but hasn't been
validated extremely rigorously.
---
 tests/src/transform.rs | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/src/transform.rs b/tests/src/transform.rs
index 8cc485b..d696b10 100644
--- a/tests/src/transform.rs
+++ b/tests/src/transform.rs
@@ -31,7 +31,7 @@ struct AffineTestData {
 pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult {
     let mut result = TestResult::new("transform");
     let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24);
-    // TODO: would be nice to validate with real transform.
+    // Validate with real transform data.
     let data = AffineTestData::new(n_elements as usize);
     let data_buf = runner
         .session
@@ -86,27 +86,26 @@ impl AffineTestData {
     fn new(n: usize) -> AffineTestData {
         let mut rng = rand::thread_rng();
         let mut a = Affine::default();
-        let mut b;
         let mut input_data = Vec::with_capacity(n);
         let mut expected = Vec::with_capacity(n);
         for _ in 0..n {
             loop {
-                b = Affine::new([
-                    rng.gen_range(-10.0, 10.0),
-                    rng.gen_range(-10.0, 10.0),
-                    rng.gen_range(-10.0, 10.0),
-                    rng.gen_range(-10.0, 10.0),
-                    rng.gen_range(-10.0, 10.0),
-                    rng.gen_range(-10.0, 10.0),
+                let b = Affine::new([
+                    rng.gen_range(-3.0, 3.0),
+                    rng.gen_range(-3.0, 3.0),
+                    rng.gen_range(-3.0, 3.0),
+                    rng.gen_range(-3.0, 3.0),
+                    rng.gen_range(-3.0, 3.0),
+                    rng.gen_range(-3.0, 3.0),
                 ]);
-                if b.determinant() >= 1.0 {
+                if b.determinant().abs() >= 1.0 {
+                    expected.push(b);
+                    let c = a.inverse() * b;
+                    input_data.push(Transform::from_kurbo(c));
+                    a = b;
                     break;
                 }
             }
-            expected.push(b);
-            let c = a.inverse() * b;
-            input_data.push(Transform::from_kurbo(c));
-            a = b;
         }
         AffineTestData {
             input_data,
@@ -123,7 +122,9 @@ impl AffineTestData {
                 .zip(expected.as_coeffs())
                 .map(|(actual, expected)| (actual - expected).powi(2))
                 .sum();
-            let tolerance = 1e-6 * (i + 1) as f64;
+            // Hopefully this is right; most of the time the error is much
+            // smaller, but occasionally we see outliers.
+            let tolerance = 1e-9 * (i + 1) as f64;
             if error > tolerance {
                 return Some(format!("{}: {} {}", i, error, tolerance));
             }