From 178761dcb367cc3b71d13683c2d258c3587583f9 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 24 Nov 2021 16:26:45 -0800 Subject: [PATCH 1/4] Path stream processing This patch contains the core of the path stream processing, though some integration bits are missing. The core logic is tested, though combinations of path types, transforms, and line widths are not (yet). Progress towards #119 --- doc/pathseg.md | 65 +++++ piet-gpu-hal/src/hub.rs | 4 +- piet-gpu/shader/backdrop.spv | Bin 12328 -> 12536 bytes piet-gpu/shader/backdrop_lg.spv | Bin 12360 -> 12568 bytes piet-gpu/shader/bbox_clear.comp | 29 ++ piet-gpu/shader/binning.spv | Bin 16108 -> 16316 bytes piet-gpu/shader/build.ninja | 9 + piet-gpu/shader/coarse.spv | Bin 63892 -> 64100 bytes piet-gpu/shader/gen/bbox_clear.spv | Bin 0 -> 2632 bytes piet-gpu/shader/gen/pathseg.spv | Bin 0 -> 33908 bytes piet-gpu/shader/gen/pathtag_reduce.spv | Bin 0 -> 7752 bytes piet-gpu/shader/gen/pathtag_root.spv | Bin 0 -> 5852 bytes piet-gpu/shader/gen/transform_leaf.dxil | Bin 5664 -> 5664 bytes piet-gpu/shader/gen/transform_leaf.hlsl | 6 +- piet-gpu/shader/gen/transform_leaf.msl | 4 + piet-gpu/shader/gen/transform_leaf.spv | Bin 12216 -> 12424 bytes piet-gpu/shader/gen/transform_reduce.dxil | Bin 4696 -> 4696 bytes piet-gpu/shader/gen/transform_reduce.hlsl | 6 +- piet-gpu/shader/gen/transform_reduce.msl | 4 + piet-gpu/shader/gen/transform_reduce.spv | Bin 7568 -> 7776 bytes piet-gpu/shader/kernel4.spv | Bin 38780 -> 38988 bytes piet-gpu/shader/path_coarse.spv | Bin 43136 -> 43344 bytes piet-gpu/shader/pathseg.comp | 284 ++++++++++++++++++ piet-gpu/shader/pathtag.h | 49 ++++ piet-gpu/shader/pathtag_reduce.comp | 61 ++++ piet-gpu/shader/pathtag_scan.comp | 74 +++++ piet-gpu/shader/setup.h | 10 + piet-gpu/shader/tile_alloc.spv | Bin 14884 -> 15092 bytes piet-gpu/shader/transform_scan.comp | 1 - piet-gpu/src/lib.rs | 4 +- piet-gpu/src/stages.rs | 8 + piet-gpu/src/stages/path.rs | 339 ++++++++++++++++++++++ tests/src/main.rs 
| 3 + tests/src/path.rs | 293 +++++++++++++++++++ tests/src/transform.rs | 11 +- 35 files changed, 1250 insertions(+), 14 deletions(-) create mode 100644 doc/pathseg.md create mode 100644 piet-gpu/shader/bbox_clear.comp create mode 100644 piet-gpu/shader/gen/bbox_clear.spv create mode 100644 piet-gpu/shader/gen/pathseg.spv create mode 100644 piet-gpu/shader/gen/pathtag_reduce.spv create mode 100644 piet-gpu/shader/gen/pathtag_root.spv create mode 100644 piet-gpu/shader/pathseg.comp create mode 100644 piet-gpu/shader/pathtag.h create mode 100644 piet-gpu/shader/pathtag_reduce.comp create mode 100644 piet-gpu/shader/pathtag_scan.comp create mode 100644 piet-gpu/src/stages/path.rs create mode 100644 tests/src/path.rs diff --git a/doc/pathseg.md b/doc/pathseg.md new file mode 100644 index 0000000..fefde03 --- /dev/null +++ b/doc/pathseg.md @@ -0,0 +1,65 @@ +# Path segment encoding + +The new (November 2021) element processing pipeline has a particularly clever approach to path segment encoding, and this document explains that. + +By way of motivation, in the old scene encoding, all elements take a fixed amount of space, currently 36 bytes, but that's at risk of expanding if a new element type requires even more space. The new design is based on stream compaction. The input is separated into multiple streams, so in particular path segment data gets its own stream. Further, that stream can be packed. + +As explained in [#119], the path stream is separated into one stream for tag bytes, and another stream for the path segment data. + +## Prefix sum for unpacking + +The key to this encoding is a prefix sum over the size of each element's payload. The payload size can be readily derived from the tag byte itself (see below for details on this), then an exclusive prefix sum gives the start offset of the packed encoding for each element. The combination of the tag byte and that offset gives you everything needed to unpack a segment. 
+ +## Tag byte encoding + +Bits 0-1 indicate the type of path segment: 1 is line, 2 is quadratic bezier, 3 is cubic bezier. + +Bit 2 indicates whether this is the last segment in a subpath; see below. + +Bit 3 indicates whether the coordinates are i16 or f32. + +Thus, values of 1-7 indicate the following combinations in a 16 bit encoding, so `size` counts both points and u32 indices. + +``` +value op size + 1 lineto 1 + 2 quadto 2 + 3 curveto 3 + 5 lineto + end 2 + 6 quadto + end 3 + 7 curveto + end 4 +``` + +Values of 9-15 are the same but with a 32 bit encoding, so double `size` to compute the size in u32 units. + +A value of 0 indicates no path segment present; it may be a nop, for example padding at the end of the stream to make it an integral number of workgroups, or other bits in the tag byte might indicate a transform, end path, or line width marker (with one bit left for future expansion). Values of 4, 8, and 12 are unused. + +In addition to path segments, bits 4-6 are "one hot" encodings of other element types. Bit 4 set (0x10) is a path (encoded after all path segments). Bit 5 set (0x20) is a transform. Bit 6 set (0x40) is a line width setting. Transforms and line widths have their own streams in the encoded scene buffer, so prefix sums of the counts serve as indices into those streams. + +### End subpath handling + +In the previous encoding, every path segment was encoded independently; the segments could be shuffled within a path without affecting the results. However, that encoding failed to take advantage of the fact that subpaths are continuous, meaning that the start point of each segment is equal to the end point of the previous segment. Thus, there was redundancy in the encoding, and more CPU-side work for the encoder. + +This encoding fixes that. Bit 2 of the tag byte indicates whether the segment is the last one in the subpath. If it is set, then the size encompasses all the points in the segment. 
If not, then it is one short, which leaves the offset for the next segment pointing at the last point in this one.
+
+There is a relatively straightforward state machine to convert the usual moveto/lineto representation to this one. In short, the point for the moveto is encoded, a moveto or closepath sets the end bit for the previously encoded segment (if any), and the end bit is also set for the last segment in the path. Certain cases, such as a lone moveto, must be avoided.
+
+### Bit magic
+
+The encoding is carefully designed for fast calculation based on bits, in particular to quickly compute a sum of counts based on all four tag bytes in a u32.
+
+To count whether a path segment is present, compute `(tag | (tag >> 1)) & 1`. Thus, the number of path segments in a 4-byte word is `bitCount((tag | (tag >> 1)) & 0x1010101)`. Also note: `((tag & 3) * 7) & 4` counts the same number of bits and might save one instruction given that `tag & 3` can be reused below.
+
+The number of points (ie the value of the table above) is `(tag & 3) + ((tag >> 2) & 1)`. The value `(tag >> 3) & 1` is 0 for 16 bit encodings and 1 for 32 bit encodings. Thus, `points + (points & (((tag >> 3) & 1) * 7))` is the number of u32 words. All these operations can be performed in parallel on the 4 bytes in a word, justifying the following code:
+
+```glsl
+    uint point_count = (tag & 0x3030303) + ((tag >> 2) & 0x1010101);
+    uint word_count = point_count + (point_count & (((tag >> 3) & 0x1010101) * 15));
+    word_count += word_count >> 8;
+    word_count += word_count >> 16;
+    word_count &= 0xff;
+```
+
+One possible optimization to explore is packing multiple tags into a byte by or'ing together the flags. This would add a small amount of complexity into the interpretation (mostly in pathseg), and increase utilization a bit. 
+ +[#119]: https://github.com/linebender/piet-gpu/issues/119 diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index edf6535..7b93372 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -813,8 +813,8 @@ impl Buffer { ) -> Result, Error> { let offset = match range.start_bound() { Bound::Unbounded => 0, - Bound::Excluded(&s) => s.try_into()?, - Bound::Included(_) => unreachable!(), + Bound::Excluded(_) => unreachable!(), + Bound::Included(&s) => s.try_into()?, }; let end = match range.end_bound() { Bound::Unbounded => self.size(), diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv index 870abe46716f3ff7fd830656bd0edcdd8713d629..3bc136540453378bcdf34013c80261bf36db7341 100644 GIT binary patch delta 278 zcmZ3H@FQ`95qmuw!*&J+1}-2@O3JT@Pt3{5PiA0XV_*e~a|6Zm;!BDW^NN9D>|ik- zpcq&rK0ht3IJE@Gh0E~**#(Iu86}D7a9K764zNBxpj=L7UTS$}N=XJxRu)M!KU8yZ zDopd{ICdWvZ-`Ss)+z(B0T6pYod>cGBxZsn2C@buW`QIIG8rUhvw10ZBr~JXmESs}64u7X<)79TKYm diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv index a8b1fd9ee71a1ce02aa2b9ba68112df551511a47..c02f92cda6b66ebda10e3ac0b5941266de458e2d 100644 GIT binary patch delta 278 zcmX?+Fe7P$5qmuw!*&J+1}-2@O3JT@Pt3{5PiA0XV_*e~a|6Zm;!BDW^NN9D>|ik- zpcq&rK0ht3IJE@Gh0E~**#(Iu86}D7a9K764zNBxpj=L7UTS$}N=XJxRu)M!KU8yZ zDopd{ICdWvZ-`Ss)+z(B0T6pYod>cGBxZsn2C@buW`QIIG8rUhvw10ZBr~JX(EtDd delta 63 zcmbP{bRuDc5j#68!*&J+2Cm70?7~9aKvqdnVqS54ep*^_Y6%0w=5y>mESs}63@h#~+koDq!x diff --git a/piet-gpu/shader/bbox_clear.comp b/piet-gpu/shader/bbox_clear.comp new file mode 100644 index 0000000..4ac5062 --- /dev/null +++ b/piet-gpu/shader/bbox_clear.comp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Clear path bbox to prepare for atomic min/max. 
+ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +void main() { + uint ix = gl_GlobalInvocationID.x; + if (ix < conf.n_elements) { + uint out_ix = (conf.bbox_alloc.offset >> 2) + 4 * ix; + memory[out_ix] = 0xffff; + memory[out_ix + 1] = 0xffff; + memory[out_ix + 2] = 0; + memory[out_ix + 3] = 0; + } +} diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index 669585af39609a604db0e9144ba0a37db4d1c4b9..7c5c316e8a8d29542cec4eaa6ab830e45f3abe92 100644 GIT binary patch delta 275 zcmaD;yQh8w6K6de!!HH~1}-2@O3JT@Pt3{5PiA0XV_*e~a|6Zm;!BDW^NN9D>|ik- zpcq&rK0ht3IJE@Gh0E~**#(Iu86}D7a9K764zNBxpj=L7UTS$}N=XJxRu)M!KU8yZ zDopccJ!!HH~2Cm79oWerfKvqdnVqS54ep*^_Y6%0w=6ReyST;Z7+rrGq TH+i;@?&LE<9Gf2q-O&L6Wq=gH diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 777a77f..6f225d9 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -57,3 +57,12 @@ build gen/transform_leaf.spv: glsl transform_leaf.comp | scene.h tile.h setup.h build gen/transform_leaf.hlsl: hlsl gen/transform_leaf.spv build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl build gen/transform_leaf.msl: msl gen/transform_leaf.spv + +build gen/pathtag_reduce.spv: glsl pathtag_reduce.comp | pathtag.h setup.h mem.h + +build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h + flags = -DROOT + +build gen/bbox_clear.spv: glsl bbox_clear.comp | setup.h mem.h + +build gen/pathseg.spv: glsl pathseg.comp | tile.h pathseg.h pathtag.h setup.h mem.h diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index 4167197dcb99a9c41b8676d486a5596374177471..a0ad82ad99e94a7b95aa555339267e1ae40aa9f1 100644 GIT binary patch delta 272 zcmbR8nfb{V<_(Y3>)9CYu`n=jF)%PBCFNJdC+6hjCo?dxF|dNgxq;$&@g+rxdBs36 
zcCZ)^Pz)>*pP!aioLU0p!sU2@?1IFSjFQB3xGWn32Us5;P%bAkFSR@~r6dC;D~qI= zAF8=H6{dMJpGE|qH^eC*Yn6f60Ej)H&I4Ho5;H*(16czSvp^C9nG6!M*_>!*%f%=( WdA@`04|LMgO=cR@J1=}7pMjTh zNeup9PoJ3ziIr69od2v<|LHCa&Y$tT0dLqF@;1Egn(zjNc;1Lt&d%!gr|rc~+E`p# zeuBrSSI8Xk8Iw;*XHmOV3u6o;+T+5(CIAMh(A@>aO7)Hf6jiml?N@hps@w1V`oV$U zNxw#cABAz?H^O#MPs5`CzahsjjO)>1!*wXh!>wOZjUm+tzj*3xDC=i0@~cs@TZ^{h zqoiI-!zA8%Cm&+zaWjgNx=wDB^(;AQHakJu^_i6ZHi?_z9zJCcofZ2*6tsdk?euZ( z8v6&ebiWhGUEXIhfA0M`Y^3{s82f1$1zAkj#*6L;;og4QH(DL;hPz+uHfcETzN=I@ z-0Q0>{2;AIX0ZAq+RM8#Q!j@1s>N|)2CH#ac-pSTo!nsfu<+g8 z_deIlVK3T$lk>9CQ(D{e%~}>`yzi(sv(SArv-V@h@>socFSaE9E(q^(_2#fC8BW1i zb#chaj7%YDF#|VrCWm_bY-Wny1L+@wR?_}Td^ywcZ3V4fEyLj(JZQI*wq)uN58uMG z9MoHo&Z?>&@5H{Y{XzaVbMw+!U^Zi?G3tRa7v@JTFnBbqkYl{RVZ`7!?qJLj*hL3p zhQKa681n;WJ<}icz$)5SgP8%lZB69gKMZ zd!1p$9_Mxkn*n@SRdE7V$mC)+4>vJr!4dN@r_Fn#-`B~z*t^3q%kt4L*lB)j7VzV} zs0(IwxxZT%ow_XQuIuEju4rG-RKfL}qKQYIk`H}Vv{@DDaK#3nmJTP##9$xRIiL-` zmh032dng@WxM1NA2iU-8m4$ij%9GigBD)*=|F-LAGCARbnW7%E$lbu`4;?=2zLz9d zwCRa{(WkXp#KSqhVm{XoJx|@}{~Z|D3kvuSj-l+|2FAszKlUcfjD)`nV&Re!u5FcO}C! 
z__rLpCz)7m+?h85vmB2kgV~)QONMLeU?T<$zAc9T7%^7knPf0{vpmlwTORD>0dw>4 z?{o94NC#tfEYAzcmIpg|z+6sOrMr1vNe5%*EYF%`%Y&UfVC4BNUW^~kznbJ#?SDlN B)6W0^ literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv new file mode 100644 index 0000000000000000000000000000000000000000..67fc57fb3685022e4f00cd7831fc097c86dc4a24 GIT binary patch literal 33908 zcmb822b5h^`MxjAOcHwU5R%YB@1X}m3!TuLFiet3GBBA5$s|I}a=ETyW1CKHfIfHAA>Fw>GKs|^|e_ZS-qzER0`Q)t7vcdHPwXu{8Yn z={*D8pW08et7c!$N7>hs^Yqo$SPDL?tGAio3gGG8T@ydGTkPhu9Cg{}^6}ae6 zKd@`^-u-?3Jrlu$_)pDaV0u^IjLx1D#Dkb6b=S!?IKF4%z?3<*RvXuDTGzmo8Qqhc z7U$MB?hJLs^y8YucWypI8mpiWbWQG@+DxdkXJU8XK+nL5Ba!KC`NnG06Z)r)cN@(0 z%L2_A35!(z+jyI51wY-P*?(JOICWQrM^KM%;z5nIsHb;NoH?Nxc4W2w9gQ{7&Exp~ z=}sRx4r;7TeV}tUsek&^1G*>g-#rPnqp9mNcf5ld8`ONYdv>UMOzj$gxBQ&@=9=_& z4Y2j;ammIe)MczrH$|pn?tabi=gq3QmF=gk;YplvZw#JJ&W>d>@Hp0PrCO8ld z$D-~rcPt%^QSdpwV8@~6F^vG>j$;}@!khbsfrK~r%ck;HpV=#|{WGZ9r`MNx_h-v@ z5ITeHKb8sVXsq1W7CtAZ&gPl;sk1Z3k8w7qrkb@m`uejbZ8A@3?J`s^2gGY@Y(<@U z+#1}P$F_#{8BL###%OB$thPpToN69(=W~_DF7V2)vpN6&5BazHY-Z0M{r?+)`HvxH z^4}5M%D=s_6L?Z@*W}h1+8R5k$TyCB3 zS(;XFO!a2vI@HwWeD6UY$#+j{O**Ib_w)_qd~R#( zkIgHdeHh!he*H81CU#EjKfZ6|q}IKmt#LTI6X1Dt)Eu9g(^?*_ z{c_|SpZ3NC@U*V!T~nKF+8dtZC0F0Oduspm6FoNuHRK-Lp*pi~qC3~>-P5P{PY1g$ z`g^XSH~VnC`{5qmD`rub@yvZLZ)=EIumjCAIe^|{vVjkYsSOeZV z@3^LbGjBW3dzA#2W9e@!lHmtMUFCpH<^0 z)cA=tenyR-RpV#Z`0N@#ug1@>@r!Ew;u^oC#;>UHt7`n}8o#N=Z?5rMYW%Jmzo*9U zt?>tH{Gl3uxW=ET@h5Bisd;!?;~9ACdePR{nu}2Dy4ca!7JR^j?!N9FXU=)HQ1h6f zPVMz(bft-R0&@v$>id8>pPP7paI4?8#-WAJba3lD8Pfb5GP$>NY;XVguHN1IX7x|# z8tCco+ui$_*RqrFo%eaJy>T*H^HB;PucLC0Tys6oPeI$Mzi(2{X-=CV*t+`q`eR6p1<+g1VXffN7ex2|Ki9vb zImU(1$=~PDIpg*-RI887=-fG31e-azCb16UGoSP2XSBJWMPi-XcCo6hpGRVKjXz7< z)>^eToolM*F@ZI-cAuSAX}0+ym`{vdy`2Ni+pcSVE?Rl7`3ii-6hD^Oen|84E_-b+ zo7J{jv;WKR?`g(rZ+sWrKQN`+d$oPvQtSJ+d3amnUbva~Y;hl$IEVC4KXxq7A=BK< z=KKKqfwj+eY98}GI}dL9x-aL>r)NxS&)UXj*&AA%Gj;c+dOq8!xgX{eg;EdO~E`ChPT$Iqp>vDJvjcv=0~q)A1k1_w+2p}=4A?v`?G!U 
z^kOb+iswAfwKdk3^Q59}y_#(kIZq|pHm}*Xk@EziZS*|0_QseR-?hf~squs7;T?@b z;d9o!IfjnLk!a1i@$+jnH%HCWuDvm_#?OK0dEY$$37`8pti7=qo1%ICRQ)XjpWtW8 zD(2eoNj<&2eDI&xy5_ex)<>H=f9;J;YJ9UAA2ko}Xlw_c_qnsZv2#=N)B23&d>#Pr znbF)G@U})Dyle90CO%GEmG4R5GM7{4;T?^0;GFw2CQR=dm@vh)YHOTVw7(2op5NEZ z)4siNZH?d3nL8pR`;*DJNskJGxTdA}OV8rYVt2ySnF))@iz2p`^}O}-6{=f-VsKAUJ>o?^^x z?VKZd*uPr(_ZekG%ca?$-2SPW+;^bf|5m2hU!Ehp%vH?~ zed%wJO7nwMH1GHNZCPmEu8)UnrqA^7?FIr3QXe^6n|Ivriqidj6FXrn0)NcG$sf{l; z?yw50TZZGaMx`0g@vKdq@vH;3z1p(a)}eNq@7kd4yjGF5_JZ_uPs7TCn>i@ov}75q>AUH9xI)v zF~nuj{%NK33${=1qdpH9?^J63H8t~*doNYXyq^wPq~i8-o<_>b_@7Vhxu<4-U#4~s zsoS62&(~@hulMGR_u|TCAI7u)E75$v8-MTJ`gLJ}Nh-hpGA3 z)a+C4y;M#6;}y1T#{4?CJony$FN%NcAHcJB&EsSE@`b&P1uw4+L*U*kjMoXqy}54o z=N@_)?Zz8!814S_9C{m`eH)*5(Kxnu-WOp)-N&A@_PHqBczMCnz?=VX4 zJB)DS`wpYzZ`Sxb1z!h$-(AFid${i{!tKxZ7A5!HMag|{5$^TYcNXEc_q|29cHdis zYxmtnxOU%NglqTxMYwj~UxaJ-9Y)E=)VS|3O1tkcO71(1aGxQ3hY{}m#rGHC-Y0x_ z5$^o??jqd#kMAzR&DVDq;hqn^y9oCg$#)mwj>mTw;l^9P;I5DFEn?T-_ZH#y=X;BA z?Y_4tx$i7W?mLU{-SGFFMY!#KZxOEDcNZo1-9@w)Z_nxc z_siMf)2Y+XnP6=XRQ;R7*l)t`vSOW zBTuiL^4MmB)f|J@P5D;D@V@&c@LJUB+P%)IWu7hsd(STC=_0tg{oBV^!M3xF_w-9B z-uLyhPq{uHaLxZ3wby>@cWM6(>Tgm!v|n1;t&{)dV6X4D(!ckfE5K?#Lw-mf@~>0u z%f7|0p!>{g@tGpyxe8n#&(&z^iT5qA@g5?c`{3JP?_=sWS237~{3n9_FL% zI}|nZ5hwn2;ClS)(bN>}PvzYURFk57l$tivAq;53%`OH{0F| zmS8~)8{^Gk?ryzUuCu3y?b3$_in_c?8T1-H$WRU5f}Y4dBaZMcQZY4aT1 zHrG~d=ff*-$KX7@Of9$1i>d!W@!YUo{Qp?#`o9X6=N$eM*mKx^wCQ8t>dr~>w$8oh z&*0N2?upl_bDsSLtS$Glzk=0V)9k0e!R_DqPrEnZ>N!9E4z{hj`MgH0miYewC;nUD z^z%=!w#0uMtX9VV7u@*n&9wVBTs`ss1GcTY@!zCY%NX7T+lTuueY^)(*Z&=A`IyGw z=JUaa;N<%O*m+C4|H8GW?~lOtUHbeFuAb+SPrzy)Q<5`b8}_Ab9Q$rmA$n*+;-Z`-_L1k>0?E(+N0IAbtSNx@7whEbDUcIR|c!yh`-M^ ztAN#VO<5J};aIc{rHpHDjYXV(hJnkt!{KV?p1w!G)zkOt;PkC+HOdYY=SiHt*8+Pk zn!lejogUw4_yPwP zuHCg(OMkn8%eCGeu9mgl6YSxfYTJXN=9t9kXD@J>*WPfo?7@A&YRPL~uz9JQmusz- z{>FjJwcZb|mbE?r>|tKo_NSuit+7RuAhB)U+V^YKhri5EcbcFpG!;vZ$fQ*ZTcKZt#1Fm zGnx!m%RSGZ6HI}t@5?>g>sb$2&Ei3`W_#aR9D}w7#p|azITy#m)z7c4hrM9?P*2>c 
z;CkFXH1)*o2dh~aBH7Q4I}Ku7&!faW4z8ZK)4}$kenyqc46ylVGZ){J4S>xh&q0oP zCR{!3W`S+1?mQY>E#o;JTp!N~XzCfyiC{I0M>&rtp{-3R=ka8?dd71K*gn(~_p{)7 z+*8ri6ZbT*nx#IEr=uCSoX0cZ>WOc?*hU}6dk(cWbN2K2Mc@@EHpRQ@n19vk$%ouJ41dqp4>fd;_dz@i1T8dmeuiZ4*kl|1O2AXCGV! zwh#5hy&PPRdj*<$;$8_>v()$BRcOX7_utiU^~C)a*gn*=kG>5yA8qF1{<{WjF6I9F z4qQF$t_9mx-SsfGn(N`ZeHZK+CHL#VY8DUYEc@+xG}~uy-T>C_-n5-|b8~)g1lzCk zn|*T=+%_KOqK{hId=G3J=O%4#h9{0~^ifOxw}5Sv{BH%TSu*#@|2DKT|J&i_@A+Xn z?aAf)VEc7nCYL+lw(&3*ebmzC2VmQH?@pUL;fZ4#ebmg!^WiSA=R=+u?*^;oIpSWh zhv$Q~dnhkZJRiiy_P+K*aD89iho+vn`4L#n;^Ekn`~7I!P|EY+$8h!R%b$SlLp^bS z3a-a}08Kq{9|WsevL=c95Snqz^WkB*dg49;wh#3)7_ZmON5ST!&AvPz9s`?8c|QCM zuAX*}gKewsdKg_VoV;aM}MK;cHOR|DVA1{$E2&|F*Y{ ze)ex)+RL#zX89%5j{SA&#Q!tcHW|A-w!eU#kMO^O%iR73ujlp#T5_|!ZS>37wWq(o zgY7T;O|bDY_P40h&yUa?^Ntk#((nJk^?pA_Q%}F2fYmJaK{cmehBl{P{W9GPy~H+Zxi>n8%fdZBTyO6^%fbDAL1M`D$s8{a_FRtt3UK|? zzg(aAuL$;f6#tdr`sZgPa(&{zGT8Go{;R*ecDX|r=2;<^-GSE!DWt9;A&pel4B2C-5k^I7`X3n6Z=@Wn!d)-rdIAB_mO<@ zYCl=0zg~RH>+@8&TGmjTn(Y^<+Al`!IQ?E&KQ`Ar^D`E08eH9c`l#jN9al%4bmQ%f7{Fdm`Aja&tTxY;NXt z616;I`YhNn*(Uy{g4Ojug<9^rX!|-1ycf0oXv?_O6aRE@8UGBp@e}h*urc+sy<8vX z!g+B#*6Hsou>EBpoDEjXx@l7@=h}HOp7--Ffa~8$_#&Em#(XX~InD-uiE<7_KXa7p zBR-G%ROY~V=Y!4H+O_=(^#v3U?H5*d>x}0juzh50zY12%cfRCbrr4K#i(jE{KciZ% zWK0>`#n|d&y97-=@xBH&-b2+r{Oe%%ruxm*{`v;kcG~<#!*mm0N+co$W#Wm2Db^3OtyAGFu<*|JStfuW6YI$tm z1-ri5uBDb|PhSUiPuoVDKKiP=h8ef@r>(>F*y{6e1DbkrxDjl>8Q)D{$ETk$<@y-Y zJ>*+Isg?WEJyg#*{oRVq?{yga7O*_urT9KL&x5yvbImi4 zJK);$obdy&TKQdyJK^@3XN zKLl3K804-`;#zBanA$wGJpwi-Yp*GfQU8qMq5bj7Zk@G!0&E|-ru-bNmivYLQHp)p zw>WK|0^2q@{{rlMo7dCSa>wXB@R#7lD7K0JvtV`opP`oL-1-&RbIX3T>GLGDy7)Qj z=PAkMH(+zgd*k1N)$-o>1+a%}sO@(YHOC~*8omgwui@{})YJFNV6~Sh&TIOyey1_G zdEb2nY#irHe`C0p{y?2P^m&z{W**{<0UtrFZa;o+ z{T;Bsf9bWz7`9W>H?iLZ8#~vb_rPk-aq{^9o_y4k&xh#R?Du_YdGh%$*nHHJ&qv_o zV;tM5>6?812QKsZ7_R0TBp-%6Cm;3XGatG(`~8GA^5io=_$bWk$)^o1`54D`YWgOh zc5sCIXMXb8*Yz5LuFZb^y#{&mSpaN4>d9w8aPl#Z?bP&5J_~`%d=`eQ zd2Q2Qo0>S+6VJKqJ8OB)32XcIxq1=W*~W8vF|dc{k@}((HTSMKV_zJcv6pMH1iH5P 
zEm`=TO21jNrO>tA%fgw%(qP-FUsvU|3|QT~mZSDCFZE?9YUU+Qyyd~kt6Y;6(6z;H z#lo-L+bf}KOI|C3ZL9A5C8l+*kE?*q)A`Zge9U=O>g24?P>Py4i?g=Fz^EVQ*(LXt9pP$uKG&vZ`#j@4hW0;xBC;jL-zD{W;P2ylEJWFYaSx$h zxWYa+EJ88%V$>f~A5ES1JkB@&E(cZfcj(i83AotyOBQ%3uzf8ea%fxzjE%qKKM6BJeH+o?v|^tbFn%-1Y%IbSE# z_~{kT{2dRsPxG*?Jh}UK1dXB1e&m_IlfcfOZM0?nP6lgp{yt0X;ryweLQykkvHRuJ zimNB?8DQgP&dvm9&a}nvEU-57JDpk{+u2|>V~DdSR|5OHMII|t%w;IGe&v(o?VJO-ez9K!Hui;-#QrK+Jw6wMja~X&0#{G$uYql+o_1dc8{2lqmg^V$H^9a| zj*{5l1gppAQn0a0pUdFtiG4ZPcIs(&1=!fOGqzm6*slT`yZpJ^)o}Ird<$&s(&yW7 z^~AmgY&-R|`wrOHwllU|zu3PEHuklYjQcvUdVH=28@u$m0j{3dH-c@ao_05Zjcq$) z%k?vk`}%tn?e1l9xtG_1d#pimFMIu1lj8mzNu9q3a4XpToBe!8!EN(&jXziMjP*9S z{pK^=_rYqnQ_R`^)#CpHu-cq=GhnqlDe2=L@LiOp)cvmXAnG4hTIT9L zxVdMK$n{CwAA!p}e+*ZWT3rSS>N00vlsC#Wj-alNi4MyGG&9 zfQ^}VZodSpIoDaUUxC%1rDRY48f;(MjN@KXOTW*9)xv*Mt`P;QNKme z?zKdmYsm)SO(-54Qu17~QH9r`-k9QbWmD>0SKbB>F8DhI9|C^A;0u61tnrTuu0M}n z^Yk}=jkgzEfB)7^8Q;HUUUL7IdAR=mE%R{m@o$-z+`nZWuD^fFJY0YOmU+qjTjt^V z4=cF-BMQDcc#Vo@z5a!d^XfU8b^bS4J^S%LU^UM}*VH*zi~qY|wd|w!z-p&b^fA8= zD6XaLT@$(3_0Zqvzd_XhrK~{FN1Hybse0o54_wCk1g`dR$5#Xb`3n#aC2TrKu>z^+m3>w?u{Uk~h@#=bsSE#INr0BlVAwXJr~Kfmv> z5yf*d_ei<^>3d_a^OwFi0jtHnDcJGGz8P38-#OeIY#ViBj-uXzV$2H}v-2g_&py2l zZ&}&BMvHTeo*&z`6pt+_xkhhQ;mxVHuJ9Jr+fck-Z`Z`FzZ10${kfmKetW&zx!_)} zcCGQT1=oL{g6lu7#`i0@{)ZM^|HEtih=S`szTo;#tnu!G>)%^&{rhUXzu@}MD!Be9 z)VTi-g>rt*D7gM-)%e*3xBuA%*Z;g4KfmDmUsQ1YFRt-R3a-EZ4~68X|5Y`9b;0$& zso?tGT;sPCT>rZYuD|~eg)-i~70>-^Tey3!ynk(nrk?xPXt3IG__^QRgH4S+=sjn9 zxLWRCJAl=^f92iX7=5ZZxvzOG*%4iv=bwG;1XlB0GVfi$o*%|_ugk^mbK{8Jx$Xg1^WK+S_tXdOoaCNpu6v4@TFPTn_=OCD+5i$<=dQ zZhoHM#t|PuZM)?98L+wPub=khdL+2a^(eU7qLk#?sZW(_Ij={fYfG+OV725r0i0aN zgXQMuwZ%AMuPe4ou9Lv#s=t2Plj~$~nd=m|+Om}7+M^Hc-Sg$V9)qqexgHBvORiJF z$<=F{-2D2ejU)D&X1nA%4Q#IZ>!&@r9tSRSoeozUN=dFW^r>aU;nbiz)n7mD$@TN#GS_q9YIowFT)&`Cm1{Y#UqsiI zT+aooCD$*3lk04--2Bd?HjemwYTG5(FN4ihfBm#4*RO!fTrY&HT~PUWo%|{|F)sqk zjdL-z{<+?N4eXfwe3|~gj;5~vCDiid^$oCj*{6MJPrGk|%e*dwt6f_8nAeryZfd~-io&1-$u<;QSy&-WpJ0#|z-?O{F#_^k3%xO&#aaXbhgL+#u; 
zHo4ew8At4xY@0cM1nivaub=kh|0uYg|6^$CdEWXNSZ!H~e{X0r`g$C!p1tsM@R%wu z=TL4g&Y!;ejl%vk*mct8T*{N@FTj3&vAylImwRI@w!J7GyHng7yHW2^;61_afxW3+ zbM4R2k8|gkHw6EZqF>f*J@B)Y9-m*qZKsd9%Js>-{TggspNq1l&!MU3{oC_kHPo^!q!ohd(RR_FIaYV-V*YeF2=D2Z4>bF6~|fYm49S3qNDT?~+WXI=Z&x{AaLj)y+9Et&{U#z~L2iIPu!S z$*bI}9q8KPH>mJ)t&-PZbZyCN2-vpj&Shd+XD$~2o2T=mzxn8!d9#l1g4*D&h3)Le z@hwd4;rR4fh@$5B#M!fpfHS`A>qXJj^EP?gqqWSakT9Dv|j>EJ?)nSt64m3 zpWokK3QfKS#W?zAPL~F!Z`V4%@45`S?L52&>!+4>%YxI+wMe_=&=b#g`l)3umIv48 zVg)qyjAccznuUM#GPnIo2#K4ySQ$+{?N?YJIr6{id%C z;EClr`pmT~hfwD;!9KJr=WzcTKeXZ*!@h9)$ymq1GgfW! z+Yj8dF*ak!V>mi>7kSk32u=Vlu9L2&g4d1jLz3|7;pznY^%;OfqwdCL>yFt9Pw z-{D}j^mhbUt?chJaCQ50+;aPKJ&pt$Bm5|^K3R`Wus-Uphdk@yxpf4^<1mVQ+WmVt z#r60Mb^0AoJN5h>hzW2m3C*8JXgh=XFrRL)wuuz4t!k6N`g(2M9IVe&@;Cluuy%73 zm$@B<-J{IyNVvImQYW`#X{Vmtdf~?N{M44*rh>JVx%E}P-UE*T>yzC2!P?DDY;OMt DZWAtk literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv new file mode 100644 index 0000000000000000000000000000000000000000..44cd9384713fe63c71a37771ec7769871840891e GIT binary patch literal 7752 zcmai%36zy(8OJ}EI|G6Xh$Je~AXcP6QmA+k6_-W|B2%-hyxzHUnYnrA-to>I#B->| zBDKu+T}xUiE7NSJGRd;dHrtJ&eX-KY+)C~D`|kJ5eB7g_7oYR~pZ~MH%k#Z+?V7r1 zT9QmjW+c;-?kPz+W+zi&l4NGmlg@qX&Rn-_v^B77#nDIUuy4|p`suTuam)o|xKyoc zcp0(-xf_{94sElMal{yl~k&)tP>*89aSgY16#er(G zQf^f@RrH&l^Q+d&wXuP`LpL7zb#${bHkX`L8eHF~H>v~R9{$NWTFp{@v{>CNmX&vH z*1f1Y&>EVswPRfGNU1e6S{Y0&Jz0KFLtZlSxQ_5`Y}`0nX(f2iOZJ8@H-`J&TB|fz z98MRuc(m0VE4P+TtaRzv^8A0xSN&fH@^*8kGnh*mp2^&w;-2*Uo0WmFa$0ur*z#5S zcXL0S1M9e`(e!xI9CMh0@z1!^0>%~^JR!L*pHt%<+ zQXL#>6{}&4eBQI}#Zr5b!TU{c$w5!l?|~m_m1|*$`u1M*OO|RPW;(V~z22CNY42q( z_;wGsBcirF(BI$KJn7$^?$Lf={dWDtzU{|$jyTcDGua25YqCGW+QU71Ui%`G-$_^} zWB1Eq{Z_(izr6~+eOD)sZ3KI#a*i^0=xiTfAz&EuCgweRv^#kV+?n=DJ58EL&XM*I zvw7Z@GHrYnxHLEjUr4S7kCd9F;mq$E=4y%y$#r|!3ds!}`i*<&-O1 z9DZt*loTKwi zlegz5?<(HAcFd!_X~uNEX>yPLrWKOEVcRg)N^ipZRChGSTK!(K_i$I60{5hU7dV@9 z8qEv(nvJm$9{~3~g@vxK3^$q=xA&y(7_JOY^pJa%*CbzQHXELsb>-a;e44|$U0IGk 
z^85bGMCK!>;~#IyB6NqGz3u4aOe5zRgrBwK{62&0$~eEP;Ed7Fw>CK62e|`soNt5N zyd3AdAQx+Peb$rfMIzr`8FzS&^BvG{agKBUa!YcYYnAhld*t)mms^qJ{O09W<~YA| zx#Lr=F!3$N9^%{9$I-`~$aST-kbdWfX<&FqS1>O`Ja6?kGkY%Wb1$UdJ?g|6#W$bN zFBkd!{`2|O&F|fre+RQ?atPwy?__pFzTjMsT-?8Yo%b(iUH#(zegM2bTL9-gYFtaKLuQ1 zguWL2Bu3~P&}T71KO4Q55&C)PQy8HSqPtIZ_g6zakI*j!`__d2Ch#N3f%w0K*)_W# z&+nzo?p=OWru$CDdFVTiIP|}SS(|+1-wO6T7LfI_%(fueKYdH}%Zv8MlDuDFbNXIf zj#&TI%-)Th>s7xZ<6{4>LHGUA|H{l}zC+0s?}>SRC)D+OYvw0sUUlD!=mX=oA@2PE z*zL@*|IdKOk@>`ZmU$W?XWq{-dp7dsRrmLTT%5OWka_jLDYKa)@_z%IU&lAmrxW1! zU*MYjlVOSPtG-{`=-dnntW4YP4{FrbD39zwHEMAPRX#ky!odx zn@gMd)O{=DmJn&L`Br#F+cG(ZT-rSshy8g!jDIK1;rF}+@mxZ`v_toIV%Ys2uFC1g zU)`Z!)1hCN(~b9cUp}9|`$9M0%{g6vf5(Mg{f-WOSEjFFSC@caj`*#OBdgh?XZQ+4 zek(reugtiu%qMUvo`KvNtoHV65&Pda&AyL3uLEn_ndNysSZ)y#K5qbU(bQMrI@O+PyzHv3Io% zd4QgH53{}Dd@twrKIZo$4(%Uc*6tj2KL|F*_0-zV{2@fnbA6B;>hDF&Wvn=2KMZ!? zp_zoBo9n@zjlBMz*C!Cq$XqvM zy7)%0{>RZH`h61F8_`FbKE}y=UY?zMc8;}t3Y_2Lr_tr+BgSi!%io7*7rp%j@D5}g ziJV^q%g6e^1a=+9+{FB4#J5pDYpClZ_FEfAthbZd^*Ve1w=jPdacKVjlaDq17@S|zPtdm^^0B6$f{oJ_ z_wqBa_o6NC?&n~6@8))9^|1W{?7eEcgIPUnzXa>AZ5Ok8^y#m__Ng)2^f8aT_Zs_l zeyqFMjco@qj>Nuy4c?vU)0lsQm^b$KTd@1n&zkD`Sku1pteqp@@4%j8oZ0Wea`O@6 zwaMjs^%nHFt3QA{*Z4;``D^Ky?aY5dL*w9Y`8q$uj#eW6PO*pVQ^#J&vBXVho{_^d!QRb2%>G`{J_j+@+WRs;4l!mf5_ffA z#)r>5baUl>4nmiY{yP|KoP5L`0=AxU?n_<2upbJxws#V>7l7r%=kZ``=Y4w7<)ijO zuyMUe#2p50*DvgcgRSk|M(rnn<-_NRU~A`no`fzRwHJeplaIJ3gRLEXqpn}r zj|5x$2qbDR0n3NaQn0o2KFiSMqxMt4#>q$AQDAEu=f2hT3;R>S*0$H8_S3-f;j;p4 z?Yz&?=<-qf7_f2j5qB)u+R?}A`dP=GTZw3Yn4T2pdwDjxV?V@Rwl`-X_V518@q2p} z*j&-iC+2ix&h5}knI3yR9^JgRX79@hV7b+Z=VgAm@IMJG7r%v12FtBMBFCv<`BRYF z*y{nzry=rN(0iDlo^f$jedw+|dPH5HsCzm%zs_f%%dJJC#xue4QDYrgE^4d?TO+;! 
zb$z17v%vWp&qkNqfJBWm!17V!IbgY{aVFRr(QoSdM2)k+`5MnfmpdDY8s~uJqsH^V za#7>?U~9xXqOMQWcmdcu3jKv(YsUX~ya+7kS;w6f!Sd%J(bMOH&85v=u$Rt9B5yxf zF7$GTK7gLDTS1reY@+T)u)KAnr-s1tgGl&Q!RFT%>w7WSHE45v_O@KC=K`=??0Xn2 zS3|<54%R1fHo$WFMozi#9|7lcHqqrS%=wIFK9RG9E~jtgl#85WU~|S^F9OSLLgLOh ZgT3?Rh&j~tGl$>*#fWy_60vW|{{UWEO?dzS literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathtag_root.spv b/piet-gpu/shader/gen/pathtag_root.spv new file mode 100644 index 0000000000000000000000000000000000000000..88e20b907c6f45e137a705f3462f2a62d9151c8c GIT binary patch literal 5852 zcmai%iFaL96^Cz|p(%w9NNthQrc`T{rcy0K?E-CW4Us@aa2}qoFE7c{m%NyF(^7GO zf&*1?7Qq3mbFGL2&QlQwtbjAlqWCv}x|S}#@8<4s`W7s2y1Re-xA#8#oPGAaS?QX; zZb6pK%TCDpvVUZ=&$4VjOqMOmdJ8=`eA)1(c0RIc%h_icu{i500*zUcEy#M%-N;0> zF{xuaau8WczWg$TC5yGp>EZ`7y91+}%7ep|!QHzDhj&$KQ&W|8zQ0+oG#iuk%1C3n zUdtPM>&EpZagE7Zb7mwB=piEALl3*+VAv5Q-it;PtrmmfKwe7ZW>t~9O?%gVbq z>+Wle>Z zyc}IC@daqR;(oV-$C{PFW@}HiIW)PqRjcNW*5uI6BDOmlLN_6sh24`q2R*k-u>F?v z>Fl@sqxD%1(L;8!8)tQ}?jiB?2DtE6|MscV`ar zRCT&KQQEIL&hE+%U{}Y+$m_|j1GmR3?b>uTuZ_?4?jRh!8@4?itt)E13xCu)1a9Ty z^=Y(od>Eb9K60GhmEDd#-P%{Ia}+$gPFHppIQDz&xSa0n?!+FXhIhLNS%qB4O7_ub z-K?79ydyb#9C6-_TwlU@FLEa)oH=q!5o=zA?nO>PT+93MF~(<*JCJiNA_J{A2KCdhrRWi{6mFK12b_ai>5%lBTcQ_S;xei!=5vyNC<%icYw zSYA1^T)X(zVQ(Z`|H+74r?{8L!uKQd5c9QHAU@VyiFOV57XCHhG=4p{XAA!T*z<p(@ zhkdxiKGI?Pt%`itJDS+;$8T2nwU2e!ez){*=hQpU&nocj`*c2MwtqWI{#s(RpIvg! zW@5o3+B zzP53Z^8&CrjKW;b3&G~BLCn!ME^=N3Hph90IxhyBvmP-=+qlSi3D_LxCURa1HplOT zIoifW&db2&ICGKnaIYe254wi<=pRArSI1q{!LD%| zy&FZ3A#&c`zga`Oinx}!;>aBbn;Ukcv=5Yb^D1mP_ZKm*20IU7H^HuPRhcsZHb&m_ zd+#m8dv`taoX06d-X8c)X~#TGgJYgt&m8Bejp&PB=Ou@l>-7xScQR&2+qlTt3wE7! 
zcJ^VL6SJdjT;yB}VSoIj;qqlg`eS*yhCSXd4$fuLGNt&d%$x&57C3 zHf}5Lo_%}+vIMb*#zoKH2)5_mu|3u{);V&foCS5<<5l4F9y!8*DwT<6_ej9QSS%$P%())@PUJeoIM>uRM(lUiwcOvk z(VkE3d-PuP`w$=f??>xbN9_-QT_fJ34}#@xKzxt1--Ec8x#IibZ$v9MA#t`3fjiIk zVL17y_Ytu5?kwM%kAf?R{H<`l%QqwD>3fJXehls7e)Qdf$hjYJ)c-iRv;HUG??rzZ zv5)cn`%1~JL7T5nE}ctfsB_O-S#JUSSCNH?xwn^g-0Rn{JJ0iVIQb*x%zOhZw+wM+ zzJ>O2X7qg%k@FnlSo_=H&b7Y-Cm-|hU9g;U68-uf*na7Yo_-%JA7{|^{83k}?bnE&p47gxccOoY_~`!;TE9Aa|6{Og#5?;Fu-q=h^J@P9aV>Mjk^57yxv}S;f$g{Z zI)>KvjEm6sAVY{b5r1#V8-F)gJ7)Li;3bIb=riUnw0zY6MZrI8{}S8%#2oz!Y>a%= z{59Bh;`iilz?UE^5aZ+b$Zx^M`gn%>(7!{BH`e`V8y7jh2b(hwiJU)x&52poHZF4h z2sURv5;=bYo3kFVj<#`;^JlO*T}b5o1#Hd$Vve?Pk@HuuIo(L)+z&Qq3u2D8an6c! z<9(~6-+u$A{rx+(T%1jxTsn)+&41{~d5C{Y_`ebw(=l%Pfh@8v* zoTHNxyclf$QnY`2=wFUl!!xc#pNg2{oy9pGSMm{a8n$btF{`lUW0p<_no{}U L5dF@w*jfG`E6hro literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil index bc4f9410cb194343136a58f29bf2f62a21114b42..3864dbe3c0c09471f3931d11f8ea656c68ef46a9 100644 GIT binary patch delta 58 zcmV-A0LA~HETAkDL|8&Y+EF5BxhcvQQHww^vPO1_u@o!_5EnA|qmM*WwAdd_g`?^M Q3A0cLJq7__vyKju2Y*o&NdN!< delta 58 zcmV-A0LA~HETAkDL|8&YY`rhS%}sT1BJFur8F<85u@o!_5cURyr4Ergc4Y(Y$Zjii Q>a$P?Jq7_-vyKju2dS(UlK=n! 
diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl index 80b5434..c0343f0 100644 --- a/piet-gpu/shader/gen/transform_leaf.hlsl +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -37,8 +37,12 @@ struct Config Alloc pathseg_alloc; Alloc anno_alloc; Alloc trans_alloc; + Alloc bbox_alloc; uint n_trans; uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; }; static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); @@ -144,7 +148,7 @@ void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - TransformRef _285 = { _278.Load(44) + (ix * 24u) }; + TransformRef _285 = { _278.Load(48) + (ix * 24u) }; TransformRef ref = _285; TransformRef param = ref; Transform agg = Transform_read(param); diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl index 6229b25..16c1e13 100644 --- a/piet-gpu/shader/gen/transform_leaf.msl +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -100,8 +100,12 @@ struct Config Alloc_1 pathseg_alloc; Alloc_1 anno_alloc; Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; uint n_trans; uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; }; struct ConfigBuf diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv index ec47a9f94f1f0e8034426f89715ba91137d43fb6..49c9789a002902f70bfc0f1ff7cf784894012230 100644 GIT binary patch delta 281 zcmdlH-;ucC7P~wfgD@il0~Z4WLsC+HMSNmTPJS{20~-S?Se$$ELw4bM9-wSVQDR2Eo7bKQslq9CZW!V@wz;b**xtz?r)bh-fk_@;kLN7m5Z*eL@ zFDuYy5ul5l^YhX&)16Aw7#KE7b9l3OLmUKhiZT!z0I>(uksv34#7vOHKvsjqERe)N m)`7%qHYf5NWnmPWtS+cKIY5wyiH%`$kf0MI&*rIu%5nh2doarY delta 79 zcmeB3+!4Rw7P}5BgD@il0~Z4WLtcDIQDR;(0~-T7Sd1Ge1{R6WPfIIKEn#5TT*i^W cvRRGy01G4EpQRBX^m&K`3Lu@uk;5JiE+wkzZm8S>q}uw}ih Q{j>B3Ck6plvm6dQ1h}ym&Hw-a diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl 
index 09504f6..75e7e3f 100644 --- a/piet-gpu/shader/gen/transform_reduce.hlsl +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -26,8 +26,12 @@ struct Config Alloc pathseg_alloc; Alloc anno_alloc; Alloc trans_alloc; + Alloc bbox_alloc; uint n_trans; uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; }; static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); @@ -81,7 +85,7 @@ Transform combine_monoid(Transform a, Transform b) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - TransformRef _168 = { _161.Load(44) + (ix * 24u) }; + TransformRef _168 = { _161.Load(48) + (ix * 24u) }; TransformRef ref = _168; TransformRef param = ref; Transform agg = Transform_read(param); diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl index 71e9935..aabfaed 100644 --- a/piet-gpu/shader/gen/transform_reduce.msl +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -38,8 +38,12 @@ struct Config Alloc pathseg_alloc; Alloc anno_alloc; Alloc trans_alloc; + Alloc bbox_alloc; uint n_trans; uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; }; struct ConfigBuf diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv index d6f84a21400b2026e7c3fe77e601295dfd814514..451775d3a4f218e355a16c6f1ccb4dcd6325520a 100644 GIT binary patch delta 259 zcmbPW{lI2J7>h9*!#oBC1}-2@O3JT@Pt3{5PiA0XV_*e~a|6Zm;!BDW^NN9D>|imT z$%QP!VK5#qP(?vvNk&OxdVGFbT5)O#NH+&qB_B{OCo?a#JTs*v11<}ahnU0<)mxkj z)4REerGnWTVmA*^RvCy5fY<}-9*}JyF%u*)kS!oF3nVd+$sjSC&0pEiGBXNIKFqB< V`3*M@BirP++|Gf=o!#oBC1}-4Zi!Uik%qwPKV_*l1aRbG`BJufYX~n4}3=Eq;u~aZ` h-orVW!s>e(1BFflN2F)%PBCFNJdC+6hjCo?dxF|dNgxq;$&@g+rxdBs36 zcCZ)^Pz)>*pP!aioLU0p!sU2@?1IFSjFQB3xGWn32Us5;P%bAkFSR@~r6dC;D~qI= zAF8=H6{dOfJ`o>oZ-`Ss)+z(B0T6pYod>cGBxZsn2C@buW`QIIG8rUhvzb*Pk%v)e Va=MQ0$qWo^46I;rZlHKxd`VGaUNKON z9W2H(nNe6c494RHswhY-$tX!okIzp_D^4u|>E-~d?! 
zdW%zGdN&IRdvJL}?B)T=Dg&_r5PLw~1F{VyW`ZOJvIQh&fg}bp86;-2IaKx~8>7%< VdllWu87e%JeN;F$7pMrd002flE8zeD delta 79 zcmca`iK*ct(}rh4I;;!=%nS@%3=9l;@g+rxdBqHD4D4VrZlD-gBtAbatvI!Wfnjr# cum{&>DY++XjC_;-tLRQPQ03U{qRP_(05x_M82|tP diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp new file mode 100644 index 0000000..8653c16 --- /dev/null +++ b/piet-gpu/shader/pathseg.comp @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Processing of the path stream, after the tag scan. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" +#include "pathtag.h" + +#define N_SEQ 4 +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_SEQ) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(binding = 2) readonly buffer SceneBuf { + uint[] scene; +}; + +#include "tile.h" +#include "pathseg.h" + +layout(binding = 3) readonly buffer ParentBuf { + TagMonoid[] parent; +}; + +struct Monoid { + vec4 bbox; + uint flags; +}; + +#define FLAG_RESET_BBOX 1 +#define FLAG_SET_BBOX 2 + +Monoid combine_monoid(Monoid a, Monoid b) { + Monoid c; + c.bbox = b.bbox; + // TODO: I think this should be gated on b & SET_BBOX == false also. + if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { + c.bbox = a.bbox; + } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 && + (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) + { + c.bbox.xy = min(a.bbox.xy, c.bbox.xy); + c.bbox.zw = max(a.bbox.zw, c.bbox.zw); + } + c.flags = (a.flags & FLAG_SET_BBOX) | b.flags; + c.flags |= ((a.flags & FLAG_RESET_BBOX) << 1); + return c; +} + +Monoid monoid_identity() { + return Monoid(vec4(0.0, 0.0, 0.0, 0.0), 0); +} + +// These are not both live at the same time. A very smart shader compiler +// would be able to figure that out, but I suspect many won't. 
+shared TagMonoid sh_tag[WG_SIZE]; +shared Monoid sh_scratch[WG_SIZE]; + +vec2 read_f32_point(uint ix) { + float x = uintBitsToFloat(scene[ix]); + float y = uintBitsToFloat(scene[ix + 1]); + return vec2(x, y); +} + +vec2 read_i16_point(uint ix) { + uint raw = scene[ix]; + float x = float(int(raw << 16) >> 16); + float y = float(int(raw) >> 16); + return vec2(x, y); +} + +// Note: these are 16 bit, which is adequate, but we could use 32 bits. + +// Round down and saturate to minimum integer; add bias +uint round_down(float x) { + return uint(max(0.0, floor(x) + 32768.0)); +} + +// Round up and saturate to maximum integer; add bias +uint round_up(float x) { + return uint(min(65535.0, ceil(x) + 32768.0)); +} + +void main() { + Monoid local[N_SEQ]; + + uint ix = gl_GlobalInvocationID.x * N_SEQ; + + uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)]; + + // Scan the tag monoid + TagMonoid local_tm = reduce_tag(tag_word); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1u << i)) { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + local_tm = combine_tag_monoid(other, local_tm); + } + barrier(); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + barrier(); + // sh_tag is now the partition-wide inclusive scan of the tag monoid. + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0) { + tm = parent[gl_WorkGroupID.x - 1]; + } + if (gl_LocalInvocationID.x > 0) { + tm = combine_tag_monoid(tm, sh_tag[gl_LocalInvocationID.x - 1]); + } + // tm is now the full exclusive scan of the tag monoid. + + // Indices to scene buffer in u32 units. 
+ uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset; + uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + tm.trans_ix * TransformSeg_size); + PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size); + for (uint i = 0; i < N_SEQ; i++) { + // if N_SEQ > 4, need to load tag_word from local if N_SEQ % 4 == 0 + uint tag_byte = tag_word >> (i * 8); + uint seg_type = tag_byte & 3; + if (seg_type != 0) { + // 1 = line, 2 = quad, 3 = cubic + // Unpack path segment from input + vec2 p0; + vec2 p1; + vec2 p2; + vec2 p3; + if ((tag_byte & 8) != 0) { + // 32 bit encoding + p0 = read_f32_point(ps_ix); + p1 = read_f32_point(ps_ix + 2); + if (seg_type >= 2) { + p2 = read_f32_point(ps_ix + 4); + if (seg_type == 3) { + p3 = read_f32_point(ps_ix + 6); + } + } + } else { + // 16 bit encoding + p0 = read_i16_point(ps_ix); + p1 = read_i16_point(ps_ix + 1); + if (seg_type >= 2) { + p2 = read_i16_point(ps_ix + 2); + if (seg_type == 3) { + p3 = read_i16_point(ps_ix + 3); + } + } + } + float linewidth = uintBitsToFloat(scene[lw_ix]); + TransformSeg transform = TransformSeg_read(conf.trans_alloc, trans_ref); + p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate; + p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate; + vec4 bbox = vec4(min(p0, p1), max(p0, p1)); + // Degree-raise and compute bbox + if (seg_type >= 2) { + p2 = transform.mat.xy * p2.x + transform.mat.zw * p2.y + transform.translate; + bbox.xy = min(bbox.xy, p2); + bbox.zw = max(bbox.zw, p2); + if (seg_type == 3) { + p3 = transform.mat.xy * p3.x + transform.mat.zw * p3.y + transform.translate; + bbox.xy = min(bbox.xy, p3); + bbox.zw = max(bbox.zw, p3); + } else { + p3 = p2; + p2 = mix(p1, p2, 1.0 / 3.0); + p1 = mix(p1, p0, 1.0 / 3.0); + } + } else { + p3 = p1; + p2 = mix(p3, p0, 1.0 / 3.0); + p1 = mix(p0, p3, 1.0 / 3.0); + 
} + vec2 stroke = vec2(0.0, 0.0); + if (linewidth >= 0.0) { + // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm + stroke = 0.5 * linewidth * vec2(length(transform.mat.xz), length(transform.mat.yw)); + bbox += vec4(-stroke, stroke); + } + local[i].bbox = bbox; + local[i].flags = 0; + + // Write path segment to output + PathCubic cubic; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + // Not needed, TODO remove from struct + cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth >= 0.0); + PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic); + + ps_ref.offset += PathSeg_size; + uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1); + uint n_words = n_points + (n_points & (((tag_byte >> 3) & 1) * 15)); + ps_ix += n_words; + } else { + local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0); + // These shifts need to be kept in sync with setup.h + uint is_path = (tag_byte >> 4) & 1; + // Relies on the fact that RESET_BBOX == 1 + local[i].flags = is_path; + tm.path_ix += is_path; + trans_ref.offset += ((tag_byte >> 5) & 1) * TransformSeg_size; + lw_ix += (tag_byte >> 6) & 1; + } + } + + // Partition-wide monoid scan for bbox monoid + Monoid agg = local[0]; + for (uint i = 1; i < N_SEQ; i++) { + // Note: this could be fused with the map above, but probably + // a thin performance gain not worth the complexity. + agg = combine_monoid(agg, local[i]); + local[i] = agg; + } + // local is N_SEQ sub-partition inclusive scan of bbox monoid. 
+ sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1u << i)) { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)]; + agg = combine_monoid(other, agg); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + // sh_scratch is the partition-wide inclusive scan of the bbox monoid, + // sampled at the end of the N_SEQ sub-partition. + + barrier(); + uint path_ix = save_path_ix; + uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 4; + // Write bboxes to paths; do atomic min/max if partial + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0) { + row = sh_scratch[gl_LocalInvocationID.x - 1]; + } + for (uint i = 0; i < N_SEQ; i++) { + Monoid m = combine_monoid(row, local[i]); + // m is partition-wide inclusive scan of bbox monoid. + bool do_atomic = false; + if (i == N_SEQ - 1 && gl_LocalInvocationID.x == WG_SIZE - 1) { + // last element + do_atomic = true; + } + if ((m.flags & FLAG_RESET_BBOX) != 0) { + if ((m.flags & FLAG_SET_BBOX) == 0) { + do_atomic = true; + } else { + memory[bbox_out_ix] = round_down(m.bbox.x); + memory[bbox_out_ix + 1] = round_down(m.bbox.y); + memory[bbox_out_ix + 2] = round_up(m.bbox.z); + memory[bbox_out_ix + 3] = round_up(m.bbox.w); + bbox_out_ix += 4; + do_atomic = false; + } + } + if (do_atomic) { + if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) { + // atomic min/max + atomicMin(memory[bbox_out_ix], round_down(m.bbox.x)); + atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y)); + atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z)); + atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w)); + } + bbox_out_ix += 4; + } + } +} diff --git a/piet-gpu/shader/pathtag.h b/piet-gpu/shader/pathtag.h new file mode 100644 index 0000000..c7af0d6 --- /dev/null +++ b/piet-gpu/shader/pathtag.h @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Common data structures and functions for the path 
tag stream. + +// This is the layout for tag bytes in the path stream. See +// doc/pathseg.md for an explanation. + +#define PATH_TAG_PATHSEG_BITS 0xf +#define PATH_TAG_PATH 0x10 +#define PATH_TAG_TRANSFORM 0x20 +#define PATH_TAG_LINEWIDTH 0x40 + +struct TagMonoid { + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +TagMonoid tag_monoid_identity() { + return TagMonoid(0, 0, 0, 0, 0); +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) { + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid reduce_tag(uint tag_word) { + TagMonoid c; + // Some fun bit magic here, see doc/pathseg.md for explanation. + uint point_count = tag_word & 0x3030303; + c.pathseg_ix = bitCount((point_count * 7) & 0x4040404); + c.linewidth_ix = bitCount(tag_word & (PATH_TAG_LINEWIDTH * 0x1010101)); + c.path_ix = bitCount(tag_word & (PATH_TAG_PATH * 0x1010101)); + c.trans_ix = bitCount(tag_word & (PATH_TAG_TRANSFORM * 0x1010101)); + uint n_points = point_count + ((tag_word >> 2) & 0x1010101); + uint a = n_points + (n_points & (((tag_word >> 3) & 0x1010101) * 15)); + a += a >> 8; + a += a >> 16; + c.pathseg_offset = a & 0xff; + return c; +} diff --git a/piet-gpu/shader/pathtag_reduce.comp b/piet-gpu/shader/pathtag_reduce.comp new file mode 100644 index 0000000..86bb9d0 --- /dev/null +++ b/piet-gpu/shader/pathtag_reduce.comp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The reduction phase for path tag scan implemented as a tree reduction. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" +#include "pathtag.h" + +// Note: the partition size is smaller than pathseg by a factor +// of 4, as there are 4 tag bytes to a tag word. 
+#define N_ROWS 4 +#define LG_WG_SIZE 7 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(binding = 2) readonly buffer SceneBuf { + uint[] scene; +}; + +#define Monoid TagMonoid + +layout(set = 0, binding = 3) buffer OutBuf { + Monoid[] outbuf; +}; + +shared Monoid sh_scratch[WG_SIZE]; + +void main() { + uint ix = gl_GlobalInvocationID.x * N_ROWS; + uint scene_ix = (conf.pathtag_offset >> 2) + ix; + uint tag_word = scene[scene_ix]; + + Monoid agg = reduce_tag(tag_word); + for (uint i = 1; i < N_ROWS; i++) { + tag_word = scene[scene_ix + i]; + agg = combine_tag_monoid(agg, reduce_tag(tag_word)); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + // We could make this predicate tighter, but would it help? + if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) { + Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)]; + agg = combine_tag_monoid(agg, other); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0) { + outbuf[gl_WorkGroupID.x] = agg; + } +} diff --git a/piet-gpu/shader/pathtag_scan.comp b/piet-gpu/shader/pathtag_scan.comp new file mode 100644 index 0000000..c0d386e --- /dev/null +++ b/piet-gpu/shader/pathtag_scan.comp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// A scan for path tag scan implemented as a tree reduction. 
+ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "pathtag.h" + +#define N_ROWS 8 +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +#define Monoid TagMonoid +#define combine_monoid combine_tag_monoid +#define monoid_identity tag_monoid_identity + +layout(binding = 0) buffer DataBuf { + Monoid[] data; +}; + +#ifndef ROOT +layout(binding = 1) readonly buffer ParentBuf { + Monoid[] parent; +}; +#endif + +shared Monoid sh_scratch[WG_SIZE]; + +void main() { + Monoid local[N_ROWS]; + + uint ix = gl_GlobalInvocationID.x * N_ROWS; + + local[0] = data[ix]; + for (uint i = 1; i < N_ROWS; i++) { + local[i] = combine_monoid(local[i - 1], data[ix + i]); + } + Monoid agg = local[N_ROWS - 1]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1u << i)) { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)]; + agg = combine_monoid(other, agg); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + + barrier(); + // This could be a semigroup instead of a monoid if we reworked the + // conditional logic, but that might impact performance. 
+ Monoid row = monoid_identity(); +#ifdef ROOT + if (gl_LocalInvocationID.x > 0) { + row = sh_scratch[gl_LocalInvocationID.x - 1]; + } +#else + if (gl_WorkGroupID.x > 0) { + row = parent[gl_WorkGroupID.x - 1]; + } + if (gl_LocalInvocationID.x > 0) { + row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]); + } +#endif + for (uint i = 0; i < N_ROWS; i++) { + Monoid m = combine_monoid(row, local[i]); + data[ix + i] = m; + } +} diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 52ea6e4..c74903e 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -40,10 +40,20 @@ struct Config { Alloc trans_alloc; // new element pipeline stuff follows + // Bounding boxes of paths, stored as int (so atomics work) + Alloc bbox_alloc; + // Number of transforms in scene + // This is probably not needed. uint n_trans; // Offset (in bytes) of transform stream in scene buffer uint trans_offset; + // Offset (in bytes) of path tag stream in scene + uint pathtag_offset; + // Offset (in bytes) of linewidth stream in scene + uint linewidth_offset; + // Offset (in bytes) of path segment stream in scene + uint pathseg_offset; }; // Fill modes. 
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv index b123f1890fa44c5595ef70e8cd5ffb1d2900ced1..69dddf552f07d01d27c0ea9d71fccc22308a48f1 100644 GIT binary patch delta 270 zcmZ2d@}+cx9alXYgB~LT0~Z4WLsC+HMSNmTPJS{20~-S?SezRuo)=$Il$cix6k`XA z@c_lZBJufYX~n4}KrUR47sxJ1EXgQIOoz*|F>rwO@d4#>GV@Z)GgC@3V6w7En)#uc zi&J5mH)nDEVD^SM1!S!<5E}ro2h@2W>p)^ANMaysKw=h1Vjz=2Vm6y&`4m|gg(lYv T=}ulD#51`^h-32xp%QffUYao= delta 64 zcmexTx};=-9hWpKgB~LT0~Z4WLtcDIQDR;(0~-T7Sd4peA(!yxJ6zwGH!JbWurTsX Rwini&Tp-M`xkb1@9RL8%59R;> diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp index e8e0019..492bf04 100644 --- a/piet-gpu/shader/transform_scan.comp +++ b/piet-gpu/shader/transform_scan.comp @@ -48,7 +48,6 @@ void main() { uint ix = gl_GlobalInvocationID.x * N_ROWS; - // TODO: gate buffer read local[0] = data[ix]; for (uint i = 1; i < N_ROWS; i++) { local[i] = combine_monoid(local[i - 1], data[ix + i]); diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 8d21fe6..2b45e7c 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -298,7 +298,6 @@ impl Renderer { alloc += (n_paths * ANNO_SIZE + 3) & !3; let trans_base = alloc; alloc += (n_trans * TRANS_SIZE + 3) & !3; - let trans_offset = 0; // For new element pipeline, not yet used let config = Config { n_elements: n_paths as u32, n_pathseg: n_pathseg as u32, @@ -311,7 +310,8 @@ impl Renderer { anno_alloc: anno_base as u32, trans_alloc: trans_base as u32, n_trans: n_trans as u32, - trans_offset: trans_offset as u32, + // We'll fill the rest of the fields in when we hook up the new element pipeline. + ..Default::default() }; unsafe { let scene = render_ctx.get_scene_buf(); diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs index 0613585..2fe4e4f 100644 --- a/piet-gpu/src/stages.rs +++ b/piet-gpu/src/stages.rs @@ -16,6 +16,8 @@ //! Stages for new element pipeline, exposed for testing. 
+mod path; + use bytemuck::{Pod, Zeroable}; use piet::kurbo::Affine; @@ -23,6 +25,8 @@ use piet_gpu_hal::{ include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, }; +pub use path::{PathBinding, PathCode, PathEncoder, PathStage}; + /// The configuration block passed to piet-gpu shaders. /// /// Note: this should be kept in sync with the version in setup.h. @@ -39,8 +43,12 @@ pub struct Config { pub pathseg_alloc: u32, pub anno_alloc: u32, pub trans_alloc: u32, + pub bbox_alloc: u32, pub n_trans: u32, pub trans_offset: u32, + pub pathtag_offset: u32, + pub linewidth_offset: u32, + pub pathseg_offset: u32, } // The individual stages will probably be separate files but for now, all in one. diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs new file mode 100644 index 0000000..b3f417e --- /dev/null +++ b/piet-gpu/src/stages/path.rs @@ -0,0 +1,339 @@ +// Copyright 2021 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! The path stage (includes substages). 
+ +use piet_gpu_hal::{ + BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, ShaderCode, +}; + +pub struct PathCode { + reduce_pipeline: Pipeline, + tag_root_pipeline: Pipeline, + clear_pipeline: Pipeline, + pathseg_pipeline: Pipeline, +} + +pub struct PathStage { + tag_root_buf: Buffer, + tag_root_ds: DescriptorSet, +} + +pub struct PathBinding { + reduce_ds: DescriptorSet, + clear_ds: DescriptorSet, + path_ds: DescriptorSet, +} + +const REDUCE_WG: u32 = 128; +const REDUCE_N_ROWS: u32 = 4; +const REDUCE_PART_SIZE: u32 = REDUCE_WG * REDUCE_N_ROWS; + +const ROOT_WG: u32 = 512; +const ROOT_N_ROWS: u32 = 8; +const ROOT_PART_SIZE: u32 = ROOT_WG * ROOT_N_ROWS; + +const SCAN_WG: u32 = 512; +const SCAN_N_ROWS: u32 = 4; +const SCAN_PART_SIZE: u32 = SCAN_WG * SCAN_N_ROWS; + +const CLEAR_WG: u32 = 512; + +impl PathCode { + pub unsafe fn new(session: &Session) -> PathCode { + // TODO: add cross-compilation + let reduce_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_reduce.spv")); + let reduce_pipeline = session + .create_compute_pipeline( + reduce_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Buffer, + ], + ) + .unwrap(); + let tag_root_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_root.spv")); + let tag_root_pipeline = session + .create_compute_pipeline(tag_root_code, &[BindType::Buffer]) + .unwrap(); + let clear_code = ShaderCode::Spv(include_bytes!("../../shader/gen/bbox_clear.spv")); + let clear_pipeline = session + .create_compute_pipeline(clear_code, &[BindType::Buffer, BindType::BufReadOnly]) + .unwrap(); + let pathseg_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathseg.spv")); + let pathseg_pipeline = session + .create_compute_pipeline( + pathseg_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + ], + ) + .unwrap(); + PathCode { + reduce_pipeline, + tag_root_pipeline, + clear_pipeline, + 
pathseg_pipeline, + } + } +} + +impl PathStage { + pub unsafe fn new(session: &Session, code: &PathCode) -> PathStage { + let tag_root_buf_size = (ROOT_PART_SIZE * 20) as u64; + let tag_root_buf = session + .create_buffer(tag_root_buf_size, BufferUsage::STORAGE) + .unwrap(); + let tag_root_ds = session + .create_simple_descriptor_set(&code.tag_root_pipeline, &[&tag_root_buf]) + .unwrap(); + PathStage { + tag_root_buf, + tag_root_ds, + } + } + + pub unsafe fn bind( + &self, + session: &Session, + code: &PathCode, + config_buf: &Buffer, + scene_buf: &Buffer, + memory_buf: &Buffer, + ) -> PathBinding { + let reduce_ds = session + .create_simple_descriptor_set( + &code.reduce_pipeline, + &[memory_buf, config_buf, scene_buf, &self.tag_root_buf], + ) + .unwrap(); + let clear_ds = session + .create_simple_descriptor_set(&code.clear_pipeline, &[memory_buf, config_buf]) + .unwrap(); + let path_ds = session + .create_simple_descriptor_set( + &code.pathseg_pipeline, + &[memory_buf, config_buf, scene_buf, &self.tag_root_buf], + ) + .unwrap(); + PathBinding { + reduce_ds, + clear_ds, + path_ds, + } + } + + /// Record the path stage. + /// + /// Note: no barrier is needed for transform output, we have a barrier before + /// those are consumed. Result is written without barrier. 
+ pub unsafe fn record( + &self, + cmd_buf: &mut CmdBuf, + code: &PathCode, + binding: &PathBinding, + n_paths: u32, + n_tags: u32, + ) { + if n_tags > ROOT_PART_SIZE * SCAN_PART_SIZE { + println!( + "number of pathsegs exceeded {} > {}", + n_tags, + ROOT_PART_SIZE * SCAN_PART_SIZE + ); + } + + // Number of tags consumed in a tag reduce workgroup + let reduce_part_tags = REDUCE_PART_SIZE * 4; + let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags; + if n_wg_tag_reduce > 1 { + cmd_buf.dispatch( + &code.reduce_pipeline, + &binding.reduce_ds, + (n_wg_tag_reduce, 1, 1), + (REDUCE_WG, 1, 1), + ); + // I think we can skip root if n_wg_tag_reduce == 2 + cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &code.tag_root_pipeline, + &self.tag_root_ds, + (1, 1, 1), + (ROOT_WG, 1, 1), + ); + // No barrier needed here; clear doesn't depend on path tags + } + let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG; + cmd_buf.dispatch( + &code.clear_pipeline, + &binding.clear_ds, + (n_wg_clear, 1, 1), + (CLEAR_WG, 1, 1), + ); + cmd_buf.memory_barrier(); + let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE; + cmd_buf.dispatch( + &code.pathseg_pipeline, + &binding.path_ds, + (n_wg_pathseg, 1, 1), + (SCAN_WG, 1, 1), + ); + } +} + +pub struct PathEncoder<'a> { + tag_stream: &'a mut Vec, + // If we're never going to use the i16 encoding, it might be + // slightly faster to store this as Vec, we'd get aligned + // stores on ARM etc. 
+ pathseg_stream: &'a mut Vec, + first_pt: [f32; 2], + state: State, + n_pathseg: u32, +} + +#[derive(PartialEq)] +enum State { + Start, + MoveTo, + NonemptySubpath, +} + +impl<'a> PathEncoder<'a> { + pub fn new(tags: &'a mut Vec, pathsegs: &'a mut Vec) -> PathEncoder<'a> { + PathEncoder { + tag_stream: tags, + pathseg_stream: pathsegs, + first_pt: [0.0, 0.0], + state: State::Start, + n_pathseg: 0, + } + } + + pub fn move_to(&mut self, x: f32, y: f32) { + let buf = [x, y]; + let bytes = bytemuck::bytes_of(&buf); + self.first_pt = buf; + if self.state == State::MoveTo { + let new_len = self.pathseg_stream.len() - 8; + self.pathseg_stream.truncate(new_len); + } + if self.state == State::NonemptySubpath { + if let Some(tag) = self.tag_stream.last_mut() { + *tag |= 4; + } + } + self.pathseg_stream.extend_from_slice(bytes); + self.state = State::MoveTo; + } + + pub fn line_to(&mut self, x: f32, y: f32) { + if self.state == State::Start { + // should warn or error + return; + } + let buf = [x, y]; + let bytes = bytemuck::bytes_of(&buf); + self.pathseg_stream.extend_from_slice(bytes); + self.tag_stream.push(9); + self.state = State::NonemptySubpath; + self.n_pathseg += 1; + } + + pub fn quad_to(&mut self, x0: f32, y0: f32, x1: f32, y1: f32) { + if self.state == State::Start { + return; + } + let buf = [x0, y0, x1, y1]; + let bytes = bytemuck::bytes_of(&buf); + self.pathseg_stream.extend_from_slice(bytes); + self.tag_stream.push(10); + self.state = State::NonemptySubpath; + self.n_pathseg += 1; + } + + pub fn cubic_to(&mut self, x0: f32, y0: f32, x1: f32, y1: f32, x2: f32, y2: f32) { + if self.state == State::Start { + return; + } + let buf = [x0, y0, x1, y1, x2, y2]; + let bytes = bytemuck::bytes_of(&buf); + self.pathseg_stream.extend_from_slice(bytes); + self.tag_stream.push(11); + self.state = State::NonemptySubpath; + self.n_pathseg += 1; + } + + pub fn close_path(&mut self) { + match self.state { + State::Start => return, + State::MoveTo => { + let new_len = 
self.pathseg_stream.len() - 8; + self.pathseg_stream.truncate(new_len); + return; + } + State::NonemptySubpath => (), + } + let len = self.pathseg_stream.len(); + if len < 8 { + // can't happen + return; + } + let first_bytes = bytemuck::bytes_of(&self.first_pt); + if &self.pathseg_stream[len - 8..len] != first_bytes { + self.pathseg_stream.extend_from_slice(first_bytes); + self.tag_stream.push(13); + self.n_pathseg += 1; + } else { + if let Some(tag) = self.tag_stream.last_mut() { + *tag |= 4; + } + } + self.state = State::Start; + } + + fn finish(&mut self) { + if self.state == State::MoveTo { + let new_len = self.pathseg_stream.len() - 8; + self.pathseg_stream.truncate(new_len); + } + if let Some(tag) = self.tag_stream.last_mut() { + *tag |= 4; + } + } + + /// Finish encoding a path. + /// + /// Encode this after encoding path segments. + pub fn path(&mut self) { + self.finish(); + // maybe don't encode if path is empty? might throw off sync though + self.tag_stream.push(0x10); + } + + /// Get the number of path segments. + /// + /// This is the number of path segments that will be written by the + /// path stage; use this for allocating the output buffer. + pub fn n_pathseg(&self) -> u32 { + self.n_pathseg + } +} diff --git a/tests/src/main.rs b/tests/src/main.rs index 0ab9340..9aab351 100644 --- a/tests/src/main.rs +++ b/tests/src/main.rs @@ -25,6 +25,8 @@ mod prefix_tree; mod runner; mod test_result; +#[cfg(feature = "piet-gpu")] +mod path; #[cfg(feature = "piet-gpu")] mod transform; @@ -134,6 +136,7 @@ fn main() { #[cfg(feature = "piet-gpu")] if config.groups.matches("piet") { report(&transform::transform_test(&mut runner, &config)); + report(&path::path_test(&mut runner, &config)); } } } diff --git a/tests/src/path.rs b/tests/src/path.rs new file mode 100644 index 0000000..948bd6f --- /dev/null +++ b/tests/src/path.rs @@ -0,0 +1,293 @@ +// Copyright 2021 The piet-gpu authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! Tests for the piet-gpu path stage. + +use crate::{Config, Runner, TestResult}; + +use bytemuck::{Pod, Zeroable}; +use piet_gpu::stages::{self, PathCode, PathEncoder, PathStage}; +use piet_gpu_hal::{BufWrite, BufferUsage}; +use rand::{prelude::ThreadRng, Rng}; + +struct PathData { + n_trans: u32, + n_linewidth: u32, + n_path: u32, + n_pathseg: u32, + tags: Vec, + pathsegs: Vec, + bbox: Vec<(f32, f32, f32, f32)>, + lines: Vec<([f32; 2], [f32; 2])>, +} + +// This is designed to match pathseg.h +#[repr(C)] +#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] +struct PathSeg { + tag: u32, + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + p3: [f32; 2], + path_ix: u32, + trans_ix: u32, + stroke: [f32; 2], +} + +#[repr(C)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Zeroable, Pod)] +struct Bbox { + left: u32, + top: u32, + right: u32, + bottom: u32, +} + +pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult { + let mut result = TestResult::new("path"); + + let n_path: u64 = config.size.choose(1 << 12, 1 << 16, 1 << 18); + let path_data = PathData::new(n_path as u32); + let stage_config = path_data.get_config(); + let config_buf = runner + .session + .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE) + .unwrap(); + let scene_size = n_path * 256; + let scene_buf = runner + .session + 
.create_buffer_with( + scene_size, + |b| path_data.fill_scene(b), + BufferUsage::STORAGE, + ) + .unwrap(); + let memory_init = runner + .session + .create_buffer_with( + path_data.memory_init_size(), + |b| path_data.fill_memory(b), + BufferUsage::COPY_SRC, + ) + .unwrap(); + let memory = runner.buf_down(path_data.memory_full_size(), BufferUsage::empty()); + + let code = PathCode::new(&runner.session); + let stage = PathStage::new(&runner.session, &code); + let binding = stage.bind( + &runner.session, + &code, + &config_buf, + &scene_buf, + &memory.dev_buf, + ); + + let mut total_elapsed = 0.0; + let n_iter = config.n_iter; + for i in 0..n_iter { + let mut commands = runner.commands(); + commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf); + commands.cmd_buf.memory_barrier(); + commands.write_timestamp(0); + stage.record( + &mut commands.cmd_buf, + &code, + &binding, + path_data.n_path, + path_data.tags.len() as u32, + ); + commands.write_timestamp(1); + if i == 0 || config.verify_all { + commands.cmd_buf.memory_barrier(); + commands.download(&memory); + } + total_elapsed += runner.submit(commands); + if i == 0 || config.verify_all { + let dst = memory.map_read(..); + if let Some(failure) = path_data.verify(&dst) { + result.fail(failure); + } + } + } + let n_elements = path_data.n_pathseg as u64; + result.timing(total_elapsed, n_elements * n_iter); + + result +} + +fn rand_point(rng: &mut ThreadRng) -> (f32, f32) { + let x = rng.gen_range(0.0, 100.0); + let y = rng.gen_range(0.0, 100.0); + (x, y) +} + +// Must match shader/pathseg.h +const PATHSEG_SIZE: u32 = 52; + +impl PathData { + fn new(n_path: u32) -> PathData { + let mut rng = rand::thread_rng(); + let n_trans = 1; + let n_linewidth = 1; + let segments_per_path = 8; + let mut tags = Vec::new(); + let mut pathsegs = Vec::new(); + let mut bbox = Vec::new(); + let mut lines = Vec::new(); + let mut encoder = PathEncoder::new(&mut tags, &mut pathsegs); + for _ in 0..n_path { + let (x, y) = rand_point(&mut 
rng); + let mut min_x = x; + let mut max_x = x; + let mut min_y = y; + let mut max_y = y; + let first_pt = [x, y]; + let mut last_pt = [x, y]; + encoder.move_to(x, y); + for _ in 0..segments_per_path { + let (x, y) = rand_point(&mut rng); + lines.push((last_pt, [x, y])); + last_pt = [x, y]; + encoder.line_to(x, y); + min_x = min_x.min(x); + max_x = max_x.max(x); + min_y = min_y.min(y); + max_y = max_y.max(y); + } + bbox.push((min_x, min_y, max_x, max_y)); + encoder.close_path(); + // With very low probability last_pt and first_pt might be equal, which + // would cause a test failure - might want to seed RNG. + lines.push((last_pt, first_pt)); + encoder.path(); + } + let n_pathseg = encoder.n_pathseg(); + //println!("tags: {:x?}", &tags[0..8]); + //println!("path: {:?}", bytemuck::cast_slice::(&pathsegs[0..64])); + PathData { + n_trans, + n_linewidth, + n_path, + n_pathseg, + tags, + pathsegs, + bbox, + lines, + } + } + + fn get_config(&self) -> stages::Config { + let n_trans = self.n_trans; + + // Layout of scene buffer + let linewidth_offset = 0; + let pathtag_offset = linewidth_offset + self.n_linewidth * 4; + let n_tagbytes = self.tags.len() as u32; + // Depends on workgroup size, maybe get from stages? 
+ let padded_n_tagbytes = (n_tagbytes + 2047) & !2047; + let pathseg_offset = pathtag_offset + padded_n_tagbytes; + + // Layout of memory + let trans_alloc = 0; + let pathseg_alloc = trans_alloc + n_trans * 24; + let bbox_alloc = pathseg_alloc + self.n_pathseg * PATHSEG_SIZE; + let stage_config = stages::Config { + n_elements: self.n_path, + pathseg_alloc, + trans_alloc, + bbox_alloc, + n_trans, + pathtag_offset, + linewidth_offset, + pathseg_offset, + ..Default::default() + }; + stage_config + } + + fn fill_scene(&self, buf: &mut BufWrite) { + let linewidth = -1.0f32; + buf.push(linewidth); + buf.extend_slice(&self.tags); + buf.fill_zero(self.tags.len().wrapping_neg() & 2047); + buf.extend_slice(&self.pathsegs); + } + + fn memory_init_size(&self) -> u64 { + let mut size = 8; // offset and error + size += self.n_trans * 24; + size as u64 + } + + fn memory_full_size(&self) -> u64 { + let mut size = self.memory_init_size(); + size += (self.n_pathseg * PATHSEG_SIZE) as u64; + size += (self.n_path * 16) as u64; + size + } + + fn fill_memory(&self, buf: &mut BufWrite) { + // This stage is not dynamically allocating memory + let mem_offset = 0u32; + let mem_error = 0u32; + let mem_init = [mem_offset, mem_error]; + buf.push(mem_init); + let trans = [1.0f32, 0.0, 0.0, 1.0, 0.0, 0.0]; + buf.push(trans); + } + + fn verify(&self, memory: &[u8]) -> Option { + fn round_down(x: f32) -> u32 { + (x.floor() + 32768.0) as u32 + } + fn round_up(x: f32) -> u32 { + (x.ceil() + 32768.0) as u32 + } + let begin_pathseg = 32; + for i in 0..self.n_pathseg { + let offset = (begin_pathseg + PATHSEG_SIZE * i) as usize; + let actual = + bytemuck::from_bytes::(&memory[offset..offset + PATHSEG_SIZE as usize]); + let expected = self.lines[i as usize]; + const EPSILON: f32 = 1e-9; + if (expected.0[0] - actual.p0[0]).abs() > EPSILON + || (expected.0[1] - actual.p0[1]).abs() > EPSILON + || (expected.1[0] - actual.p3[0]).abs() > EPSILON + || (expected.1[1] - actual.p3[1]).abs() > EPSILON + { + 
println!("{}: {:.1?} {:.1?}", i, actual, expected); + } + } + let begin_bbox = 32 + PATHSEG_SIZE * self.n_pathseg; + for i in 0..self.n_path { + let offset = (begin_bbox + 16 * i) as usize; + let actual = bytemuck::from_bytes::(&memory[offset..offset + 16]); + let expected_f32 = self.bbox[i as usize]; + let expected = Bbox { + left: round_down(expected_f32.0), + top: round_down(expected_f32.1), + right: round_up(expected_f32.2), + bottom: round_up(expected_f32.3), + }; + if expected != *actual { + println!("{}: {:?} {:?}", i, actual, expected); + return Some(format!("bbox mismatch at {}", i)); + } + } + None + } +} diff --git a/tests/src/transform.rs b/tests/src/transform.rs index d696b10..1c15634 100644 --- a/tests/src/transform.rs +++ b/tests/src/transform.rs @@ -14,7 +14,7 @@ // // Also licensed under MIT license, at your choice. -//! Tests for piet-gpu shaders. +//! Tests for the piet-gpu transform stage. use crate::{Config, Runner, TestResult}; @@ -37,11 +37,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult .session .create_buffer_init(&data.input_data, BufferUsage::STORAGE) .unwrap(); - let memory = runner.buf_down(data_buf.size() + 24, BufferUsage::empty()); + let memory = runner.buf_down(data_buf.size() + 8, BufferUsage::empty()); let stage_config = stages::Config { n_trans: n_elements as u32, - // This is a hack to get elements aligned. 
- trans_alloc: 16, ..Default::default() }; let config_buf = runner @@ -71,9 +69,8 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult } total_elapsed += runner.submit(commands); if i == 0 || config.verify_all { - let mut dst: Vec = Default::default(); - memory.read(&mut dst); - if let Some(failure) = data.verify(&dst[1..]) { + let dst = memory.map_read(8..); + if let Some(failure) = data.verify(dst.cast_slice()) { result.fail(failure); } } From 8af4707525a8987bc2fca85de3fa3f73ca3accbf Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 1 Dec 2021 08:34:22 -0800 Subject: [PATCH 2/4] Fix uninitialized variable --- piet-gpu/shader/gen/pathseg.spv | Bin 33908 -> 33920 bytes piet-gpu/shader/pathseg.comp | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv index 67fc57fb3685022e4f00cd7831fc097c86dc4a24..2ac684df95b29911ff6e00f1707287c9abf47852 100644 GIT binary patch delta 185 zcmey;!PL;nv>}6^ar)*={^^21icxVftEU|UGsAQy28KEY28N#uObj!C>>p728v_f& zEFgb25Q6|nehyR~MB4)8*?{76f$Zs%AF9c-&IGb%O?K3jo~)p5z&L$!pn5&fqKoSK jjFKQjKn8}6^an9yU{^^21icxVftL;w)CWh%u3=BV@^fv|;hM7SAEFcB}ko;_@ zJczah%CiB*=K|StCcjma=bZs$fpmaOm@#>whV*0;bq&TjlVjEEftK7=*JpGD83vLD s(J%vc0NHbZxPgHcY~W5H8zeRfXt*5%GguBJ26o_NL3QcP8JZH&02N>=%K!iX diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp index 8653c16..7b8f3f0 100644 --- a/piet-gpu/shader/pathseg.comp +++ b/piet-gpu/shader/pathseg.comp @@ -100,7 +100,7 @@ void main() { // Scan the tag monoid TagMonoid local_tm = reduce_tag(tag_word); sh_tag[gl_LocalInvocationID.x] = local_tm; - for (uint i; i < LG_WG_SIZE; i++) { + for (uint i = 0; i < LG_WG_SIZE; i++) { barrier(); if (gl_LocalInvocationID.x >= (1u << i)) { TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; From 1d1801c1aa346ef011f10109176171710a94b7b0 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 1 Dec 2021 08:42:06 -0800 Subject: [PATCH 3/4] Cross-platform 
path stage shaders --- piet-gpu/shader/build.ninja | 12 + piet-gpu/shader/gen/bbox_clear.dxil | Bin 0 -> 3152 bytes piet-gpu/shader/gen/bbox_clear.hlsl | 55 ++ piet-gpu/shader/gen/bbox_clear.msl | 57 ++ piet-gpu/shader/gen/pathseg.dxil | Bin 0 -> 9424 bytes piet-gpu/shader/gen/pathseg.hlsl | 643 ++++++++++++++++++++++ piet-gpu/shader/gen/pathseg.msl | 699 ++++++++++++++++++++++++ piet-gpu/shader/gen/pathtag_reduce.dxil | Bin 0 -> 4700 bytes piet-gpu/shader/gen/pathtag_reduce.hlsl | 127 +++++ piet-gpu/shader/gen/pathtag_reduce.msl | 143 +++++ piet-gpu/shader/gen/pathtag_root.dxil | Bin 0 -> 4716 bytes piet-gpu/shader/gen/pathtag_root.hlsl | 115 ++++ piet-gpu/shader/gen/pathtag_root.msl | 146 +++++ piet-gpu/src/stages/path.rs | 10 +- 14 files changed, 2002 insertions(+), 5 deletions(-) create mode 100644 piet-gpu/shader/gen/bbox_clear.dxil create mode 100644 piet-gpu/shader/gen/bbox_clear.hlsl create mode 100644 piet-gpu/shader/gen/bbox_clear.msl create mode 100644 piet-gpu/shader/gen/pathseg.dxil create mode 100644 piet-gpu/shader/gen/pathseg.hlsl create mode 100644 piet-gpu/shader/gen/pathseg.msl create mode 100644 piet-gpu/shader/gen/pathtag_reduce.dxil create mode 100644 piet-gpu/shader/gen/pathtag_reduce.hlsl create mode 100644 piet-gpu/shader/gen/pathtag_reduce.msl create mode 100644 piet-gpu/shader/gen/pathtag_root.dxil create mode 100644 piet-gpu/shader/gen/pathtag_root.hlsl create mode 100644 piet-gpu/shader/gen/pathtag_root.msl diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 6f225d9..c8b4858 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -59,10 +59,22 @@ build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl build gen/transform_leaf.msl: msl gen/transform_leaf.spv build gen/pathtag_reduce.spv: glsl pathtag_reduce.comp | pathtag.h setup.h mem.h +build gen/pathtag_reduce.hlsl: hlsl gen/pathtag_reduce.spv +build gen/pathtag_reduce.dxil: dxil gen/pathtag_reduce.hlsl +build gen/pathtag_reduce.msl: 
msl gen/pathtag_reduce.spv build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h flags = -DROOT +build gen/pathtag_root.hlsl: hlsl gen/pathtag_root.spv +build gen/pathtag_root.dxil: dxil gen/pathtag_root.hlsl +build gen/pathtag_root.msl: msl gen/pathtag_root.spv build gen/bbox_clear.spv: glsl bbox_clear.comp | setup.h mem.h +build gen/bbox_clear.hlsl: hlsl gen/bbox_clear.spv +build gen/bbox_clear.dxil: dxil gen/bbox_clear.hlsl +build gen/bbox_clear.msl: msl gen/bbox_clear.spv build gen/pathseg.spv: glsl pathseg.comp | tile.h pathseg.h pathtag.h setup.h mem.h +build gen/pathseg.hlsl: hlsl gen/pathseg.spv +build gen/pathseg.dxil: dxil gen/pathseg.hlsl +build gen/pathseg.msl: msl gen/pathseg.spv diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil new file mode 100644 index 0000000000000000000000000000000000000000..8a4672594d16e9a059513de25eb5cbc11c653de5 GIT binary patch literal 3152 zcmeHJeNY?66<_HN-ARb!K(Z}GGC3enN=bYI%*Oy7eSyKqj)t( zcBY;7k0vwOnYZul+uy$1efREnt3RsMJ$QWI;jyD7Pp<#I?$+vWzjv4i05FOHz@RRM zVt`_XVuNxMid6uB2a2hzg2v%E+A2&3RM9b~AHHX7G<}V3hkiP`;2!-6Y%zEH_yBNw z;RKpM1_gz}wWvS#&M>mq9W$b${wSVvnvQ6Ww8954AO=*!0Q#d91}+c@=giQr)lFYZ zhUY#4eX(vT@=OVw!@5)ZGz(~jfXpuJ=Xu0wF}OSlJz3+Dmh!dzvDz^*TUvn2Dn;3k z_{sxPN>}DP1LWD-jdCK;ml8H=s=EE_J%2obez{Cg1rNdHM&ZBC9T!vc$K$+PoQY7p zHw87@oXHD-3W!4>PU2>?lmUm?2H+_QfDEX!Q_6)@T`zhnWy}M+cRiy4X`D4M;cf>5#ZqoZ>|Bp)T+FpG>fU+oaXotDYH3#ZPp5<*9wIHokIO*oB{aT-cbF ze*cvHh+j3I5u>n*zg@~ho>tAvyw}vzmW$G=*^MWz=&f^PLjPn{sYah6d*IMP0MQ&Z+M`A@gK>Zr!JqDewW+aA25VgrY?4^p9VA1*2zQX1a4LjBjQu(Ruf%fVUVWBXz?K#2r%V(_)f+l&k&}NN;qwBBjQ`s90D)TrFiM1*P%?L%=Ug zKQNE#?n|TjMsa-=9^*4+m8u(2m#y3QEH6~Zy|WJZ?!F22eTX4Y#+T^#%W7ktMabE& zoEoX@aw0>`Q#{RRY|%(pUuI3FXgKq)!&BWIs!C^UZROh}D)Ed4De=6(LgnG8#$s!7 z(@KRGNlwQl;q}V&1~*M961B&C76xZ@D+cms?A)HXWi+`lFDr~Bgs1gYs0|;B^W^ZR zQ4Pog3c>74dFWeS%Q&5Aw=_0M(DOr-_#A36=84~-ZItinkMpF10^|J&sww(q)#OWs zyBnLFtxk!@lH9jhmVdzWqWHs;??33bxoO&PV{z+^%^TM?7LQ&!Gj(Cv&df4okBUOZ 
z#7o!31-W!jrp?x7f2gspzNyvZ4vFNaPgW1DnXitl4!EpZ?|U_S<{KVM&ri$vb6|aH zX|%U*@ZA2_`v%V*5(cvC+8=7EcRul*mfCu!$^Am2{B&$}lKEI*WK~futA-)g2EfmC z-uE8aEc?&gZxuf(S-@XYzO zF07*+a|WBmn;`yulK6{@$PsIkW@q;PZdBPo$3$Q~iY26EL095ON|xIFtq!aU$2!Vc zhXd<$AZ_iegJR`N3B=6+u};S~(edwY5nDzghYat%maM#zym!`;)IdK~^jc5Rimq@H zDSoY|*n<=;ZAsf0tjmh!TiF)ayI4nnbud_42G+J61H&G7v5(_yvy5#CvK_(htPi$` z53Uopriq(0QnN6s+!$3ZMfx?1axqyM?%um-5nK-F3Kw*R!ASqgT)}{@z=sqK%dkcl65)|?ko)JSYVHjW1Yd7;tf}P6GePvB-SSg&H&96{C;8X z^3-w9d|3RXdRI)sAVF_0*cF=n<~Kgx<~=IahYugtL(hmSm-A@#K$5^;o*;Kq+>*bw z>QUO)^TV~@@JO*M7lw+oa*e56x??XM9NJhuvqzUX*uU>aRjZ!k;qVSGH{Zj{zqpH+ zt)FLJX8ymtOv5~Wrg(X$7NGjCd07M(LtZBTH81}oDRZ3sRfzvzIXNAlJFAh2MYT?a zAWWs>r*zCe5psh&kfqrDhhumRL*e{9^nzfv4>0gT@xayDi)-ol?i`J*UpO1G$tEmU z)a2zsd&amZIQbtkC;ji3lm7xCKV3Pbc-EYB=^Mj}ch&~fNaUaN1t)%XIMG7E{}Zf7 z398HD?w{P9Ft-65RE0J~s?hCE;t%GBLYbqK=y|oM*C6!6X^<)q0yh(~`5o04eHy-t zl4Y1vnYVLm7(n3y&&zeG0vrNXT7M8KeO}0;*J!kkK2PE^J{||oWyoE`{$Uc-J08=5 zUHY$U&{RPjT(C8unMa4uG!C1zQBSb@GP?R}`>IJ7_565d6E({nN^Docj8B8Wc*WM~ T63w>+)o+#l3gvV6bFTdzU76o` literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl new file mode 100644 index 0000000..ae40b13 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -0,0 +1,55 @@ +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +ByteAddressBuffer _21 : register(t1); +RWByteAddressBuffer _44 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < 
_21.Load(0)) + { + uint out_ix = (_21.Load(40) >> uint(2)) + (4u * ix); + _44.Store(out_ix * 4 + 8, 65535u); + _44.Store((out_ix + 1u) * 4 + 8, 65535u); + _44.Store((out_ix + 2u) * 4 + 8, 0u); + _44.Store((out_ix + 3u) * 4 + 8, 0u); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl new file mode 100644 index 0000000..f424448 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.msl @@ -0,0 +1,57 @@ +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +kernel void main0(device Memory& _44 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.conf.n_elements) + { + uint out_ix = (_21.conf.bbox_alloc.offset >> uint(2)) + (4u * ix); + _44.memory[out_ix] = 65535u; + _44.memory[out_ix + 1u] = 65535u; + _44.memory[out_ix + 2u] = 0u; + _44.memory[out_ix + 3u] = 0u; + } +} + diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil new file mode 100644 index 0000000000000000000000000000000000000000..5ad35e75a7f24f16e30db7ccedc2b2ac1cb4792a GIT binary patch literal 9424 zcmeHNe_Rt+wx7u)lOKc-AizNQF#)6qs1v||pqLN@6lkL&#b0#*p{PhBqV)%DCkcdz 
zpur+Wt!=2-Ri$uNHSIDj`%<}S>UPSAAy%x`Gb!Xly*@QH~JepCMf3hUQA1_aS# zp&urAFi#CC2#DN|7fpad%jq)F|F?Vu$mnN+PtWbwWr9DUhn6iYR1~VfL`aAQiGabZ z;<@>>0U?m30)D2_dRxK=I)Thfi@V3;4`ggjRI8$wITsZs&e#Rl^Cz)5!Qss9r4%-Y zk-5t&(=JRDrE;)5USg>s*C@i3*@nYVaAM|guE6LUWmhTYYnq32UpFzO+30*QX?kqp zW+%NoY&oOTTkoQqh!whymX6U)*eJv7bi;>Y7DCWm@C`jze=V7b=X3O27>2gt5R?RX ziID~_zOsSoA9bi6yuPaMFpB*0o14Ar?fF8ZEfmJ0ahNa&`AHC$Bjra5dpL3~KkYL3 zbW%A1QSF(0E1_Dbhq2MUQS#Wc&v!lMZslC^(Nq+3M7b=O0&xN1t9RhB7noTP*}vfr z>WBV633q+$cYR)dM$Kom|Dk!Uus>uh5+Nptg>-)B?8*s$5n#0 z4#5`ihJ3aV>>4wBJ-2l{=S_M^v%Ah2Hn40^Pw^jK% z0%FEf^fS3acdXJhf3x3s@X!QyKU)4Inu+1XlsnvL3EBGx2&Q9?E>`0mn1Q>S5JX9Q zSWK&jfRwK@In)H9pBW^4qK>~g8HXkYf!$ZjArQUpVq}3BreilT+=K=D<{O*2F*z5Q zO2#~rX~vq&375@6p0GoXG`$UJSiGautCfzmnEq;1e_;EFOo1UuCLFrSgdH+|8NnBG zBKdZTFB6F|Xh|@^d-YYXD}OEBD)uV-tKXG_@%#7Vkhp`GamI3P+UXBZ5T~cf-k))n z%dq;6_k1&b|C%Qq!`>M0c=m%c6aAe1oP%4}TD}QR=TK>EA3e+X{ z4czV?9f{#Um?d}p{)4o!5_#m+mTDk1z`kb_X`{sZTn2&Wk6w0tG%|X7_%i4oZ0XEf z@9ti1QKTcuxybZlL{^EUY(k_I5-k`e1*2xcU8lf>`#d0h#>_sC%{=k`<@@tO7Vi~* z-iCK(>hF|38O~DH*%c~YL4)KFu z-@feT;L-zw#Wx4r@OQHGcQW^|pA8%y`6)Xnn^uuB$@LA;(RQZ)($|XLA3oE6wSars z>=pd_m4T%P#LM>%4z^i+Ac&5k`5=apI8>;8D6~QZ&oU@e>wB6rDfZ?~?DeK~{1M3K zJE7nKB}n3DhIiY-u62Y~5G;2$owyH~)vKJ@8|!hwy<~iNWdL+Hi zCE7@`w^p)KDjRFSn`%@W8`UIxV=Q~)7z?O6RY8UBa$YBU@q{H&Ttk{Yt@@6CK6{cbw=?Lcx8jV94n67nRQgLZ|74BL?NPq3)6} zx)qD~9?+12%UGfO?W?nPfR{!wb%Wq$jlMdGUo6Ds&XP2{VEgdszf1Hd!9#mJsugIHmeS0JntZPhh(ID@%@YY_)K)Uh38~^ zJmXQG&l?(RaB}fs{U=vH{Of@jM5B227_-az!j7M>_dE}d2=AgJL+d9jPI?EP>e_$s z;|H!_&T9(YqSa%A6Q-^<{asVZ>pS&VUJQ>gg`&Q)LyCxqGi#St(wy1Qn;c1G8Rc%v zL-6j$U!I<#4Vn7I=EqbVFETESbhUma+qSx^?#?~rXsp3-;57ry;jE%L4F)JClDV%$ z!k5sW)J4^F=7iP_pDs|iHT>DYA%$#UE3cQ<|4LdU1G6itti2uKfVpN9&3t1u#ra|T zXQvz83Xh-$i}&7WAWg?HXcbs|P;wOub4BA#D4qSz!sGpwQ)GF>#mndCKBh57s(2i8 zTtoje=D-1e0&|=PvVR54kqYMd4>8Bde<94#5RDn~8C0KwNtzgD4W+0rq22g@5r_B! 
z9AW@C#13#sDZn98C}{@oQNdKQQ;VxNp{rkc`htR5DsR}H`LKuKg z1B^h%IKX%Y01Vv=T5^LpuyhHhv@*7dA!Gg#&3qpswNz;It?uiCF`P1Mfsm@wLX~2) z8j}g@Pj3+O5|F7omHWhDZ+s~GFu5z`Y+_29QF@Y?k-c!$T6I-9uAHWfkH-r$6$&v! zYQmdsr%O0GDX#NIur@fj3hT4-d;h2Dy7`}_YwJ;^{?3At?5Fmye{3JgP64U<|MT$w z=i&b@<^Rj(;SwBzegKz!e`cM~3)GR|++g-WqEfj=v|uvYHeb<}H6+T{prUytg_)NE zmON}LSckSLxqgwEH{^=lFUbFlwn?I5@+FyCMgVEXC7Buqg7QQJXe&0i{UX1I7S3xZ z%$_59&K9652`HZ2_6s~G3YsME!C;OSpNS?xWla3%U;Yhi_N8!6_ZOIL$vdYWwLnl5 z%7wI~dapJ(^i7<%nJ6_vI0yaHY1Q2xQ+5GAxE3gv+B%r%!v8`{*)6wJv@E8lU>SE<~>j< zSm1XpGyn`6D})a6xJS+AChk!acvWy|Q@&0d*vDXjR>PBz6QaKn7i!E87LMiXQ8^!(0|5Pv-vquX`~h;irNdgf%-ROVFaJ|XnSW~N7FOdw(NQse`b)&z}I1#=oYg-8Ik zz{mqcZo?6pC^FN8LD3OsS0kw=7S@-D=ctC{eip4}nj^B--X7 zJT|7C!i%GRwWy1dRp*gKS*r8QMTJBj#V>@qZRiC#iwvzI0dd()ILC?Wfbh)YmHxmn zqhB>W<^UI~h?V+nBgwake% z54UvqE34==XZ7qDcWqJl2mSu6wd1@ZKe5MWOSTMV7c;x)aR>32UR^UWpYKkH%Y9uJ zfPa?}cM}l*Sph$y9_@h?Bto@jchIcF(zcwAHZpyNCnxG{OY>edSrIv;7m zYkUt~bnC3zJPdo{>mhsVz#K<7h%dFx@&U8o;~B~Q@XZp`@@YGklEPdbb&W#D#Z>v?4$ zt&Zotm{@s2E?hiw<V`dV4qRm~hpL3u5m3nl91j_zzE1zHXlT5_1); zlGVcXsQmP)HQESxKRpCR&+AEdr9H1F-j(#cE+NKvj!noE6Ru+cdw`K(`D6H%(B6U){23{8>ESSWp( z^~85{3!_jrNJWkZO`U{Yt(!gA1fnf>5xGCC_u}bv1M=sMFiFUq4!H}&{+jz`y_?e8SpJ*L08ALlI{wD^6@02|9c01K(t*73cw6d8K- zO*}%Wn{V97W!B~6&N6KXnAh|tV}9^{e2Zty1;2Mau~Rnfch@JzJm_^i*`A;&-;LQX z#z_r3xiygy{v%l#*(jr~na>=9wN~~$X_`|vW|GITXz{;wV^04+fm%h_)Miic}d(Y%KtqfOPBdjVVV0pkT7L4P@)u`rw=L^UfeF3@~*o z+)Vz)9_4 zKi{gn)%VHcgqmFOqmDPq1*;a${!w|J=)}-~9&;&xgJtcMxD#ue63|VWOI4=}IXX>~ z;^qPgAB{uH^BM>M4BNh=g$$vM+}c0@?I{vbLQSvO*Dr9!@KuiY_R#zKdPBKj^TJHl zxKh5ZsYC@gmlo$XiWz|n-H_qS*pK`eVLJ%pm+=zEVm&dcO?xgkK6PW4#3UiQAj4wx zFaG){qF~JZn#cKxfmd)p0%+9LR^26WNC;Co2O&7wd;!4k=HRFRhHkapN#pg9#3d4< z(<7MtYe5n%sNjO3m&vViK}MkyO+%UdVW*N+{a;z{^-|66X%h!{7jomB8wV%?SxN8T z#8i6g;z$`yNMmV2s3W1fdwI0)0&5_Fi_#zSY$J9uyM{mD^Brt?T&UHFOlATF5K zpc@a(tvAFhi=kuZ?O1RsFe;&WFMXQ(O$(#g4$Z3(@x?+-GW5rc>S$IDoKT7#PU8VX zR0udV-qfIEJ4yQ?MX1}P0)DZ1+wt%lz)xyKYG6;tI6^uk9*~qosS6(WD4I$XI-RNH1@Iofl;)Cv-_deH5M#{JU@7n8&}@=%DJ8Pu30Tq1UV=_|So4(p36R 
z7IT2Sv{dJ-qc3ljHzv2Ap6(G;93Z(*1Ot=Xk4`sD%GGIhzzSqVDzX57(wzOb=FGn} zXW+Wa*XG7kx_4U+yvNvZJw$t861}pDdB`U0@7Fx}Tw3nAfQ_x%Q@3h>N^Nl8ffkHk zh#E>jgm7jH=?GCZfV+^-D3^|qX&^#+LYfssDM=sK+h_mim+FB~`e#N&IVWnLrs0ob zR*7cEtmRy|FYzQSc0AqRKLrHU?ptr0>$mRu=4~iYYH6Fz67dfu79+Cun+e4*I2(t?7+ zr5nu%K4z1%%odFabjfo9CKYMNf&%XL(BeyP>E{TJ z)h@IrvWvQpXu)2W^5gUhaH>Gr(`O$BlYT~HQ)jIct>9_|Iauz!-NMm?p3rj&Rb%lg z)NLDDM_dmYZtS-IckO}};v@3Q25es*9Nw`%T##ZxS$7caCIES?*PwJiU! z5G(W`ror9MG$^Kz3%sBEm0;K{$RpSQLPWA_gzTeWPXDag<6VY7OA>>^XMSd)0h}$c z@Q>f^1|!|d0vHVZOV#RV+l_j9wR^)g#ZBR%B0?;?02xEF6g}J?WR-j2#=Q)WM zPwDbI;$)Dk1=-)m8RIc&C5`Q;p8DqKA?>-4`*FwbHyzG){~F<3H|6A;3FPE44=VIu zy0Kw_jb&}~eRd+nule)td90TwjqP6Xa8Z7X`&w;F0^Jr17rKJSH_V08k9cosy{IFs zlce2DUs1YlK6+;lFFOx=w?Aac{*dtlf&Jl6O&?g_dV)r)w-ptzf}n-M6*doM=$f-6 zIl$abE$M7nlD_DWa4C^Tf|aCH>^LbqP63+GAiBW`{&C?$I1F6&&(I6BMa{OU>x0c2 z;7Jo4!dbz!T}yN-!Y3BTxM#(?8dfRm4bP6gw*6FHTo+_o@bb5D>M5ACm|mQ?-)F}k zv&VP7dn?4VGK-I2Z$iAT^8jpf1MaRrRqbA+hKUVbfz4O~ja7bp?#PCs(SW<3R&zam zum`QZsbRhF#MJ1T4+m*etLwb7`GFvNP)CpdX?6Ucc-`^m>OC;=`@9c-vTDz2C$Rsc zfahuhmO=RizhgzAUI2&5zk7D=)qu6EI~$(sPEds32wFkpnb}~Y7};i$T_kwO0%sWY g5jew2d> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _135; +} + +float2 read_f32_point(uint ix) +{ + float x = asfloat(_574.Load(ix * 4 + 0)); + float y = asfloat(_574.Load((ix + 1u) * 4 + 0)); + return float2(x, y); +} + +float2 read_i16_point(uint ix) +{ + uint raw = _574.Load(ix * 4 + 0); + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 
16); + return float2(x, y); +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _111.Load(offset * 4 + 8); + return v; +} + +TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TransformSeg s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _111.Store(offset * 4 + 8, val); +} + +void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.p0.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.p0.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.p1.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.p1.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.p2.x); + write_mem(param_12, param_13, 
param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.p2.y); + write_mem(param_15, param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = asuint(s.p3.x); + write_mem(param_18, param_19, param_20); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = asuint(s.p3.y); + write_mem(param_21, param_22, param_23); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = asuint(s.stroke.x); + write_mem(param_30, param_31, param_32); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = asuint(s.stroke.y); + write_mem(param_33, param_34, param_35); +} + +void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2); + PathCubicRef _458 = { ref.offset + 4u }; + Alloc param_3 = a; + PathCubicRef param_4 = _458; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5); +} + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + 
float4 _529 = c.bbox; + float2 _531 = min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +Monoid monoid_identity() +{ + return _567; +} + +uint round_down(float x) +{ + return uint(max(0.0f, floor(x) + 32768.0f)); +} + +uint round_up(float x) +{ + return uint(min(65535.0f, ceil(x) + 32768.0f)); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = _574.Load(((_639.Load(52) >> uint(2)) + (ix >> uint(2))) * 4 + 0); + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 9u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + TagMonoid _716; + _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0); + _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4); + _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8); + _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12); + _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16); + tm.trans_ix = _716.trans_ix; + tm.linewidth_ix = _716.linewidth_ix; + tm.pathseg_ix = _716.pathseg_ix; + tm.path_ix = _716.path_ix; + tm.pathseg_offset = _716.pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix 
= (_639.Load(60) >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.Load(56) >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + TransformSegRef _769 = { _639.Load(36) + (tm.trans_ix * 24u) }; + TransformSegRef trans_ref = _769; + PathSegRef _779 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; + PathSegRef ps_ref = _779; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + Monoid local[4]; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12); + } + } + } + float linewidth = asfloat(_574.Load(lw_ix * 4 + 0)); + Alloc _865; + _865.offset = _639.Load(36); + param_13.offset = _865.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(min(p0, p1), max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _935 = bbox; + float2 _938 = min(_935.xy, p2); + bbox.x = _938.x; + bbox.y = _938.y; + float4 _943 = bbox; + float2 _946 = max(_943.zw, p2); + bbox.z = 
_946.x; + bbox.w = _946.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _971 = bbox; + float2 _974 = min(_971.xy, p3); + bbox.x = _974.x; + bbox.y = _974.y; + float4 _979 = bbox; + float2 _982 = max(_979.zw, p3); + bbox.z = _982.x; + bbox.w = _982.y; + } + else + { + p3 = p2; + p2 = lerp(p1, p2, 0.3333333432674407958984375f.xx); + p1 = lerp(p1, p0, 0.3333333432674407958984375f.xx); + } + } + else + { + p3 = p1; + p2 = lerp(p3, p0, 0.3333333432674407958984375f.xx); + p1 = lerp(p0, p3, 0.3333333432674407958984375f.xx); + } + float2 stroke = 0.0f.xx; + if (linewidth >= 0.0f) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth >= 0.0f); + Alloc _1071; + _1071.offset = _639.Load(28); + param_15.offset = _1071.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = 0.0f.xxxx; + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 
0u; i_3 < 9u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 4u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1241 = i_4 == 3u; + bool _1248; + if (_1241) + { + _1248 = gl_LocalInvocationID.x == 511u; + } + else + { + _1248 = _1241; + } + if (_1248) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + _111.Store(bbox_out_ix * 4 + 8, round_down(param_25)); + float param_26 = m.bbox.y; + _111.Store((bbox_out_ix + 1u) * 4 + 8, round_down(param_26)); + float param_27 = m.bbox.z; + _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27)); + float param_28 = m.bbox.w; + _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28)); + bbox_out_ix += 4u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1300 = m.bbox.z > m.bbox.x; + bool _1309; + if (!_1300) + { + _1309 = m.bbox.w > m.bbox.y; + } + else + { + _1309 = _1300; + } + if (_1309) + { + float param_29 = m.bbox.x; + uint _1318; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1318); + float param_30 = m.bbox.y; + uint _1326; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1326); + float param_31 = m.bbox.z; + uint _1334; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, 
round_up(param_31), _1334); + float param_32 = m.bbox.w; + uint _1342; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1342); + } + bbox_out_ix += 4u; + } + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl new file mode 100644 index 0000000..25d001f --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.msl @@ -0,0 +1,699 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint 
trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct ParentBuf +{ + TagMonoid_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + 
return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +static inline __attribute__((always_inline)) +float2 read_f32_point(thread const uint& ix, const device SceneBuf& v_574) +{ + float x = as_type(v_574.scene[ix]); + float y = as_type(v_574.scene[ix + 1u]); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +float2 read_i16_point(thread const uint& ix, const device SceneBuf& v_574) +{ + uint raw = v_574.scene[ix]; + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_111.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +TransformSeg TransformSeg_read(thread const Alloc& a, thread const TransformSegRef& ref, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_111); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_111); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_111); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_111); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_111); + TransformSeg s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + 
return s; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_111.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void PathCubic_write(thread const Alloc& a, thread const PathCubicRef& ref, thread const PathCubic& s, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.p0.x); + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.p0.y); + write_mem(param_3, param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.p1.x); + write_mem(param_6, param_7, param_8, v_111); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.p1.y); + write_mem(param_9, param_10, param_11, v_111); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.p2.x); + write_mem(param_12, param_13, param_14, v_111); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.p2.y); + write_mem(param_15, param_16, param_17, v_111); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = as_type(s.p3.x); + write_mem(param_18, param_19, param_20, v_111); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = as_type(s.p3.y); + write_mem(param_21, param_22, param_23, v_111); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26, v_111); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29, v_111); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = as_type(s.stroke.x); + write_mem(param_30, param_31, param_32, v_111); + Alloc param_33 = a; + uint 
param_34 = ix + 11u; + uint param_35 = as_type(s.stroke.y); + write_mem(param_33, param_34, param_35, v_111); +} + +static inline __attribute__((always_inline)) +void PathSeg_Cubic_write(thread const Alloc& a, thread const PathSegRef& ref, thread const uint& flags, thread const PathCubic& s, device Memory& v_111) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + PathCubicRef param_4 = PathCubicRef{ ref.offset + 4u }; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5, v_111); +} + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = fast::min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = fast::max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +static inline __attribute__((always_inline)) +Monoid monoid_identity() +{ + return Monoid{ float4(0.0), 0u }; +} + +static inline __attribute__((always_inline)) +uint round_down(thread const float& x) +{ + return uint(fast::max(0.0, floor(x) + 32768.0)); +} + 
+static inline __attribute__((always_inline)) +uint round_up(thread const float& x) +{ + return uint(fast::min(65535.0, ceil(x) + 32768.0)); +} + +kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_tag[512]; + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = v_574.scene[(_639.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))]; + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 9u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _713 = gl_WorkGroupID.x - 1u; + tm.trans_ix = _710.parent[_713].trans_ix; + tm.linewidth_ix = _710.parent[_713].linewidth_ix; + tm.pathseg_ix = _710.parent[_713].pathseg_ix; + tm.path_ix = _710.parent[_713].path_ix; + tm.pathseg_offset = _710.parent[_713].pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.conf.pathseg_offset >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.conf.linewidth_offset >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = 
tm.path_ix; + TransformSegRef trans_ref = TransformSegRef{ _639.conf.trans_alloc.offset + (tm.trans_ix * 24u) }; + PathSegRef ps_ref = PathSegRef{ _639.conf.pathseg_alloc.offset + (tm.pathseg_ix * 52u) }; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + spvUnsafeArray local; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5, v_574); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6, v_574); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7, v_574); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8, v_574); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9, v_574); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10, v_574); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11, v_574); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12, v_574); + } + } + } + float linewidth = as_type(v_574.scene[lw_ix]); + param_13.offset = _639.conf.trans_alloc.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14, v_111); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(fast::min(p0, p1), fast::max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _935 = bbox; + float2 _938 = fast::min(_935.xy, p2); + bbox.x = _938.x; + bbox.y = _938.y; + float4 _943 = bbox; + float2 _946 = fast::max(_943.zw, p2); + bbox.z = _946.x; + bbox.w = _946.y; + if (seg_type == 3u) + { + p3 = 
((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _971 = bbox; + float2 _974 = fast::min(_971.xy, p3); + bbox.x = _974.x; + bbox.y = _974.y; + float4 _979 = bbox; + float2 _982 = fast::max(_979.zw, p3); + bbox.z = _982.x; + bbox.w = _982.y; + } + else + { + p3 = p2; + p2 = mix(p1, p2, float2(0.3333333432674407958984375)); + p1 = mix(p1, p0, float2(0.3333333432674407958984375)); + } + } + else + { + p3 = p1; + p2 = mix(p3, p0, float2(0.3333333432674407958984375)); + p1 = mix(p0, p3, float2(0.3333333432674407958984375)); + } + float2 stroke = float2(0.0); + if (linewidth >= 0.0) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth >= 0.0); + param_15.offset = _639.conf.pathseg_alloc.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18, v_111); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = float4(0.0); + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 9u; i_3++) + { + 
threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.conf.bbox_alloc.offset >> uint(2)) + (path_ix * 4u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1241 = i_4 == 3u; + bool _1248; + if (_1241) + { + _1248 = gl_LocalInvocationID.x == 511u; + } + else + { + _1248 = _1241; + } + if (_1248) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + v_111.memory[bbox_out_ix] = round_down(param_25); + float param_26 = m.bbox.y; + v_111.memory[bbox_out_ix + 1u] = round_down(param_26); + float param_27 = m.bbox.z; + v_111.memory[bbox_out_ix + 2u] = round_up(param_27); + float param_28 = m.bbox.w; + v_111.memory[bbox_out_ix + 3u] = round_up(param_28); + bbox_out_ix += 4u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1300 = m.bbox.z > m.bbox.x; + bool _1309; + if (!_1300) + { + _1309 = m.bbox.w > m.bbox.y; + } + else + { + _1309 = _1300; + } + if (_1309) + { + float param_29 = m.bbox.x; + uint _1318 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + float param_30 = m.bbox.y; + uint _1326 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + 
float param_31 = m.bbox.z; + uint _1334 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + float param_32 = m.bbox.w; + uint _1342 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + } + bbox_out_ix += 4u; + } + } +} + diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil new file mode 100644 index 0000000000000000000000000000000000000000..81448e70c67c30ad935a58050ef5752e800ec0d8 GIT binary patch literal 4700 zcmeHKdsGzH8NahLJ2Sg1y92xEtUR0*A%JLx4G$GeXCERjb*cDB>S<;{32A_WC;{{| zvkzD#SQj;sXiOJl2szD3R;4wV)UyjD-4IlKG>53Qph#^_RAOynQj_-13dW|V{?l`M z(sOdp`R@I`-}l|`es|{1{_dLB7)@VlFUVJpuJ|DLqnk%xlfAbd0RXHG2Y`TM7L)?f zhEfg%gUNN#c}y}Y|DyOtQ+F&y&3kOxxmAxvyulUpDzKtZ1d z;zpD2qD}!oFT_IZOfbJ1`UqQWn<0SZ;*@^D4#cU5laksP?73n=vshx>5n?<*={567 zI$y5;adVDa!1%!kPGLCds5qZBU3;&$q)o=WdKNmn0{7{j9J%+?VNSM z3A@YA4!0(}prtj$d7ywQufce?fbf)#&IWmG^DYE%oeC5fYcTTl%}pQv%Ik99p~{*X zIscwmvc)e*JZzb2$4|6>YMW4;#|a1ysN=_F>bmRH&V7ukgB;>q1hF3l;8 zf6@-jk>SS2nyox3^sg>hI*1|e^x2L})JY%Z!#iy8%Vz*U_71_-g~HE*}UW%&s{Hn^V)OU%6|zbug&U*fu{$?nwvjp z?Ag}w{PBZ1HG5y#yO#m<*3^unJu-FMvJ?@m3W&p4WBurw-8zV~K9 z<-BXhEAqHhU0=g3&E&xGYuo0viYY4de1CvB5^G-B`JtmBzXnzv-fEbx&)R+P$ij@H`W`qf zYVzF{!toJOi0U7WD%Bv1noaYaeXT};sIMbx+N%}gKy{5$-V&6LD2y?^{^%1OQKhUj z80Lr^L>C22i(1h|gLSPNtBB1cQJ>?gsv;Vy(Aqjz730zlhABUFD@R#XC9AqTp`5TN zAEElAFY2O(b$q!*J0%_RO)N73v74g@rnt{KkaFhbv_NSELtg7`s> zNI4+Kup&nzDzJUMoqFq(_xAN~$X|b1my|dWP$2c!F$CI-17tX{I9vk$^dW`euru-H(9(;!@sEjD+L3uY0x}36}Ep& zp5lp;@8<^GVd_pwXRygxi<|1;YzQiq(@^(h4Ls}N(7gS1XkLbAXduI5kvocHtuRk4 zE(3Ocq{|l9ZlmiatvCE}9z?rk(()qd9>Wa5-)ut!hNd@o4@{}m-Y$c*tSS~KVGind z{%n?K_fR0>4P+I^Nh_utrw}^9plyxEIO)reGq}EMBK(Q2u4c&07#$uDwqdETO&$xAnQrg?0N7V@lJ zf=2|9#~Gi;zJLqC=%NvPVLqiO0_jM%k+mQcmxQ4aKnq?K!Ajs1K`$~y_&F($b1~R@ zCsH&b&nOlEL*j*VoxL-#&frGw*dAx)`YxcEtU~cqOQWa7GBjEK7F&dUq1iB5J|I?; 
zNm}V-`KBmd*R_7SSF*+uLesm>gskx=cS&Do9J3_9@dyU#VPP7{b_vslBnTeyuKk5R z!_t^tQrxnW<|5XCQv5UE7;O@wef$ds(muJ~E#0&+YolTIxNeP=k~L7PX!HJc5?*Y z&TK7)a9*!RqDb;hcCATKT@riJ>@K@KDY-pPT*y}CuzDmcnL$INF=$lXHMTJ zrvlRM?^e%AQ5o_zb5DiOEmTDeVUu{jI@zGI> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint scene_ix = (_139.Load(52) >> uint(2)) + ix; + uint tag_word = _151.Load(scene_ix * 4 + 0); + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 4u; i++) + { + tag_word = _151.Load((scene_ix + i) * 4 + 0); + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _239.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _239.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + _239.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _239.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _239.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + } +} + 
+[numthreads(128, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl new file mode 100644 index 0000000..edb6d03 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -0,0 +1,143 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct OutBuf +{ + TagMonoid_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(128u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = 
n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _239 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_scratch[128]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; + uint tag_word = _151.scene[scene_ix]; + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 4u; i++) + { + tag_word = _151.scene[scene_ix + i]; + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _239.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _239.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + 
_239.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _239.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _239.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + } +} + diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil new file mode 100644 index 0000000000000000000000000000000000000000..1f27f265c5be01104cb77bab185d0e73358e5fd2 GIT binary patch literal 4716 zcmeHLe^gV~9ltN{sgMu>a))#&iEzlsAirV%8 zk)#?mDp-`d1w{{SwM}bRrM8}gK#deFI@?42L17}Jom#5f(bMDB-4|fpwmBOT`Xz!i*u@GcYSr`z?TRD04SdZfB@EMFdJcB$#F1W zfVmxZcwkb4fsL*Jxu)jIUWF9EK^wt+gwcd zr{GC|-A69X)8_ec02sgr@4~@`f-EEFgoHM3$m{f;K@|bO>yQhsrvRJ=$Y2)KMefl0 zu`HZa&)bSAc?kd>$NbUTA}?W9_sB2;FJ)JyD{PC2E|upb1WYTZbt*tug? zw@I5@=NN9dwi%n9f#WcEbeC%)PIfh_y~i~{CIDG* z49dk)&*&(lurUap#48j4QIJ#5$_%24H(|SGwOe6v)!OEiPcv?I1Y6sUWN|gdI9ZG# zRmu#oh_*K&Ky!#dfvy@8zP_RHli#>)&U<88<0g%FZ=`Uum*u&3oLF`IUeE^N>8FEM zMMXc{S&;`PiQ!`e*qk29ay!0xRO4B7UY^@E^7eRRjx~vps+Lt^M>(nElGT=Ixh6`5`THEDF}%$5jA-!?(% zDO6G`q+pyANqI}D0aMq69*Jkc)Lmu#4FI7gtboUP3gI!6fK@HVFwZ0a`d8BHFg#)Yc6G&X$uM>g1>Q{msKKo;ml?>E?Gjva1iYUSggf6u^b!MF1~0;iVS5%!#jd|D6Ugp}K~?yGDx%9gvy^@%^}~kLe*OIS z(6kR5(yVA|-?+Sn5o{>9DoHh$1sEHBO+~q^JFR-?!g|;fCZB2`) z&2jO~A@QF*;_Eb8=x+-jX$$WQl$Xuny{ho;x`->;wF5-5-RQkwKBb~Lrg z_ZXb?UVZ8XG}Y~M_*H_97IY11tG1xrS{R$fZ+oFbu)f8cHo{6ODe*0nc-SrG8lZ}T z?=##xKEUg~4FAi%?bpoTDMPTcE^q9-jg=>_e3KX|BtBrKKd`$n>Ve(G^B&k;n*PA< zvc`Vm)o;>X0tTtha7YuvwKdcE(q=Jz*I#i-H*e+1+jkN8PWZa`x0X3n@5wE9C&%**B3Z#YZpyu-Rn@u?$;F{epz<8mDhW|hbr{}FMGKdwGZP|r@n4A?bxyH+G)e2}2;zYpk}dO@ zLVS3AOU$!+X%<*pm05k@>g~@*cjI?5mXBr#)QyVhQ)RJ7Lbe?tipNUIO8C}AF)?L& zH&n(|K3CP1M%C>PLVS0g&mS1LesSe_fA8h7Lzs3trzYyo&i9g!%|^~yiBEgxI?g~5 zSj1d_O0q{E92vLk+4YfeMXl4Q5~w93y`zI)k9^zXK7Ks__G zaiP27GkYDvQc7i^#gZTm^*FkW2An=X%XT1rF@1(&kRS@sy3oGQynZ{vgSLieoKba2 zdr2_Q;oYXBfUsWMj)%+E9tc8A8OI`VA@oiD)_VtIXlT@INiAdnCB&uS_By%4Tc!i@ zt2ZV+Rh#g928RRPw@s((!ig}lE&-%UJ`3;jx)CXjLi!vM!O?0+=SN)NXo{o1s0+~X z;3($R@IByVlE?#cw}1o43mhm0eOlUtxRG%jk}ioS0#lSz9ZX1ZH$XHbF)P=Fs)b)O 
zi7K#q9970^0Esnp$Zrl2yfkHwkX4U^41xrhm(uYUm>L*KhK%RhnaWC03}|WlvNp*@ zYP{i`6~shKm`Rt*0Y62kGeeilW)8Wg?{5~QSoo{1A>v<|SJbj-feX@<;*FhiV`3WC zB*h$RSn_ja64g`)8bj<>C2K}`p*S@dKO5<1eMRKm@!L&R;) z;5f;PuHYl?Tm6C)Lm~(~!EbZmBf#|x1;3-=EAv?GWD!Svs}X-9(x)|{N4rSLX#Zil z+c!T2(yaJXHqM# z6mqL!y&ULX$Sq`eI?4=Wd6?rWkAQ9|p&OZQ4}dqox!Y~qsH|LvHi3p;;3GnjcnDdh ztkgx-Pj7532r_RPjxH!HFdu(qHwEdgC{e7(#Umz*mM9G#3$AHoMJ}F6JTIOD%mhdd z09IDCn+oj!BiO2j;%jXWubov;J7(KQE}QvX-J^}owwK0&3L(95|37@Xc#c@AgI7OU z&|{BiOx_&7U5mnxow4GajFwt^qzbcC5ub~!@J#ZR!?Jj(y+@@eR4LTCyn-qIxo>4^ zsiP-eQ53I8$ki{N;w^|GLF(+$DvGs=#9YJQ`6{U*QQFj#t0>7;%rh%iavMek!woa- n1ZW|AS^%f@PvLX!zE7+euLwOpbV= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _211 = ix + i_2; + _78.Store(_211 * 20 + 0, m.trans_ix); + _78.Store(_211 * 20 + 4, m.linewidth_ix); + _78.Store(_211 * 20 + 8, m.pathseg_ix); + _78.Store(_211 * 20 + 12, m.path_ix); + _78.Store(_211 * 20 + 16, m.pathseg_offset); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl new file mode 100644 index 0000000..923e77c --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.msl @@ -0,0 +1,146 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored 
"-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct DataBuf +{ + TagMonoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup TagMonoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].trans_ix = 
_78.data[ix].trans_ix; + local[0].linewidth_ix = _78.data[ix].linewidth_ix; + local[0].pathseg_ix = _78.data[ix].pathseg_ix; + local[0].path_ix = _78.data[ix].path_ix; + local[0].pathseg_offset = _78.data[ix].pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _109 = ix + i; + TagMonoid param = local[i - 1u]; + param_1.trans_ix = _78.data[_109].trans_ix; + param_1.linewidth_ix = _78.data[_109].linewidth_ix; + param_1.pathseg_ix = _78.data[_109].pathseg_ix; + param_1.path_ix = _78.data[_109].path_ix; + param_1.pathseg_offset = _78.data[_109].pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _211 = ix + i_2; + _78.data[_211].trans_ix = m.trans_ix; + _78.data[_211].linewidth_ix = m.linewidth_ix; + _78.data[_211].pathseg_ix = m.pathseg_ix; + _78.data[_211].path_ix = m.path_ix; + _78.data[_211].pathseg_offset = m.pathseg_offset; + } +} + diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index b3f417e..e3786fc 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -17,7 +17,7 @@ //! The path stage (includes substages). 
use piet_gpu_hal::{ - BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, ShaderCode, + BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, }; pub struct PathCode { @@ -55,7 +55,7 @@ const CLEAR_WG: u32 = 512; impl PathCode { pub unsafe fn new(session: &Session) -> PathCode { // TODO: add cross-compilation - let reduce_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_reduce.spv")); + let reduce_code = include_shader!(session, "../../shader/gen/pathtag_reduce"); let reduce_pipeline = session .create_compute_pipeline( reduce_code, @@ -67,15 +67,15 @@ impl PathCode { ], ) .unwrap(); - let tag_root_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_root.spv")); + let tag_root_code = include_shader!(session, "../../shader/gen/pathtag_root"); let tag_root_pipeline = session .create_compute_pipeline(tag_root_code, &[BindType::Buffer]) .unwrap(); - let clear_code = ShaderCode::Spv(include_bytes!("../../shader/gen/bbox_clear.spv")); + let clear_code = include_shader!(session, "../../shader/gen/bbox_clear"); let clear_pipeline = session .create_compute_pipeline(clear_code, &[BindType::Buffer, BindType::BufReadOnly]) .unwrap(); - let pathseg_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathseg.spv")); + let pathseg_code = include_shader!(session, "../../shader/gen/pathseg"); let pathseg_pipeline = session .create_compute_pipeline( pathseg_code, From 70723bf076e1f5153a2581f32175460795165a30 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 1 Dec 2021 15:46:36 -0800 Subject: [PATCH 4/4] Very minor reorg Move transform stage into its own file, cargo fmt. 
--- piet-gpu/src/stages.rs | 172 +--------------------------- piet-gpu/src/stages/path.rs | 2 +- piet-gpu/src/stages/transform.rs | 188 +++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 171 deletions(-) create mode 100644 piet-gpu/src/stages/transform.rs diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs index 2fe4e4f..59e8b50 100644 --- a/piet-gpu/src/stages.rs +++ b/piet-gpu/src/stages.rs @@ -17,15 +17,12 @@ //! Stages for new element pipeline, exposed for testing. mod path; +mod transform; use bytemuck::{Pod, Zeroable}; -use piet::kurbo::Affine; -use piet_gpu_hal::{ - include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, -}; - pub use path::{PathBinding, PathCode, PathEncoder, PathStage}; +pub use transform::{Transform, TransformBinding, TransformCode, TransformStage}; /// The configuration block passed to piet-gpu shaders. /// @@ -50,168 +47,3 @@ pub struct Config { pub linewidth_offset: u32, pub pathseg_offset: u32, } - -// The individual stages will probably be separate files but for now, all in one. - -// This is equivalent to the version in piet-gpu-types, but the bytemuck -// representation will likely be faster. -#[repr(C)] -#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] -pub struct Transform { - pub mat: [f32; 4], - pub translate: [f32; 2], -} - -const TRANSFORM_WG: u64 = 512; -const TRANSFORM_N_ROWS: u64 = 8; -const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; - -pub struct TransformCode { - reduce_pipeline: Pipeline, - root_pipeline: Pipeline, - leaf_pipeline: Pipeline, -} - -pub struct TransformStage { - // Right now we're limited to partition^2 (~16M) elements. This can be - // expanded but is tedious. 
- root_buf: Buffer, - root_ds: DescriptorSet, -} - -pub struct TransformBinding { - reduce_ds: DescriptorSet, - leaf_ds: DescriptorSet, -} - -impl TransformCode { - pub unsafe fn new(session: &Session) -> TransformCode { - let reduce_code = include_shader!(session, "../shader/gen/transform_reduce"); - let reduce_pipeline = session - .create_compute_pipeline( - reduce_code, - &[ - BindType::Buffer, - BindType::BufReadOnly, - BindType::BufReadOnly, - BindType::Buffer, - ], - ) - .unwrap(); - let root_code = include_shader!(session, "../shader/gen/transform_root"); - let root_pipeline = session - .create_compute_pipeline(root_code, &[BindType::Buffer]) - .unwrap(); - let leaf_code = include_shader!(session, "../shader/gen/transform_leaf"); - let leaf_pipeline = session - .create_compute_pipeline( - leaf_code, - &[ - BindType::Buffer, - BindType::BufReadOnly, - BindType::BufReadOnly, - BindType::BufReadOnly, - ], - ) - .unwrap(); - TransformCode { - reduce_pipeline, - root_pipeline, - leaf_pipeline, - } - } -} - -impl TransformStage { - pub unsafe fn new(session: &Session, code: &TransformCode) -> TransformStage { - // We're limited to TRANSFORM_PART_SIZE^2 - // Also note: size here allows padding - let root_buf_size = TRANSFORM_PART_SIZE * 32; - let root_buf = session - .create_buffer(root_buf_size, BufferUsage::STORAGE) - .unwrap(); - let root_ds = session - .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf]) - .unwrap(); - TransformStage { root_buf, root_ds } - } - - pub unsafe fn bind( - &self, - session: &Session, - code: &TransformCode, - config_buf: &Buffer, - scene_buf: &Buffer, - memory_buf: &Buffer, - ) -> TransformBinding { - let reduce_ds = session - .create_simple_descriptor_set( - &code.reduce_pipeline, - &[memory_buf, config_buf, scene_buf, &self.root_buf], - ) - .unwrap(); - let leaf_ds = session - .create_simple_descriptor_set( - &code.leaf_pipeline, - &[memory_buf, config_buf, scene_buf, &self.root_buf], - ) - .unwrap(); - 
TransformBinding { reduce_ds, leaf_ds } - } - - pub unsafe fn record( - &self, - cmd_buf: &mut CmdBuf, - code: &TransformCode, - binding: &TransformBinding, - size: u64, - ) { - if size > TRANSFORM_PART_SIZE.pow(2) { - panic!("very large scan not yet implemented"); - } - let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE; - if n_workgroups > 1 { - cmd_buf.dispatch( - &code.reduce_pipeline, - &binding.reduce_ds, - (n_workgroups as u32, 1, 1), - (TRANSFORM_WG as u32, 1, 1), - ); - cmd_buf.memory_barrier(); - cmd_buf.dispatch( - &code.root_pipeline, - &self.root_ds, - (1, 1, 1), - (TRANSFORM_WG as u32, 1, 1), - ); - cmd_buf.memory_barrier(); - } - cmd_buf.dispatch( - &code.leaf_pipeline, - &binding.leaf_ds, - (n_workgroups as u32, 1, 1), - (TRANSFORM_WG as u32, 1, 1), - ); - } -} - -impl Transform { - pub fn from_kurbo(a: Affine) -> Transform { - let c = a.as_coeffs(); - Transform { - mat: [c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32], - translate: [c[4] as f32, c[5] as f32], - } - } - - pub fn to_kurbo(self) -> Affine { - Affine::new([ - self.mat[0] as f64, - self.mat[1] as f64, - self.mat[2] as f64, - self.mat[3] as f64, - self.translate[0] as f64, - self.translate[1] as f64, - ]) - } -} diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index e3786fc..e233c65 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -17,7 +17,7 @@ //! The path stage (includes substages). use piet_gpu_hal::{ - BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, + include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, }; pub struct PathCode { diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs new file mode 100644 index 0000000..4fb5e9f --- /dev/null +++ b/piet-gpu/src/stages/transform.rs @@ -0,0 +1,188 @@ +// Copyright 2021 The piet-gpu authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! The transform stage of the element processing pipeline. + +use bytemuck::{Pod, Zeroable}; + +use piet::kurbo::Affine; +use piet_gpu_hal::{ + include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, +}; + +/// An affine transform. +// This is equivalent to the version in piet-gpu-types, but the bytemuck +// representation will likely be faster. +#[repr(C)] +#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] +pub struct Transform { + pub mat: [f32; 4], + pub translate: [f32; 2], +} + +const TRANSFORM_WG: u64 = 512; +const TRANSFORM_N_ROWS: u64 = 8; +const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; + +pub struct TransformCode { + reduce_pipeline: Pipeline, + root_pipeline: Pipeline, + leaf_pipeline: Pipeline, +} + +pub struct TransformStage { + // Right now we're limited to partition^2 (~16M) elements. This can be + // expanded but is tedious. 
+ root_buf: Buffer, + root_ds: DescriptorSet, +} + +pub struct TransformBinding { + reduce_ds: DescriptorSet, + leaf_ds: DescriptorSet, +} + +impl TransformCode { + pub unsafe fn new(session: &Session) -> TransformCode { + let reduce_code = include_shader!(session, "../../shader/gen/transform_reduce"); + let reduce_pipeline = session + .create_compute_pipeline( + reduce_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Buffer, + ], + ) + .unwrap(); + let root_code = include_shader!(session, "../../shader/gen/transform_root"); + let root_pipeline = session + .create_compute_pipeline(root_code, &[BindType::Buffer]) + .unwrap(); + let leaf_code = include_shader!(session, "../../shader/gen/transform_leaf"); + let leaf_pipeline = session + .create_compute_pipeline( + leaf_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + ], + ) + .unwrap(); + TransformCode { + reduce_pipeline, + root_pipeline, + leaf_pipeline, + } + } +} + +impl TransformStage { + pub unsafe fn new(session: &Session, code: &TransformCode) -> TransformStage { + // We're limited to TRANSFORM_PART_SIZE^2 + // Also note: size here allows padding + let root_buf_size = TRANSFORM_PART_SIZE * 32; + let root_buf = session + .create_buffer(root_buf_size, BufferUsage::STORAGE) + .unwrap(); + let root_ds = session + .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf]) + .unwrap(); + TransformStage { root_buf, root_ds } + } + + pub unsafe fn bind( + &self, + session: &Session, + code: &TransformCode, + config_buf: &Buffer, + scene_buf: &Buffer, + memory_buf: &Buffer, + ) -> TransformBinding { + let reduce_ds = session + .create_simple_descriptor_set( + &code.reduce_pipeline, + &[memory_buf, config_buf, scene_buf, &self.root_buf], + ) + .unwrap(); + let leaf_ds = session + .create_simple_descriptor_set( + &code.leaf_pipeline, + &[memory_buf, config_buf, scene_buf, &self.root_buf], + ) + .unwrap(); + 
TransformBinding { reduce_ds, leaf_ds } + } + + pub unsafe fn record( + &self, + cmd_buf: &mut CmdBuf, + code: &TransformCode, + binding: &TransformBinding, + size: u64, + ) { + if size > TRANSFORM_PART_SIZE.pow(2) { + panic!("very large scan not yet implemented"); + } + let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE; + if n_workgroups > 1 { + cmd_buf.dispatch( + &code.reduce_pipeline, + &binding.reduce_ds, + (n_workgroups as u32, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &code.root_pipeline, + &self.root_ds, + (1, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + cmd_buf.memory_barrier(); + } + cmd_buf.dispatch( + &code.leaf_pipeline, + &binding.leaf_ds, + (n_workgroups as u32, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + } +} + +impl Transform { + pub fn from_kurbo(a: Affine) -> Transform { + let c = a.as_coeffs(); + Transform { + mat: [c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32], + translate: [c[4] as f32, c[5] as f32], + } + } + + pub fn to_kurbo(self) -> Affine { + Affine::new([ + self.mat[0] as f64, + self.mat[1] as f64, + self.mat[2] as f64, + self.mat[3] as f64, + self.translate[0] as f64, + self.translate[1] as f64, + ]) + } +}