From 6976f877e02fe0103095bc556a1b85f821d5c430 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Tue, 21 Apr 2020 17:55:17 -0700
Subject: [PATCH] Add first draft of kernel 3

A fairly simple approach, but it adds the translation (not tested yet
in scene encoding) and does bounding box culling.
---
 piet-gpu-types/src/ptcl.rs   |   5 +-
 piet-gpu/shader/build.ninja  |   2 +
 piet-gpu/shader/image.comp   |   2 +-
 piet-gpu/shader/image.spv    | Bin 8096 -> 8116 bytes
 piet-gpu/shader/kernel1.comp |  27 ++-
 piet-gpu/shader/kernel3.comp |  72 ++++++++
 piet-gpu/shader/kernel3.spv  | Bin 0 -> 9964 bytes
 piet-gpu/shader/ptcl.h       | 323 +++++++++++++++++++++++++++++++++++
 piet-gpu/src/main.rs         |  32 +++-
 9 files changed, 449 insertions(+), 14 deletions(-)
 create mode 100644 piet-gpu/shader/kernel3.comp
 create mode 100644 piet-gpu/shader/kernel3.spv
 create mode 100644 piet-gpu/shader/ptcl.h
diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index f5e42af..b6df77d 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -4,8 +4,9 @@ piet_gpu! {
     #[gpu_write]
     mod ptcl {
         struct CmdCircle {
-            // In existing code, this is packed; we might need an annotation for this.
-            bbox: [u16; 4],
+            center: [f32; 2],
+            radius: f32,
+            rgba_color: u32,
         }
         struct CmdLine {
             start: [f32; 2],
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 5befa7f..ada8694 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -10,3 +10,5 @@ rule glsl
 build image.spv: glsl image.comp | scene.h
 
 build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h
+
+build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h
diff --git a/piet-gpu/shader/image.comp b/piet-gpu/shader/image.comp
index 60739d5..6d84eb5 100644
--- a/piet-gpu/shader/image.comp
+++ b/piet-gpu/shader/image.comp
@@ -40,7 +40,7 @@ void main() {
         if (tag == PietItem_Circle) {
             PietCircle circle = PietItem_Circle_read(item_ref);
             float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
-            float alpha = clamp(circle.radius - r, 0.0, 1.0);
+            float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
             vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color);
             // TODO: sRGB
             rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
diff --git a/piet-gpu/shader/image.spv b/piet-gpu/shader/image.spv
index 527c9ae2b14983aaa723ab098d6d68b77092cab6..097add185fc152cd7131529812585e68b2eebfd6 100644
GIT binary patch
delta 1542
zcmYk6xldG46vpo`Fam-i;EL?xE^a6)vJAK|jz%o42yVm>64aQOSUe2c+E7@T*iss7
z?KIjF6JzJ!U?+vq-|z8moaY?x_kCx%=iK`q??yk2Hs-5(H-(T3wfwvBzHzQ_ds{Aq
z2eZ?UXYb5}nvf4Y@FyXFYr;{mj_HM&yL0#NrP?T3X?|w*{`k`egoOgCn$JUZR_hd(
z*9y;Xjo>MuRAZABk-c!At3;ZePjPi9f_qYrz75_DM^5?m)wi3z<&y?E&k;hlu{NfB
zadanqD_G4fyZFBe&djN<zY88)(%+r=)nmPb;1H4t`DxHecr9zjlW-QH>Aw($;Je`M
zOcH1A$Yztf(HLtLG0r=bapzUDuuWZpyHn(4xceN%`<3-Nyqx2-%BznW4&?eA&Kq<<
zYdVZK7<rw!s~;Pe^2rO1-*c(M{g<_tb(j>jdhj3Cr7UJM+&J|H`Y~n;m>;c$px-<2
z+(#I*z^cDUzFRhf*I5TJ_h7Unv>@tefO<)-7{4%v>9dftsGIa1eLMIEtK6I&m@zkL
zO8?GGQ}=Q4JmX-y5Nxg(lA5rBV>iSCZz_WJz|~)6)3<?5RG*?*H`)tUkLIbUx1;e3
z{g}<#kg*P9<NDWW>G$$Vb8lY<>?i6XJ9*umaJ6S(GxT*X|7vbadQr{4mG!g_uI3Z9
zzD!e&+Wt)Q5T<wyJhTIt`Nr$_)?QOr-yuxDZ+yI%!#QpXo|*ykoPHPU5e#SAs&AP*
zUimSwT8X^u%0oX~J;of*Vx}r$PN1oO%8uj|_(U$*_~t%m;%P*S>5Rn&z;3uuQS1y{
zJr)}TtNFRcTIayl(-&)<2U{z?6@^KPUO><hi(Le(4Pmj^FxZ0nyg>hAs>Q<N%WJJI
z$pp$c_A(piGFZ*8Jw{#ue__|k^Sg?s9$%M9u(|5-j7wm5E%HK@$-gGX{Pc{ct3oQ4
KRv&I3tp5kt!)`?Y

delta 1533
zcmYk6%S%;J6o>cq=1om2Ez?TPNA}Xn(y}tUS9|#g(qW^pQV9{P2!e<sSkopB8U;-o
zhBa(v)HH%<R`gfoAn5zfJ)1n;-Tpn+UVE**&+&cnO|dRt(X};%T&UvTjdyhmg<Csw
zA<WI+emH+;CRB!e=zu>80bCh+z#6xgX6`P`-b=MhXyc1B^RuIm=Lic0R5hRb>dfXT
zt$r;$ogT(1plo4`ZHnxI`&=i|)%g@}2_<l6>gbhu2OK%&lWVWHe#<A@<UBou)HBzn
ze7V>J-wsxD%Xa?X0jK9w*FOM{CF!?kzIx1e7#u=!S>6xogI6(UJPBtIcK#Q_1$Z;O
zl}=*MZCP(}H!3n$5M#eX882m;i7o0f+?^tyfV)pG?ic1s_(YD=npk^KdpOr^bKam0
zn$u>y!N{x8d)7YIt>lwoY|o_{`!Dkf*CDTgS25IU*;}m+%#Sk4zUS>3U!tV~v;GL{
zO<xb*U>;z&hix>Vsn^21k6pPiktQr<rirW@BVF_zeK+_Avq@ak$S~$6Md|O!w8que
zhNh$mW+|5O7PlE}^s6j`ZR~}szsOqM2X>wM6s4JUKUh5uk3hWzjbAvyurhZy*2<}}
zBDAc2Xy{2=k$w`g%ocXu23LCmHdIe-gB8EPf0%zW+oaoc;@R|cR$>=i%_nLHGfg>a
z-DNFF->50>+Y>v)Fo6pU@XB72Snmi!-&Z?c%TX}Dbm%Wql%K&dhMo=hHp%04_kz{N
zSvMS#@_ipi(1<}NvY@Fns2p<=O?^E(sZ(IJKE|g^I}J9MUd+`GcEhDjxd!0s>q)lo
zEJV@oGA0`Yn@}$%I|nvdd`rBt^KkW8*brFFZ#w3>2sWRdr{Z5mwIn7UfqW*}=8i|v
z)SqWXUIDB5k;ipc!LBPGXZfVAp{d7r>N?o9>hTL315-pNi3(xT1PCRDpP>t^Qf+MQ
IS>s^MKkoc(nE(I)

diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp
index 436b8bd..3a4156c 100644
--- a/piet-gpu/shader/kernel1.comp
+++ b/piet-gpu/shader/kernel1.comp
@@ -1,3 +1,16 @@
+// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
+// and outputs "instances" (references to item + translation) for each item
+// that intersects the tilegroup.
+//
+// This implementation is simplistic and leaves a lot of performance on the
+// table. A fancier implementation would use threadgroup shared memory or
+// subgroups (or possibly both) to parallelize the reading of the input and
+// the computation of tilegroup intersection.
+//
+// In addition, there are some features currently missing. One is the use of
+// a bump allocator to extend the current fixed allocation. Another is support
+// for clipping.
+
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
@@ -18,10 +31,10 @@ layout(set = 0, binding = 1) buffer TilegroupBuf {
 // TODO: compute this
 #define WIDTH_IN_TILEGROUPS 4
 
-#define TILEGROUP_WIDTH 512
-#define TILEGROUP_HEIGHT 16
+#define TILEGROUP_WIDTH_PX 512
+#define TILEGROUP_HEIGHT_PX 16
 
-#define INITIAL_ALLOC 1024
+#define TILEGROUP_INITIAL_ALLOC 1024
 
 #define MAX_STACK 8
 
@@ -35,8 +48,8 @@ void main() {
     StackElement stack[MAX_STACK];
     uint stack_ix = 0;
     uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
-    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * INITIAL_ALLOC);
-    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH, TILEGROUP_HEIGHT);
+    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
+    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
     PietItemRef root = PietItemRef(0);
     SimpleGroup group = PietItem_Group_read(root);
     StackElement tos = StackElement(root, 0, group.offset.xy);
@@ -45,8 +58,8 @@ void main() {
         if (tos.index < group.n_items) {
             Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
             vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
-            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH))
-                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT));
+            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
+                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
             bool is_group = false;
             if (hit) {
                 PietItemRef item_ref = PietItem_index(group.items, tos.index);
diff --git a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp
new file mode 100644
index 0000000..f9f9362
--- /dev/null
+++ b/piet-gpu/shader/kernel3.comp
@@ -0,0 +1,72 @@
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+layout(local_size_x = 32, local_size_y = 1) in;
+
+layout(set = 0, binding = 0) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+// TODO: this should have a `readonly` qualifier, but then inclusion
+// of tilegroup.h would fail because of the writers.
+layout(set = 0, binding = 1) buffer TilegroupBuf {
+    uint[] tilegroup;
+};
+
+layout(set = 0, binding = 2) buffer PtclBuf {
+    uint[] ptcl;
+};
+
+#include "scene.h"
+#include "tilegroup.h"
+#include "ptcl.h"
+
+// TODO: compute all these
+
+#define WIDTH_IN_TILEGROUPS 4
+#define WIDTH_IN_TILES 128
+#define TILEGROUP_WIDTH_TILES 32
+#define TILE_WIDTH_PX 16
+#define TILE_HEIGHT_PX 16
+
+// Must be the same as kernel1. Might be a good idea to move these particular
+// constants to their own .h file.
+#define TILEGROUP_INITIAL_ALLOC 1024
+
+#define PTCL_INITIAL_ALLOC 4096
+
+void main() {
+    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
+    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
+    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
+    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+
+    while (true) {
+        uint tg_tag = TileGroup_tag(tg_ref);
+        if (tg_tag == TileGroup_End) {
+            break;
+        }
+        // Assume tg_tag is `Instance`, though there will be more cases.
+        Instance ins = TileGroup_Instance_read(tg_ref);
+        PietItemRef item_ref = PietItemRef(ins.item_ref);
+        uint item_tag = PietItem_tag(item_ref);
+        switch (item_tag) {
+        case PietItem_Circle:
+            PietCircle circle = PietItem_Circle_read(item_ref);
+            vec2 center = ins.offset + circle.center.xy;
+            float r = circle.radius;
+            if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
+                && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
+            {
+                CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
+                Cmd_Circle_write(cmd_ref, cmd);
+                cmd_ref.offset += Cmd_size;
+            }
+            break;
+        }
+        tg_ref.offset += TileGroup_size;
+    }
+    Cmd_End_write(cmd_ref);
+}
diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv
new file mode 100644
index 0000000000000000000000000000000000000000..23a7c3efd84eec32a179cc6d5b1047f2ecd6af2e
GIT binary patch
literal 9964
zcmaKx2ar|e8HNvRk*0JD2vQWKN)-@6`nrmOfE9IFw&2EPcip{9u^<R)(8QKWF*7Dn
zV~Q!pBpTCaOqoeBGbS-H-SpmLFUj+qbH2TY>r9UO@xR~u)xX_ycdhC_`G_Rxmki*)
zxAjZXXL!;dCP@Y-wdq*5ZqvG1J*DPZ^B2t4;i#l4_0wl4J~fQh$d1Og4h>ZphQZ4X
zM<yWpDpmZ~pYaKl>SRz-x2~aX{rb9fYZ{umx*B>)d)iwX+S@u>8k*a>TbfF3J6rS{
zQ1EN(Xln0mE;`iUQLJA>M@%Z&(Am~e+St+xOuLlx>1=K7X(=UmRWrxjkI|0ReRl8B
zK8U&I_J;13#^zIcO5MFprJ0d=X78LiOYo~oj>NB<QZ-~5W=va4X;oWyQ+rF+fU6|$
zQ?6f=j6g5@+Cy#1>DRq|TVq30XM1ON*1x{4VN**-sinZVp52YjZM{AAd}J~jy`1y#
z#*1xb-5XPtj50^se@${s9$#N-*<ttgF6UG3eMmAE-8%X;lp449txnDn8zs^jWAM*v
z9F?3<jLjD64UOjY56dyWFGo!>5nSKVQ)=vJN>|xF<$TJ0)+CeB!%v%QEa%f^?>5C_
zyu52%jvtatL67*unhdfqQP=V${Ihx1CZ~Wmx3#y_b$9l5;fI>mk@qRqnStKtTU^jS
zvRueI)A7&h3{AY4eQpiW*kRrFnaf$Sy+5@tPi-<6ylO{t@#v=g%KMc2tx4vi_xXZd
zKRKV`8I^NQ<BYoga_PA(obeh?;bQo{oDI9W*`GL8eRTAnU&cOu=FZl+Gh62zc3u`*
zH$B%?$s#aUgf6R-C0QL(Wqqsrl;<}%ITM}a4Qo1@-_S|)V2pdQj4`V-B3Yfs<;S{m
z5)t};+c)RRzMNIbDsa|&RZ<6T+nvqhJbfB>rJu`w?rlKd#NxNC?Cm>iay~usjON~$
zGFgsGz~q=CrEM!?)Vc~BwK~D&TGdG@)8{dyan~}&`W^&#HFh`dNNv^0J%{O4$-U^2
z_dakrZ*B4bx`%bUccyP18ILI`g$-=K4@&a5uMaVn=Te<KoayQD)%P*<=<!K#xyP#H
zDfH;+2jC5*rgqDFuGM|I;2h2MjDW|wuQHZ%RVS}i=w1N(ZlFL7?=#<#+uIxJ+B>&3
zw%2#;>}+Z*wRLvXdv<(>49We|H83dUtCG?1yZ6jaYmU=rd-myN50lWFIIr0`tWIWS
zc0QT29#2It_gIxIL~rZpNpmea%vO^u(N_3u^lY6OT$(Y<x%{xW>SQf?Q~KV^>~$6P
zv$f~%FV}DbdUrmz-P(3Er~T}$h}&18-%-%(nEQHi)FOTlIE#Pd%paTT*}VYveeTy;
z8@&oK-uo~K@d<rmrl$J+;Bx+&^o(kr+~-r3a*@w7S<I(yKH5(6J<V9m7w4;(PcHI#
z){6Pm&DS5XU(b`@U5?N7yL0_G^pSY!H<4`S6OSgw`V$bJ@mTzRlyQFJN>W|!Xlw%+
zCm_cVH;7T)JjPj9ZX&iJjOxZ~8^#zp{Vr%ex$*4sh}@>X`HUaQD6g;ae(#8Q_tJPd
zeVv;&dG&FO#;O}TkuhQ?fsK_LkFV>jE&4tYET^BbzMms@D%e;#>rBgS^6Ik~ja4^x
zE@Q;{;b^R!bN0O#>lZ!?avyCI@i~prwKZ0oIekBc{~2I&%2|IYqyF;7uPEroPrzp_
zqcP@i@6Ss4>^x4-^?Ep;(AR;@JC6EszwE_(qi){wb04`SxxO*y%wxUHh>!jkGMZC9
z>|4P75bX_&_UV~WZ_RbrEc|zZJ;&Ow$?fiY^t~5szVP1<_H#l1LyYDhh&ZqN8STY+
zJ;120-}@QuQ_gv*dxzyB?<2Xc|D#~@dJfF{G-KrT9;@s3Lq>DRnOEJrEa%<o*Ds5U
z`SnNVC*}25_pZyCuR6!Y^*jP@ANJ^T1o{MY^B)b?H|F6wjDaI!#SG4l>pT`+PKg{R
z=))q0exf=Tc@&vufTuEN$C*g@8~~5PE%aN_z56rhz<K%^WpD1kavJsRi!$ls8oUTo
z!|2@C%Hr=jhwgha^j3J^kD*UR_Z=C!_rrH&=;res8M^+yBSUvjeMg3_-S=eZe$M!w
z4BdRbBSY7|yr6qleLoiMz8^!kf8URxA42#282SQq-;tp^AK#OqYu{JUwfmmbzJgWs
ztkfW$AJ=;+Yg)@#oAU?pQ6GRV=Q)bFfnejj%MmvSU4CC4HyCWJ{4&midnFe+js#cM
zJ_=4gY7Yg=?Waz}4MVT2JseK{=DhYuu$<>9`WXc_pEmpPtjO8F=kFNAyz5yZ@5Qk>
zC-39Xg|BzwxZHjd_0*3?m-DWe+jA!uYcn2fy_nYou$=d313A-o30U5E&*?-Y;wOV`
zAmUE~%X#mNUxIlGSl;+ajOs6OUruK9-Z(yw?^MQVh>!N^x!v()?B3Z^5dDl3+lM{Q
z2HTr8yc_C^^EICX_IFl!?cOE1n8RGK+(Y>L*)R{0^X$EV{{pa&J!qSc$k~I~UZW@Z
z=xH%Hdh$N1+moNYr-NrR%G;CoSI)EK{X7FKR}YRpm!iwNu70;z29`74`<?2!Z3TMD
zVOtJXkM&py9!tEuv3?fFMV(b(Ip@BH(dQNVl2?~n%|FiH(K!1#3mi55yim8M@4R}j
ze@l_K=2}KM=j-RmIbi4OJJEce-F4{lE4c0YoC`KipI8_D&V$popIqwe!N$s4%g-t~
zYx!AqKG-~dZ?M)t#*OIm`fp%V5C2VIbHrzg{+rR|BmM%gaR;cO{X(!l@{#8v@M!r$
z-HXxXhj6!zy9CTndZx{xJ>o9~SN5?5PCoj$3~V3!^FA&I>mwg|t^il|(EukOaa+N1
zW02^r5zJ5ecO&!a^Eh?9Guse**T-|y%;@8t(bj~>d1l1XOAFXJ#pk?h(26b}bJ`9r
zuI-iR`p8F~Hn4Ll=6MskeDr@6n4ffQ&7nQw+rgE6?0}PxK03g~we3XLM?Ug&fh+sC
z8cu$Hz9-#ax#HUPp!1Wit$Fo{wJm||T^~PdVjUfy<J`FZy~s9XHKM<3>^|&b^l=~b
z*@?)x4`S!$o!J9+{xR2Uz|Pfq`FV6L*gI1X)~A;7I&}FBX!Fsp2b;%OZQiBbjPk}T
z!RH3>q};ZbQC*wy`@#Afw~tYMF}eM&bt8B+qcPf|uY+Lo75lmgT|VBgH-qI4AQ5*9
zIO43QuAl20>*J`thSBwMjQHEY<B($Ax1h^M-M50}Zbc&Qc5uX5S6x5rOk=c;qx#9Y
zK9w=n?@q9N8+!+%x}UMG(OqEw7AXH3r)d%5cU6B!@&6C__<Qd{Vh1q#{{pNxlrg@i
z)`8=Dn)~kG?TWg8gA3i?fwty)%<pV;^A^|T9CZ07=*4q$E+XgWuk)~`T%4;-V7XYw
z3&3)IZu&0pce9HSe}A)%_K3Y0tj)akt{%2a!Nt6nq09N15qU2MM_%m_dj(jVd0hkb
zur-3MbzPFZOSXaK!rlZnChW~%xecs`cdrF8M&5O7W!#QzMVy~C)%7!{`*|g@6?vWg
z6UScrt{si|j6~dP|3)ziaj%bNjJ>`Jd_+O-tk6pZ-QNkWE$E(~gSj4i-;Qpb;@<B-
zmyf;g0L#VR+k;%JRTo$;_Wo+HTygKa(PNLaM{EyRo9pFV)Wg;bF6P~ZF1Hhjz26Ov
zyxJpn4_KRdT@&@NT?e*StiknQ*D3Z*U7xVu0Ipo4eQ@$I=NnV(`@%C)o$k>Au)ICE
z?&{wA!HhQ{M<Mdo@m}7H*oWVhj8WGw`o0D1T;fc=87yb~A;wz~<71!H^)Y?~<86p`
z&!0HXpZ9w#;&UA0`8$?z4C47ao-x+q_M9)CmpkC(<GkDnma~u8r@O$hPugQncZ0Rr
zlQ?>^-xCm@aY*zu9&Aq&7`+q28Sg>76Vc<_!TJ?@d<VLG?AJTNa<N~-!0$rjU3+t?
z$9Lg(gGVFs_NpGeP6kgwd`?8{brR!Ah`pZ7829&Hu)T(UUxofqh5m3se-!*!uE$*8
zgKnMTT;Gc>AN%n>uw0xAdytDg_yAZg*5-p?x#Io(5c-3Nb+kw9hr!xhGv}fnwvT{|
zc|VFSw*`rF@iB1Z)gG}Q2WvC0YoZ>uPk@VgKZ!2)2oiZe1&+MhBlgo^ZRT}t)x-7~
zuycsDeH`q1J7@jX^$Gvag6%Et<LAI~VgEeX8e#teST5cnUj!Q?Z_QT5FCo_4i&#@#
zKXZD{zKm%142t6n)`DjsKGP7-;8e!xh-YvnW6bM`oOfQ)>sP??ab~^>mh;TSntlx&
zYpOkBzYf-BPvYpwerF>-vykZNRIokGVe}q&SDr+?7t!N4!1@(?{3g15oYQZC<>C%c
z1b-WmckRrn?j5$5?;ukUd3#llUi*U=BR;1g_Bt;eGrR!22(j<e8RPyw1upjeUG#-W
zvG4Dp%g5S&A1vqE+NU|>Vm*HZmWz4*7%aCLaW4)r{sgff>&P2BknyL8HqV@A;%7M*
zd-!woXAtAeYkfKWqW&+y#rn^p%WXlT{x88%Uq0&p3al;a{~9b8^?w77`p<%`FQ;GB
z|1G#!|99wek0728=l^@e`rebM{fC^7+J8isi`stzM{RRjTTZ{I{bz8o_FvHDb|P^$
z{|dIY`xkM41ItIPzk{QeIjkk8U-5gs=k83zXDQ;jJA-i<;#phH7<K=V^X^^L{U=yH
z&ep%ca`qOz{2Ls-Xtx(R&rM}7tKocBAkoXp9D4^=A@;R~(K|4Y@m0h-psvqrxgLH0
z2i+XSzMn&vkG`J=%N6^60X_QGZr^g*Im~{?eG$Df_e*f{aerP0%k4+}ZZ?<k6-3^g
d_OI@m&S!ibah}?oi+cP{``;92bJiB?_djpJ27&+p

literal 0
HcmV?d00001

diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
new file mode 100644
index 0000000..583cc10
--- /dev/null
+++ b/piet-gpu/shader/ptcl.h
@@ -0,0 +1,323 @@
+// Code auto-generated by piet-gpu-derive
+
+struct CmdCircleRef {
+    uint offset;
+};
+
+struct CmdLineRef {
+    uint offset;
+};
+
+struct CmdStrokeRef {
+    uint offset;
+};
+
+struct CmdFillRef {
+    uint offset;
+};
+
+struct CmdFillEdgeRef {
+    uint offset;
+};
+
+struct CmdDrawFillRef {
+    uint offset;
+};
+
+struct CmdSolidRef {
+    uint offset;
+};
+
+struct CmdRef {
+    uint offset;
+};
+
+struct CmdCircle {
+    vec2 center;
+    float radius;
+    uint rgba_color;
+};
+
+#define CmdCircle_size 16
+
+CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) {
+    return CmdCircleRef(ref.offset + index * CmdCircle_size);
+}
+
+struct CmdLine {
+    vec2 start;
+    vec2 end;
+};
+
+#define CmdLine_size 16
+
+CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
+    return CmdLineRef(ref.offset + index * CmdLine_size);
+}
+
+struct CmdStroke {
+    float halfWidth;
+    uint rgba_color;
+};
+
+#define CmdStroke_size 8
+
+CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
+    return CmdStrokeRef(ref.offset + index * CmdStroke_size);
+}
+
+struct CmdFill {
+    vec2 start;
+    vec2 end;
+};
+
+#define CmdFill_size 16
+
+CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
+    return CmdFillRef(ref.offset + index * CmdFill_size);
+}
+
+struct CmdFillEdge {
+    int sign;
+    float y;
+};
+
+#define CmdFillEdge_size 8
+
+CmdFillEdgeRef CmdFillEdge_index(CmdFillEdgeRef ref, uint index) {
+    return CmdFillEdgeRef(ref.offset + index * CmdFillEdge_size);
+}
+
+struct CmdDrawFill {
+    int backdrop;
+    uint rgba_color;
+};
+
+#define CmdDrawFill_size 8
+
+CmdDrawFillRef CmdDrawFill_index(CmdDrawFillRef ref, uint index) {
+    return CmdDrawFillRef(ref.offset + index * CmdDrawFill_size);
+}
+
+struct CmdSolid {
+    uint rgba_color;
+};
+
+#define CmdSolid_size 4
+
+CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) {
+    return CmdSolidRef(ref.offset + index * CmdSolid_size);
+}
+
+#define Cmd_End 0
+#define Cmd_Circle 1
+#define Cmd_Line 2
+#define Cmd_Fill 3
+#define Cmd_Stroke 4
+#define Cmd_FillEdge 5
+#define Cmd_DrawFill 6
+#define Cmd_Solid 7
+#define Cmd_Bail 8
+#define Cmd_size 20
+
+CmdRef Cmd_index(CmdRef ref, uint index) {
+    return CmdRef(ref.offset + index * Cmd_size);
+}
+
+CmdCircle CmdCircle_read(CmdCircleRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    CmdCircle s;
+    s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.radius = uintBitsToFloat(raw2);
+    s.rgba_color = raw3;
+    return s;
+}
+
+void CmdCircle_write(CmdCircleRef ref, CmdCircle s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.center.x);
+    ptcl[ix + 1] = floatBitsToUint(s.center.y);
+    ptcl[ix + 2] = floatBitsToUint(s.radius);
+    ptcl[ix + 3] = s.rgba_color;
+}
+
+CmdLine CmdLine_read(CmdLineRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    CmdLine s;
+    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+void CmdLine_write(CmdLineRef ref, CmdLine s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.start.x);
+    ptcl[ix + 1] = floatBitsToUint(s.start.y);
+    ptcl[ix + 2] = floatBitsToUint(s.end.x);
+    ptcl[ix + 3] = floatBitsToUint(s.end.y);
+}
+
+CmdStroke CmdStroke_read(CmdStrokeRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    CmdStroke s;
+    s.halfWidth = uintBitsToFloat(raw0);
+    s.rgba_color = raw1;
+    return s;
+}
+
+void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.halfWidth);
+    ptcl[ix + 1] = s.rgba_color;
+}
+
+CmdFill CmdFill_read(CmdFillRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    uint raw2 = ptcl[ix + 2];
+    uint raw3 = ptcl[ix + 3];
+    CmdFill s;
+    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    return s;
+}
+
+void CmdFill_write(CmdFillRef ref, CmdFill s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.start.x);
+    ptcl[ix + 1] = floatBitsToUint(s.start.y);
+    ptcl[ix + 2] = floatBitsToUint(s.end.x);
+    ptcl[ix + 3] = floatBitsToUint(s.end.y);
+}
+
+CmdFillEdge CmdFillEdge_read(CmdFillEdgeRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    CmdFillEdge s;
+    s.sign = int(raw0);
+    s.y = uintBitsToFloat(raw1);
+    return s;
+}
+
+void CmdFillEdge_write(CmdFillEdgeRef ref, CmdFillEdge s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = uint(s.sign);
+    ptcl[ix + 1] = floatBitsToUint(s.y);
+}
+
+CmdDrawFill CmdDrawFill_read(CmdDrawFillRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    uint raw1 = ptcl[ix + 1];
+    CmdDrawFill s;
+    s.backdrop = int(raw0);
+    s.rgba_color = raw1;
+    return s;
+}
+
+void CmdDrawFill_write(CmdDrawFillRef ref, CmdDrawFill s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = uint(s.backdrop);
+    ptcl[ix + 1] = s.rgba_color;
+}
+
+CmdSolid CmdSolid_read(CmdSolidRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    CmdSolid s;
+    s.rgba_color = raw0;
+    return s;
+}
+
+void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = s.rgba_color;
+}
+
+uint Cmd_tag(CmdRef ref) {
+    return ptcl[ref.offset >> 2];
+}
+
+CmdCircle Cmd_Circle_read(CmdRef ref) {
+    return CmdCircle_read(CmdCircleRef(ref.offset + 4));
+}
+
+CmdLine Cmd_Line_read(CmdRef ref) {
+    return CmdLine_read(CmdLineRef(ref.offset + 4));
+}
+
+CmdFill Cmd_Fill_read(CmdRef ref) {
+    return CmdFill_read(CmdFillRef(ref.offset + 4));
+}
+
+CmdStroke Cmd_Stroke_read(CmdRef ref) {
+    return CmdStroke_read(CmdStrokeRef(ref.offset + 4));
+}
+
+CmdFillEdge Cmd_FillEdge_read(CmdRef ref) {
+    return CmdFillEdge_read(CmdFillEdgeRef(ref.offset + 4));
+}
+
+CmdDrawFill Cmd_DrawFill_read(CmdRef ref) {
+    return CmdDrawFill_read(CmdDrawFillRef(ref.offset + 4));
+}
+
+CmdSolid Cmd_Solid_read(CmdRef ref) {
+    return CmdSolid_read(CmdSolidRef(ref.offset + 4));
+}
+
+void Cmd_End_write(CmdRef ref) {
+    ptcl[ref.offset >> 2] = Cmd_End;
+}
+
+void Cmd_Circle_write(CmdRef ref, CmdCircle s) {
+    ptcl[ref.offset >> 2] = Cmd_Circle;
+    CmdCircle_write(CmdCircleRef(ref.offset + 4), s);
+}
+
+void Cmd_Line_write(CmdRef ref, CmdLine s) {
+    ptcl[ref.offset >> 2] = Cmd_Line;
+    CmdLine_write(CmdLineRef(ref.offset + 4), s);
+}
+
+void Cmd_Fill_write(CmdRef ref, CmdFill s) {
+    ptcl[ref.offset >> 2] = Cmd_Fill;
+    CmdFill_write(CmdFillRef(ref.offset + 4), s);
+}
+
+void Cmd_Stroke_write(CmdRef ref, CmdStroke s) {
+    ptcl[ref.offset >> 2] = Cmd_Stroke;
+    CmdStroke_write(CmdStrokeRef(ref.offset + 4), s);
+}
+
+void Cmd_FillEdge_write(CmdRef ref, CmdFillEdge s) {
+    ptcl[ref.offset >> 2] = Cmd_FillEdge;
+    CmdFillEdge_write(CmdFillEdgeRef(ref.offset + 4), s);
+}
+
+void Cmd_DrawFill_write(CmdRef ref, CmdDrawFill s) {
+    ptcl[ref.offset >> 2] = Cmd_DrawFill;
+    CmdDrawFill_write(CmdDrawFillRef(ref.offset + 4), s);
+}
+
+void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
+    ptcl[ref.offset >> 2] = Cmd_Solid;
+    CmdSolid_write(CmdSolidRef(ref.offset + 4), s);
+}
+
+void Cmd_Bail_write(CmdRef ref) {
+    ptcl[ref.offset >> 2] = Cmd_Bail;
+}
+
diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs
index 72f0d3c..56b73ca 100644
--- a/piet-gpu/src/main.rs
+++ b/piet-gpu/src/main.rs
@@ -73,6 +73,7 @@ fn dump_scene(buf: &[u8]) {
     }
 }
 
+#[allow(unused)]
 fn dump_k1_data(k1_buf: &[u32]) {
     for i in 0..k1_buf.len() {
         if k1_buf[i] != 0 {
@@ -96,7 +97,9 @@ fn main() {
             .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
             .unwrap();
         device.write_buffer(&scene_buf, &scene).unwrap();
+        // The tilegroup and ptcl buffers should be on the host only if we're going to examine them from Rust.
         let tilegroup_buf = device.create_buffer(384 * 1024, host).unwrap();
+        let ptcl_buf = device.create_buffer(12 * 1024 * 4096, host).unwrap();
         let image_buf = device
             .create_buffer((WIDTH * HEIGHT * 4) as u64, host)
             .unwrap();
@@ -110,16 +113,23 @@ fn main() {
             .create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf])
             .unwrap();
 
+        let k3_code = include_bytes!("../shader/kernel3.spv");
+        let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 3).unwrap();
+        let k3_ds = device
+            .create_descriptor_set(&k3_pipeline, &[&scene_dev, &tilegroup_buf, &ptcl_buf])
+            .unwrap();
+
         let code = include_bytes!("../shader/image.spv");
         let pipeline = device.create_simple_compute_pipeline(code, 2).unwrap();
         let descriptor_set = device
             .create_descriptor_set(&pipeline, &[&scene_dev, &image_dev])
             .unwrap();
-        let query_pool = device.create_query_pool(3).unwrap();
+        let query_pool = device.create_query_pool(4).unwrap();
         let mut cmd_buf = device.create_cmd_buf().unwrap();
         cmd_buf.begin();
         cmd_buf.copy_buffer(&scene_buf, &scene_dev);
         cmd_buf.clear_buffer(&tilegroup_buf);
+        cmd_buf.clear_buffer(&ptcl_buf);
         cmd_buf.memory_barrier();
         cmd_buf.write_timestamp(&query_pool, 0);
         cmd_buf.dispatch(
@@ -129,22 +139,36 @@ fn main() {
         );
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
+        cmd_buf.dispatch(
+            &k3_pipeline,
+            &k3_ds,
+            ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
+        );
+        cmd_buf.write_timestamp(&query_pool, 2);
+        cmd_buf.memory_barrier();
         cmd_buf.dispatch(
             &pipeline,
             &descriptor_set,
             ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
         );
-        cmd_buf.write_timestamp(&query_pool, 2);
+        cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
         cmd_buf.copy_buffer(&image_dev, &image_buf);
         cmd_buf.finish();
         device.run_cmd_buf(&cmd_buf).unwrap();
         let timestamps = device.reap_query_pool(query_pool).unwrap();
         println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
-        println!("Render time: {:.3}ms", (timestamps[1] - timestamps[0]) * 1e3);
+        println!(
+            "Kernel 3 time: {:.3}ms",
+            (timestamps[1] - timestamps[0]) * 1e3
+        );
+        println!(
+            "Render time: {:.3}ms",
+            (timestamps[2] - timestamps[1]) * 1e3
+        );
 
         let mut k1_data: Vec<u32> = Default::default();
-        device.read_buffer(&tilegroup_buf, &mut k1_data).unwrap();
+        device.read_buffer(&ptcl_buf, &mut k1_data).unwrap();
         dump_k1_data(&k1_data);
 
         let mut img_data: Vec<u8> = Default::default();