From 1d1801c1aa346ef011f10109176171710a94b7b0 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 1 Dec 2021 08:42:06 -0800 Subject: [PATCH] Cross-platform path stage shaders --- piet-gpu/shader/build.ninja | 12 + piet-gpu/shader/gen/bbox_clear.dxil | Bin 0 -> 3152 bytes piet-gpu/shader/gen/bbox_clear.hlsl | 55 ++ piet-gpu/shader/gen/bbox_clear.msl | 57 ++ piet-gpu/shader/gen/pathseg.dxil | Bin 0 -> 9424 bytes piet-gpu/shader/gen/pathseg.hlsl | 643 ++++++++++++++++++++++ piet-gpu/shader/gen/pathseg.msl | 699 ++++++++++++++++++++++++ piet-gpu/shader/gen/pathtag_reduce.dxil | Bin 0 -> 4700 bytes piet-gpu/shader/gen/pathtag_reduce.hlsl | 127 +++++ piet-gpu/shader/gen/pathtag_reduce.msl | 143 +++++ piet-gpu/shader/gen/pathtag_root.dxil | Bin 0 -> 4716 bytes piet-gpu/shader/gen/pathtag_root.hlsl | 115 ++++ piet-gpu/shader/gen/pathtag_root.msl | 146 +++++ piet-gpu/src/stages/path.rs | 10 +- 14 files changed, 2002 insertions(+), 5 deletions(-) create mode 100644 piet-gpu/shader/gen/bbox_clear.dxil create mode 100644 piet-gpu/shader/gen/bbox_clear.hlsl create mode 100644 piet-gpu/shader/gen/bbox_clear.msl create mode 100644 piet-gpu/shader/gen/pathseg.dxil create mode 100644 piet-gpu/shader/gen/pathseg.hlsl create mode 100644 piet-gpu/shader/gen/pathseg.msl create mode 100644 piet-gpu/shader/gen/pathtag_reduce.dxil create mode 100644 piet-gpu/shader/gen/pathtag_reduce.hlsl create mode 100644 piet-gpu/shader/gen/pathtag_reduce.msl create mode 100644 piet-gpu/shader/gen/pathtag_root.dxil create mode 100644 piet-gpu/shader/gen/pathtag_root.hlsl create mode 100644 piet-gpu/shader/gen/pathtag_root.msl diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 6f225d9..c8b4858 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -59,10 +59,22 @@ build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl build gen/transform_leaf.msl: msl gen/transform_leaf.spv build gen/pathtag_reduce.spv: glsl pathtag_reduce.comp | pathtag.h setup.h mem.h +build gen/pathtag_reduce.hlsl: hlsl gen/pathtag_reduce.spv +build gen/pathtag_reduce.dxil: dxil gen/pathtag_reduce.hlsl +build gen/pathtag_reduce.msl: msl gen/pathtag_reduce.spv build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h flags = -DROOT +build gen/pathtag_root.hlsl: hlsl gen/pathtag_root.spv +build gen/pathtag_root.dxil: dxil gen/pathtag_root.hlsl +build gen/pathtag_root.msl: msl gen/pathtag_root.spv build gen/bbox_clear.spv: glsl bbox_clear.comp | setup.h mem.h +build gen/bbox_clear.hlsl: hlsl gen/bbox_clear.spv +build gen/bbox_clear.dxil: dxil gen/bbox_clear.hlsl +build gen/bbox_clear.msl: msl gen/bbox_clear.spv build gen/pathseg.spv: glsl pathseg.comp | tile.h pathseg.h pathtag.h setup.h mem.h +build gen/pathseg.hlsl: hlsl gen/pathseg.spv +build gen/pathseg.dxil: dxil gen/pathseg.hlsl +build gen/pathseg.msl: msl gen/pathseg.spv diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil new file mode 100644 index 0000000000000000000000000000000000000000..8a4672594d16e9a059513de25eb5cbc11c653de5 GIT binary patch literal 3152 zcmeHJeNY?66<_HN-ARb!K(Z}GGC3enN=bYI%*Oy7eSyKqj)t( zcBY;7k0vwOnYZul+uy$1efREnt3RsMJ$QWI;jyD7Pp<#I?$+vWzjv4i05FOHz@RRM zVt`_XVuNxMid6uB2a2hzg2v%E+A2&3RM9b~AHHX7G<}V3hkiP`;2!-6Y%zEH_yBNw z;RKpM1_gz}wWvS#&M>mq9W$b${wSVvnvQ6Ww8954AO=*!0Q#d91}+c@=giQr)lFYZ zhUY#4eX(vT@=OVw!@5)ZGz(~jfXpuJ=Xu0wF}OSlJz3+Dmh!dzvDz^*TUvn2Dn;3k z_{sxPN>}DP1LWD-jdCK;ml8H=s=EE_J%2obez{Cg1rNdHM&ZBC9T!vc$K$+PoQY7p zHw87@oXHD-3W!4>PU2>?lmUm?2H+_QfDEX!Q_6)@T`zhnWy}M+cRiy4X`D4M;cf>5#ZqoZ>|Bp)T+FpG>fU+oaXotDYH3#ZPp5<*9wIHokIO*oB{aT-cbF ze*cvHh+j3I5u>n*zg@~ho>tAvyw}vzmW$G=*^MWz=&f^PLjPn{sYah6d*IMP0MQ&Z+M`A@gK>Zr!JqDewW+aA25VgrY?4^p9VA1*2zQX1a4LjBjQu(Ruf%fVUVWBXz?K#2r%V(_)f+l&k&}NN;qwBBjQ`s90D)TrFiM1*P%?L%=Ug zKQNE#?n|TjMsa-=9^*4+m8u(2m#y3QEH6~Zy|WJZ?!F22eTX4Y#+T^#%W7ktMabE& zoEoX@aw0>`Q#{RRY|%(pUuI3FXgKq)!&BWIs!C^UZROh}D)Ed4De=6(LgnG8#$s!7 z(@KRGNlwQl;q}V&1~*M961B&C76xZ@D+cms?A)HXWi+`lFDr~Bgs1gYs0|;B^W^ZR zQ4Pog3c>74dFWeS%Q&5Aw=_0M(DOr-_#A36=84~-ZItinkMpF10^|J&sww(q)#OWs zyBnLFtxk!@lH9jhmVdzWqWHs;??33bxoO&PV{z+^%^TM?7LQ&!Gj(Cv&df4okBUOZ z#7o!31-W!jrp?x7f2gspzNyvZ4vFNaPgW1DnXitl4!EpZ?|U_S<{KVM&ri$vb6|aH zX|%U*@ZA2_`v%V*5(cvC+8=7EcRul*mfCu!$^Am2{B&$}lKEI*WK~futA-)g2EfmC z-uE8aEc?&gZxuf(S-@XYzO zF07*+a|WBmn;`yulK6{@$PsIkW@q;PZdBPo$3$Q~iY26EL095ON|xIFtq!aU$2!Vc zhXd<$AZ_iegJR`N3B=6+u};S~(edwY5nDzghYat%maM#zym!`;)IdK~^jc5Rimq@H zDSoY|*n<=;ZAsf0tjmh!TiF)ayI4nnbud_42G+J61H&G7v5(_yvy5#CvK_(htPi$` z53Uopriq(0QnN6s+!$3ZMfx?1axqyM?%um-5nK-F3Kw*R!ASqgT)}{@z=sqK%dkcl65)|?ko)JSYVHjW1Yd7;tf}P6GePvB-SSg&H&96{C;8X z^3-w9d|3RXdRI)sAVF_0*cF=n<~Kgx<~=IahYugtL(hmSm-A@#K$5^;o*;Kq+>*bw z>QUO)^TV~@@JO*M7lw+oa*e56x??XM9NJhuvqzUX*uU>aRjZ!k;qVSGH{Zj{zqpH+ zt)FLJX8ymtOv5~Wrg(X$7NGjCd07M(LtZBTH81}oDRZ3sRfzvzIXNAlJFAh2MYT?a zAWWs>r*zCe5psh&kfqrDhhumRL*e{9^nzfv4>0gT@xayDi)-ol?i`J*UpO1G$tEmU z)a2zsd&amZIQbtkC;ji3lm7xCKV3Pbc-EYB=^Mj}ch&~fNaUaN1t)%XIMG7E{}Zf7 z398HD?w{P9Ft-65RE0J~s?hCE;t%GBLYbqK=y|oM*C6!6X^<)q0yh(~`5o04eHy-t zl4Y1vnYVLm7(n3y&&zeG0vrNXT7M8KeO}0;*J!kkK2PE^J{||oWyoE`{$Uc-J08=5 zUHY$U&{RPjT(C8unMa4uG!C1zQBSb@GP?R}`>IJ7_565d6E({nN^Docj8B8Wc*WM~ T63w>+)o+#l3gvV6bFTdzU76o` literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl new file mode 100644 index 0000000..ae40b13 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -0,0 +1,55 @@ +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +ByteAddressBuffer _21 : register(t1); +RWByteAddressBuffer _44 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.Load(0)) + { + uint out_ix = (_21.Load(40) >> uint(2)) + (4u * ix); + _44.Store(out_ix * 4 + 8, 65535u); + _44.Store((out_ix + 1u) * 4 + 8, 65535u); + _44.Store((out_ix + 2u) * 4 + 8, 0u); + _44.Store((out_ix + 3u) * 4 + 8, 0u); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl new file mode 100644 index 0000000..f424448 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.msl @@ -0,0 +1,57 @@ +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +kernel void main0(device Memory& _44 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.conf.n_elements) + { + uint out_ix = (_21.conf.bbox_alloc.offset >> uint(2)) + (4u * ix); + _44.memory[out_ix] = 65535u; + _44.memory[out_ix + 1u] = 65535u; + _44.memory[out_ix + 2u] = 0u; + _44.memory[out_ix + 3u] = 0u; + } +} + diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil new file mode 100644 index 0000000000000000000000000000000000000000..5ad35e75a7f24f16e30db7ccedc2b2ac1cb4792a GIT binary patch literal 9424 zcmeHNe_Rt+wx7u)lOKc-AizNQF#)6qs1v||pqLN@6lkL&#b0#*p{PhBqV)%DCkcdz zpur+Wt!=2-Ri$uNHSIDj`%<}S>UPSAAy%x`Gb!Xly*@QH~JepCMf3hUQA1_aS# zp&urAFi#CC2#DN|7fpad%jq)F|F?Vu$mnN+PtWbwWr9DUhn6iYR1~VfL`aAQiGabZ z;<@>>0U?m30)D2_dRxK=I)Thfi@V3;4`ggjRI8$wITsZs&e#Rl^Cz)5!Qss9r4%-Y zk-5t&(=JRDrE;)5USg>s*C@i3*@nYVaAM|guE6LUWmhTYYnq32UpFzO+30*QX?kqp zW+%NoY&oOTTkoQqh!whymX6U)*eJv7bi;>Y7DCWm@C`jze=V7b=X3O27>2gt5R?RX ziID~_zOsSoA9bi6yuPaMFpB*0o14Ar?fF8ZEfmJ0ahNa&`AHC$Bjra5dpL3~KkYL3 zbW%A1QSF(0E1_Dbhq2MUQS#Wc&v!lMZslC^(Nq+3M7b=O0&xN1t9RhB7noTP*}vfr z>WBV633q+$cYR)dM$Kom|Dk!Uus>uh5+Nptg>-)B?8*s$5n#0 z4#5`ihJ3aV>>4wBJ-2l{=S_M^v%Ah2Hn40^Pw^jK% z0%FEf^fS3acdXJhf3x3s@X!QyKU)4Inu+1XlsnvL3EBGx2&Q9?E>`0mn1Q>S5JX9Q zSWK&jfRwK@In)H9pBW^4qK>~g8HXkYf!$ZjArQUpVq}3BreilT+=K=D<{O*2F*z5Q zO2#~rX~vq&375@6p0GoXG`$UJSiGautCfzmnEq;1e_;EFOo1UuCLFrSgdH+|8NnBG zBKdZTFB6F|Xh|@^d-YYXD}OEBD)uV-tKXG_@%#7Vkhp`GamI3P+UXBZ5T~cf-k))n z%dq;6_k1&b|C%Qq!`>M0c=m%c6aAe1oP%4}TD}QR=TK>EA3e+X{ z4czV?9f{#Um?d}p{)4o!5_#m+mTDk1z`kb_X`{sZTn2&Wk6w0tG%|X7_%i4oZ0XEf z@9ti1QKTcuxybZlL{^EUY(k_I5-k`e1*2xcU8lf>`#d0h#>_sC%{=k`<@@tO7Vi~* z-iCK(>hF|38O~DH*%c~YL4)KFu z-@feT;L-zw#Wx4r@OQHGcQW^|pA8%y`6)Xnn^uuB$@LA;(RQZ)($|XLA3oE6wSars z>=pd_m4T%P#LM>%4z^i+Ac&5k`5=apI8>;8D6~QZ&oU@e>wB6rDfZ?~?DeK~{1M3K zJE7nKB}n3DhIiY-u62Y~5G;2$owyH~)vKJ@8|!hwy<~iNWdL+Hi zCE7@`w^p)KDjRFSn`%@W8`UIxV=Q~)7z?O6RY8UBa$YBU@q{H&Ttk{Yt@@6CK6{cbw=?Lcx8jV94n67nRQgLZ|74BL?NPq3)6} zx)qD~9?+12%UGfO?W?nPfR{!wb%Wq$jlMdGUo6Ds&XP2{VEgdszf1Hd!9#mJsugIHmeS0JntZPhh(ID@%@YY_)K)Uh38~^ zJmXQG&l?(RaB}fs{U=vH{Of@jM5B227_-az!j7M>_dE}d2=AgJL+d9jPI?EP>e_$s z;|H!_&T9(YqSa%A6Q-^<{asVZ>pS&VUJQ>gg`&Q)LyCxqGi#St(wy1Qn;c1G8Rc%v zL-6j$U!I<#4Vn7I=EqbVFETESbhUma+qSx^?#?~rXsp3-;57ry;jE%L4F)JClDV%$ z!k5sW)J4^F=7iP_pDs|iHT>DYA%$#UE3cQ<|4LdU1G6itti2uKfVpN9&3t1u#ra|T zXQvz83Xh-$i}&7WAWg?HXcbs|P;wOub4BA#D4qSz!sGpwQ)GF>#mndCKBh57s(2i8 zTtoje=D-1e0&|=PvVR54kqYMd4>8Bde<94#5RDn~8C0KwNtzgD4W+0rq22g@5r_B! z9AW@C#13#sDZn98C}{@oQNdKQQ;VxNp{rkc`htR5DsR}H`LKuKg z1B^h%IKX%Y01Vv=T5^LpuyhHhv@*7dA!Gg#&3qpswNz;It?uiCF`P1Mfsm@wLX~2) z8j}g@Pj3+O5|F7omHWhDZ+s~GFu5z`Y+_29QF@Y?k-c!$T6I-9uAHWfkH-r$6$&v! zYQmdsr%O0GDX#NIur@fj3hT4-d;h2Dy7`}_YwJ;^{?3At?5Fmye{3JgP64U<|MT$w z=i&b@<^Rj(;SwBzegKz!e`cM~3)GR|++g-WqEfj=v|uvYHeb<}H6+T{prUytg_)NE zmON}LSckSLxqgwEH{^=lFUbFlwn?I5@+FyCMgVEXC7Buqg7QQJXe&0i{UX1I7S3xZ z%$_59&K9652`HZ2_6s~G3YsME!C;OSpNS?xWla3%U;Yhi_N8!6_ZOIL$vdYWwLnl5 z%7wI~dapJ(^i7<%nJ6_vI0yaHY1Q2xQ+5GAxE3gv+B%r%!v8`{*)6wJv@E8lU>SE<~>j< zSm1XpGyn`6D})a6xJS+AChk!acvWy|Q@&0d*vDXjR>PBz6QaKn7i!E87LMiXQ8^!(0|5Pv-vquX`~h;irNdgf%-ROVFaJ|XnSW~N7FOdw(NQse`b)&z}I1#=oYg-8Ik zz{mqcZo?6pC^FN8LD3OsS0kw=7S@-D=ctC{eip4}nj^B--X7 zJT|7C!i%GRwWy1dRp*gKS*r8QMTJBj#V>@qZRiC#iwvzI0dd()ILC?Wfbh)YmHxmn zqhB>W<^UI~h?V+nBgwake% z54UvqE34==XZ7qDcWqJl2mSu6wd1@ZKe5MWOSTMV7c;x)aR>32UR^UWpYKkH%Y9uJ zfPa?}cM}l*Sph$y9_@h?Bto@jchIcF(zcwAHZpyNCnxG{OY>edSrIv;7m zYkUt~bnC3zJPdo{>mhsVz#K<7h%dFx@&U8o;~B~Q@XZp`@@YGklEPdbb&W#D#Z>v?4$ zt&Zotm{@s2E?hiw<V`dV4qRm~hpL3u5m3nl91j_zzE1zHXlT5_1); zlGVcXsQmP)HQESxKRpCR&+AEdr9H1F-j(#cE+NKvj!noE6Ru+cdw`K(`D6H%(B6U){23{8>ESSWp( z^~85{3!_jrNJWkZO`U{Yt(!gA1fnf>5xGCC_u}bv1M=sMFiFUq4!H}&{+jz`y_?e8SpJ*L08ALlI{wD^6@02|9c01K(t*73cw6d8K- zO*}%Wn{V97W!B~6&N6KXnAh|tV}9^{e2Zty1;2Mau~Rnfch@JzJm_^i*`A;&-;LQX z#z_r3xiygy{v%l#*(jr~na>=9wN~~$X_`|vW|GITXz{;wV^04+fm%h_)Miic}d(Y%KtqfOPBdjVVV0pkT7L4P@)u`rw=L^UfeF3@~*o z+)Vz)9_4 zKi{gn)%VHcgqmFOqmDPq1*;a${!w|J=)}-~9&;&xgJtcMxD#ue63|VWOI4=}IXX>~ z;^qPgAB{uH^BM>M4BNh=g$$vM+}c0@?I{vbLQSvO*Dr9!@KuiY_R#zKdPBKj^TJHl zxKh5ZsYC@gmlo$XiWz|n-H_qS*pK`eVLJ%pm+=zEVm&dcO?xgkK6PW4#3UiQAj4wx zFaG){qF~JZn#cKxfmd)p0%+9LR^26WNC;Co2O&7wd;!4k=HRFRhHkapN#pg9#3d4< z(<7MtYe5n%sNjO3m&vViK}MkyO+%UdVW*N+{a;z{^-|66X%h!{7jomB8wV%?SxN8T z#8i6g;z$`yNMmV2s3W1fdwI0)0&5_Fi_#zSY$J9uyM{mD^Brt?T&UHFOlATF5K zpc@a(tvAFhi=kuZ?O1RsFe;&WFMXQ(O$(#g4$Z3(@x?+-GW5rc>S$IDoKT7#PU8VX zR0udV-qfIEJ4yQ?MX1}P0)DZ1+wt%lz)xyKYG6;tI6^uk9*~qosS6(WD4I$XI-RNH1@Iofl;)Cv-_deH5M#{JU@7n8&}@=%DJ8Pu30Tq1UV=_|So4(p36R z7IT2Sv{dJ-qc3ljHzv2Ap6(G;93Z(*1Ot=Xk4`sD%GGIhzzSqVDzX57(wzOb=FGn} zXW+Wa*XG7kx_4U+yvNvZJw$t861}pDdB`U0@7Fx}Tw3nAfQ_x%Q@3h>N^Nl8ffkHk zh#E>jgm7jH=?GCZfV+^-D3^|qX&^#+LYfssDM=sK+h_mim+FB~`e#N&IVWnLrs0ob zR*7cEtmRy|FYzQSc0AqRKLrHU?ptr0>$mRu=4~iYYH6Fz67dfu79+Cun+e4*I2(t?7+ zr5nu%K4z1%%odFabjfo9CKYMNf&%XL(BeyP>E{TJ z)h@IrvWvQpXu)2W^5gUhaH>Gr(`O$BlYT~HQ)jIct>9_|Iauz!-NMm?p3rj&Rb%lg z)NLDDM_dmYZtS-IckO}};v@3Q25es*9Nw`%T##ZxS$7caCIES?*PwJiU! z5G(W`ror9MG$^Kz3%sBEm0;K{$RpSQLPWA_gzTeWPXDag<6VY7OA>>^XMSd)0h}$c z@Q>f^1|!|d0vHVZOV#RV+l_j9wR^)g#ZBR%B0?;?02xEF6g}J?WR-j2#=Q)WM zPwDbI;$)Dk1=-)m8RIc&C5`Q;p8DqKA?>-4`*FwbHyzG){~F<3H|6A;3FPE44=VIu zy0Kw_jb&}~eRd+nule)td90TwjqP6Xa8Z7X`&w;F0^Jr17rKJSH_V08k9cosy{IFs zlce2DUs1YlK6+;lFFOx=w?Aac{*dtlf&Jl6O&?g_dV)r)w-ptzf}n-M6*doM=$f-6 zIl$abE$M7nlD_DWa4C^Tf|aCH>^LbqP63+GAiBW`{&C?$I1F6&&(I6BMa{OU>x0c2 z;7Jo4!dbz!T}yN-!Y3BTxM#(?8dfRm4bP6gw*6FHTo+_o@bb5D>M5ACm|mQ?-)F}k zv&VP7dn?4VGK-I2Z$iAT^8jpf1MaRrRqbA+hKUVbfz4O~ja7bp?#PCs(SW<3R&zam zum`QZsbRhF#MJ1T4+m*etLwb7`GFvNP)CpdX?6Ucc-`^m>OC;=`@9c-vTDz2C$Rsc zfahuhmO=RizhgzAUI2&5zk7D=)qu6EI~$(sPEds32wFkpnb}~Y7};i$T_kwO0%sWY g5jew2d> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _135; +} + +float2 read_f32_point(uint ix) +{ + float x = asfloat(_574.Load(ix * 4 + 0)); + float y = asfloat(_574.Load((ix + 1u) * 4 + 0)); + return float2(x, y); +} + +float2 read_i16_point(uint ix) +{ + uint raw = _574.Load(ix * 4 + 0); + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _111.Load(offset * 4 + 8); + return v; +} + +TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TransformSeg s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _111.Store(offset * 4 + 8, val); +} + +void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.p0.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.p0.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.p1.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.p1.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.p2.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.p2.y); + write_mem(param_15, param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = asuint(s.p3.x); + write_mem(param_18, param_19, param_20); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = asuint(s.p3.y); + write_mem(param_21, param_22, param_23); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = asuint(s.stroke.x); + write_mem(param_30, param_31, param_32); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = asuint(s.stroke.y); + write_mem(param_33, param_34, param_35); +} + +void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2); + PathCubicRef _458 = { ref.offset + 4u }; + Alloc param_3 = a; + PathCubicRef param_4 = _458; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5); +} + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +Monoid monoid_identity() +{ + return _567; +} + +uint round_down(float x) +{ + return uint(max(0.0f, floor(x) + 32768.0f)); +} + +uint round_up(float x) +{ + return uint(min(65535.0f, ceil(x) + 32768.0f)); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = _574.Load(((_639.Load(52) >> uint(2)) + (ix >> uint(2))) * 4 + 0); + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 9u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + TagMonoid _716; + _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0); + _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4); + _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8); + _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12); + _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16); + tm.trans_ix = _716.trans_ix; + tm.linewidth_ix = _716.linewidth_ix; + tm.pathseg_ix = _716.pathseg_ix; + tm.path_ix = _716.path_ix; + tm.pathseg_offset = _716.pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.Load(60) >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.Load(56) >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + TransformSegRef _769 = { _639.Load(36) + (tm.trans_ix * 24u) }; + TransformSegRef trans_ref = _769; + PathSegRef _779 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; + PathSegRef ps_ref = _779; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + Monoid local[4]; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12); + } + } + } + float linewidth = asfloat(_574.Load(lw_ix * 4 + 0)); + Alloc _865; + _865.offset = _639.Load(36); + param_13.offset = _865.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(min(p0, p1), max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _935 = bbox; + float2 _938 = min(_935.xy, p2); + bbox.x = _938.x; + bbox.y = _938.y; + float4 _943 = bbox; + float2 _946 = max(_943.zw, p2); + bbox.z = _946.x; + bbox.w = _946.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _971 = bbox; + float2 _974 = min(_971.xy, p3); + bbox.x = _974.x; + bbox.y = _974.y; + float4 _979 = bbox; + float2 _982 = max(_979.zw, p3); + bbox.z = _982.x; + bbox.w = _982.y; + } + else + { + p3 = p2; + p2 = lerp(p1, p2, 0.3333333432674407958984375f.xx); + p1 = lerp(p1, p0, 0.3333333432674407958984375f.xx); + } + } + else + { + p3 = p1; + p2 = lerp(p3, p0, 0.3333333432674407958984375f.xx); + p1 = lerp(p0, p3, 0.3333333432674407958984375f.xx); + } + float2 stroke = 0.0f.xx; + if (linewidth >= 0.0f) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth >= 0.0f); + Alloc _1071; + _1071.offset = _639.Load(28); + param_15.offset = _1071.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = 0.0f.xxxx; + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 9u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 4u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1241 = i_4 == 3u; + bool _1248; + if (_1241) + { + _1248 = gl_LocalInvocationID.x == 511u; + } + else + { + _1248 = _1241; + } + if (_1248) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + _111.Store(bbox_out_ix * 4 + 8, round_down(param_25)); + float param_26 = m.bbox.y; + _111.Store((bbox_out_ix + 1u) * 4 + 8, round_down(param_26)); + float param_27 = m.bbox.z; + _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27)); + float param_28 = m.bbox.w; + _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28)); + bbox_out_ix += 4u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1300 = m.bbox.z > m.bbox.x; + bool _1309; + if (!_1300) + { + _1309 = m.bbox.w > m.bbox.y; + } + else + { + _1309 = _1300; + } + if (_1309) + { + float param_29 = m.bbox.x; + uint _1318; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1318); + float param_30 = m.bbox.y; + uint _1326; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1326); + float param_31 = m.bbox.z; + uint _1334; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1334); + float param_32 = m.bbox.w; + uint _1342; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1342); + } + bbox_out_ix += 4u; + } + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl new file mode 100644 index 0000000..25d001f --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.msl @@ -0,0 +1,699 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct ParentBuf +{ + TagMonoid_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +static inline __attribute__((always_inline)) +float2 read_f32_point(thread const uint& ix, const device SceneBuf& v_574) +{ + float x = as_type(v_574.scene[ix]); + float y = as_type(v_574.scene[ix + 1u]); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +float2 read_i16_point(thread const uint& ix, const device SceneBuf& v_574) +{ + uint raw = v_574.scene[ix]; + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_111.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +TransformSeg TransformSeg_read(thread const Alloc& a, thread const TransformSegRef& ref, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_111); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_111); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_111); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_111); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_111); + TransformSeg s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_111.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void PathCubic_write(thread const Alloc& a, thread const PathCubicRef& ref, thread const PathCubic& s, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.p0.x); + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.p0.y); + write_mem(param_3, param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.p1.x); + write_mem(param_6, param_7, param_8, v_111); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.p1.y); + write_mem(param_9, param_10, param_11, v_111); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.p2.x); + write_mem(param_12, param_13, param_14, v_111); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.p2.y); + write_mem(param_15, param_16, param_17, v_111); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = as_type(s.p3.x); + write_mem(param_18, param_19, param_20, v_111); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = as_type(s.p3.y); + write_mem(param_21, param_22, param_23, v_111); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26, v_111); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29, v_111); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = as_type(s.stroke.x); + write_mem(param_30, param_31, param_32, v_111); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = as_type(s.stroke.y); + write_mem(param_33, param_34, param_35, v_111); +} + +static inline __attribute__((always_inline)) +void PathSeg_Cubic_write(thread const Alloc& a, thread const PathSegRef& ref, thread const uint& flags, thread const PathCubic& s, device Memory& v_111) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + PathCubicRef param_4 = PathCubicRef{ ref.offset + 4u }; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5, v_111); +} + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = fast::min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = fast::max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +static inline __attribute__((always_inline)) +Monoid monoid_identity() +{ + return Monoid{ float4(0.0), 0u }; +} + +static inline __attribute__((always_inline)) +uint round_down(thread const float& x) +{ + return uint(fast::max(0.0, floor(x) + 32768.0)); +} + +static inline __attribute__((always_inline)) +uint round_up(thread const float& x) +{ + return uint(fast::min(65535.0, ceil(x) + 32768.0)); +} + +kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_tag[512]; + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = v_574.scene[(_639.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))]; + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 9u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _713 = gl_WorkGroupID.x - 1u; + tm.trans_ix = _710.parent[_713].trans_ix; + tm.linewidth_ix = _710.parent[_713].linewidth_ix; + tm.pathseg_ix = _710.parent[_713].pathseg_ix; + tm.path_ix = _710.parent[_713].path_ix; + tm.pathseg_offset = _710.parent[_713].pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.conf.pathseg_offset >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.conf.linewidth_offset >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + TransformSegRef trans_ref = TransformSegRef{ _639.conf.trans_alloc.offset + (tm.trans_ix * 24u) }; + PathSegRef ps_ref = PathSegRef{ _639.conf.pathseg_alloc.offset + (tm.pathseg_ix * 52u) }; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + spvUnsafeArray local; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5, v_574); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6, v_574); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7, v_574); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8, v_574); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9, v_574); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10, v_574); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11, v_574); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12, v_574); + } + } + } + float linewidth = as_type(v_574.scene[lw_ix]); + param_13.offset = _639.conf.trans_alloc.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14, v_111); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(fast::min(p0, p1), fast::max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _935 = bbox; + float2 _938 = fast::min(_935.xy, p2); + bbox.x = _938.x; + bbox.y = _938.y; + float4 _943 = bbox; + float2 _946 = fast::max(_943.zw, p2); + bbox.z = _946.x; + bbox.w = _946.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _971 = bbox; + float2 _974 = fast::min(_971.xy, p3); + bbox.x = _974.x; + bbox.y = _974.y; + float4 _979 = bbox; + float2 _982 = fast::max(_979.zw, p3); + bbox.z = _982.x; + bbox.w = _982.y; + } + else + { + p3 = p2; + p2 = mix(p1, p2, float2(0.3333333432674407958984375)); + p1 = mix(p1, p0, float2(0.3333333432674407958984375)); + } + } + else + { + p3 = p1; + p2 = mix(p3, p0, float2(0.3333333432674407958984375)); + p1 = mix(p0, p3, float2(0.3333333432674407958984375)); + } + float2 stroke = float2(0.0); + if (linewidth >= 0.0) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth >= 0.0); + param_15.offset = _639.conf.pathseg_alloc.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18, v_111); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = float4(0.0); + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 9u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.conf.bbox_alloc.offset >> uint(2)) + (path_ix * 4u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1241 = i_4 == 3u; + bool _1248; + if (_1241) + { + _1248 = gl_LocalInvocationID.x == 511u; + } + else + { + _1248 = _1241; + } + if (_1248) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + v_111.memory[bbox_out_ix] = round_down(param_25); + float param_26 = m.bbox.y; + v_111.memory[bbox_out_ix + 1u] = round_down(param_26); + float param_27 = m.bbox.z; + v_111.memory[bbox_out_ix + 2u] = round_up(param_27); + float param_28 = m.bbox.w; + v_111.memory[bbox_out_ix + 3u] = round_up(param_28); + bbox_out_ix += 4u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1300 = m.bbox.z > m.bbox.x; + bool _1309; + if (!_1300) + { + _1309 = m.bbox.w > m.bbox.y; + } + else + { + _1309 = _1300; + } + if (_1309) + { + float param_29 = m.bbox.x; + uint _1318 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + float param_30 = m.bbox.y; + uint _1326 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + float param_31 = m.bbox.z; + uint _1334 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + float param_32 = m.bbox.w; + uint _1342 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + } + bbox_out_ix += 4u; + } + } +} + diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil new file mode 100644 index 0000000000000000000000000000000000000000..81448e70c67c30ad935a58050ef5752e800ec0d8 GIT binary patch literal 4700 zcmeHKdsGzH8NahLJ2Sg1y92xEtUR0*A%JLx4G$GeXCERjb*cDB>S<;{32A_WC;{{| zvkzD#SQj;sXiOJl2szD3R;4wV)UyjD-4IlKG>53Qph#^_RAOynQj_-13dW|V{?l`M z(sOdp`R@I`-}l|`es|{1{_dLB7)@VlFUVJpuJ|DLqnk%xlfAbd0RXHG2Y`TM7L)?f zhEfg%gUNN#c}y}Y|DyOtQ+F&y&3kOxxmAxvyulUpDzKtZ1d z;zpD2qD}!oFT_IZOfbJ1`UqQWn<0SZ;*@^D4#cU5laksP?73n=vshx>5n?<*={567 zI$y5;adVDa!1%!kPGLCds5qZBU3;&$q)o=WdKNmn0{7{j9J%+?VNSM z3A@YA4!0(}prtj$d7ywQufce?fbf)#&IWmG^DYE%oeC5fYcTTl%}pQv%Ik99p~{*X zIscwmvc)e*JZzb2$4|6>YMW4;#|a1ysN=_F>bmRH&V7ukgB;>q1hF3l;8 zf6@-jk>SS2nyox3^sg>hI*1|e^x2L})JY%Z!#iy8%Vz*U_71_-g~HE*}UW%&s{Hn^V)OU%6|zbug&U*fu{$?nwvjp z?Ag}w{PBZ1HG5y#yO#m<*3^unJu-FMvJ?@m3W&p4WBurw-8zV~K9 z<-BXhEAqHhU0=g3&E&xGYuo0viYY4de1CvB5^G-B`JtmBzXnzv-fEbx&)R+P$ij@H`W`qf zYVzF{!toJOi0U7WD%Bv1noaYaeXT};sIMbx+N%}gKy{5$-V&6LD2y?^{^%1OQKhUj z80Lr^L>C22i(1h|gLSPNtBB1cQJ>?gsv;Vy(Aqjz730zlhABUFD@R#XC9AqTp`5TN zAEElAFY2O(b$q!*J0%_RO)N73v74g@rnt{KkaFhbv_NSELtg7`s> zNI4+Kup&nzDzJUMoqFq(_xAN~$X|b1my|dWP$2c!F$CI-17tX{I9vk$^dW`euru-H(9(;!@sEjD+L3uY0x}36}Ep& zp5lp;@8<^GVd_pwXRygxi<|1;YzQiq(@^(h4Ls}N(7gS1XkLbAXduI5kvocHtuRk4 zE(3Ocq{|l9ZlmiatvCE}9z?rk(()qd9>Wa5-)ut!hNd@o4@{}m-Y$c*tSS~KVGind z{%n?K_fR0>4P+I^Nh_utrw}^9plyxEIO)reGq}EMBK(Q2u4c&07#$uDwqdETO&$xAnQrg?0N7V@lJ zf=2|9#~Gi;zJLqC=%NvPVLqiO0_jM%k+mQcmxQ4aKnq?K!Ajs1K`$~y_&F($b1~R@ zCsH&b&nOlEL*j*VoxL-#&frGw*dAx)`YxcEtU~cqOQWa7GBjEK7F&dUq1iB5J|I?; zNm}V-`KBmd*R_7SSF*+uLesm>gskx=cS&Do9J3_9@dyU#VPP7{b_vslBnTeyuKk5R z!_t^tQrxnW<|5XCQv5UE7;O@wef$ds(muJ~E#0&+YolTIxNeP=k~L7PX!HJc5?*Y z&TK7)a9*!RqDb;hcCATKT@riJ>@K@KDY-pPT*y}CuzDmcnL$INF=$lXHMTJ zrvlRM?^e%AQ5o_zb5DiOEmTDeVUu{jI@zGI> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint scene_ix = (_139.Load(52) >> uint(2)) + ix; + uint tag_word = _151.Load(scene_ix * 4 + 0); + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 4u; i++) + { + tag_word = _151.Load((scene_ix + i) * 4 + 0); + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _239.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _239.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + _239.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _239.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _239.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + } +} + +[numthreads(128, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl new file mode 100644 index 0000000..edb6d03 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -0,0 +1,143 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct OutBuf +{ + TagMonoid_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(128u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _239 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_scratch[128]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; + uint tag_word = _151.scene[scene_ix]; + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 4u; i++) + { + tag_word = _151.scene[scene_ix + i]; + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _239.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _239.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + _239.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _239.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _239.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + } +} + diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil new file mode 100644 index 0000000000000000000000000000000000000000..1f27f265c5be01104cb77bab185d0e73358e5fd2 GIT binary patch literal 4716 zcmeHLe^gV~9ltN{sgMu>a))#&iEzlsAirV%8 zk)#?mDp-`d1w{{SwM}bRrM8}gK#deFI@?42L17}Jom#5f(bMDB-4|fpwmBOT`Xz!i*u@GcYSr`z?TRD04SdZfB@EMFdJcB$#F1W zfVmxZcwkb4fsL*Jxu)jIUWF9EK^wt+gwcd zr{GC|-A69X)8_ec02sgr@4~@`f-EEFgoHM3$m{f;K@|bO>yQhsrvRJ=$Y2)KMefl0 zu`HZa&)bSAc?kd>$NbUTA}?W9_sB2;FJ)JyD{PC2E|upb1WYTZbt*tug? zw@I5@=NN9dwi%n9f#WcEbeC%)PIfh_y~i~{CIDG* z49dk)&*&(lurUap#48j4QIJ#5$_%24H(|SGwOe6v)!OEiPcv?I1Y6sUWN|gdI9ZG# zRmu#oh_*K&Ky!#dfvy@8zP_RHli#>)&U<88<0g%FZ=`Uum*u&3oLF`IUeE^N>8FEM zMMXc{S&;`PiQ!`e*qk29ay!0xRO4B7UY^@E^7eRRjx~vps+Lt^M>(nElGT=Ixh6`5`THEDF}%$5jA-!?(% zDO6G`q+pyANqI}D0aMq69*Jkc)Lmu#4FI7gtboUP3gI!6fK@HVFwZ0a`d8BHFg#)Yc6G&X$uM>g1>Q{msKKo;ml?>E?Gjva1iYUSggf6u^b!MF1~0;iVS5%!#jd|D6Ugp}K~?yGDx%9gvy^@%^}~kLe*OIS z(6kR5(yVA|-?+Sn5o{>9DoHh$1sEHBO+~q^JFR-?!g|;fCZB2`) z&2jO~A@QF*;_Eb8=x+-jX$$WQl$Xuny{ho;x`->;wF5-5-RQkwKBb~Lrg z_ZXb?UVZ8XG}Y~M_*H_97IY11tG1xrS{R$fZ+oFbu)f8cHo{6ODe*0nc-SrG8lZ}T z?=##xKEUg~4FAi%?bpoTDMPTcE^q9-jg=>_e3KX|BtBrKKd`$n>Ve(G^B&k;n*PA< zvc`Vm)o;>X0tTtha7YuvwKdcE(q=Jz*I#i-H*e+1+jkN8PWZa`x0X3n@5wE9C&%**B3Z#YZpyu-Rn@u?$;F{epz<8mDhW|hbr{}FMGKdwGZP|r@n4A?bxyH+G)e2}2;zYpk}dO@ zLVS3AOU$!+X%<*pm05k@>g~@*cjI?5mXBr#)QyVhQ)RJ7Lbe?tipNUIO8C}AF)?L& zH&n(|K3CP1M%C>PLVS0g&mS1LesSe_fA8h7Lzs3trzYyo&i9g!%|^~yiBEgxI?g~5 zSj1d_O0q{E92vLk+4YfeMXl4Q5~w93y`zI)k9^zXK7Ks__G zaiP27GkYDvQc7i^#gZTm^*FkW2An=X%XT1rF@1(&kRS@sy3oGQynZ{vgSLieoKba2 zdr2_Q;oYXBfUsWMj)%+E9tc8A8OI`VA@oiD)_VtIXlT@INiAdnCB&uS_By%4Tc!i@ zt2ZV+Rh#g928RRPw@s((!ig}lE&-%UJ`3;jx)CXjLi!vM!O?0+=SN)NXo{o1s0+~X z;3($R@IByVlE?#cw}1o43mhm0eOlUtxRG%jk}ioS0#lSz9ZX1ZH$XHbF)P=Fs)b)O zi7K#q9970^0Esnp$Zrl2yfkHwkX4U^41xrhm(uYUm>L*KhK%RhnaWC03}|WlvNp*@ zYP{i`6~shKm`Rt*0Y62kGeeilW)8Wg?{5~QSoo{1A>v<|SJbj-feX@<;*FhiV`3WC zB*h$RSn_ja64g`)8bj<>C2K}`p*S@dKO5<1eMRKm@!L&R;) z;5f;PuHYl?Tm6C)Lm~(~!EbZmBf#|x1;3-=EAv?GWD!Svs}X-9(x)|{N4rSLX#Zil z+c!T2(yaJXHqM# z6mqL!y&ULX$Sq`eI?4=Wd6?rWkAQ9|p&OZQ4}dqox!Y~qsH|LvHi3p;;3GnjcnDdh ztkgx-Pj7532r_RPjxH!HFdu(qHwEdgC{e7(#Umz*mM9G#3$AHoMJ}F6JTIOD%mhdd z09IDCn+oj!BiO2j;%jXWubov;J7(KQE}QvX-J^}owwK0&3L(95|37@Xc#c@AgI7OU z&|{BiOx_&7U5mnxow4GajFwt^qzbcC5ub~!@J#ZR!?Jj(y+@@eR4LTCyn-qIxo>4^ zsiP-eQ53I8$ki{N;w^|GLF(+$DvGs=#9YJQ`6{U*QQFj#t0>7;%rh%iavMek!woa- n1ZW|AS^%f@PvLX!zE7+euLwOpbV= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _211 = ix + i_2; + _78.Store(_211 * 20 + 0, m.trans_ix); + _78.Store(_211 * 20 + 4, m.linewidth_ix); + _78.Store(_211 * 20 + 8, m.pathseg_ix); + _78.Store(_211 * 20 + 12, m.path_ix); + _78.Store(_211 * 20 + 16, m.pathseg_offset); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl new file mode 100644 index 0000000..923e77c --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.msl @@ -0,0 +1,146 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct DataBuf +{ + TagMonoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup TagMonoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].trans_ix = _78.data[ix].trans_ix; + local[0].linewidth_ix = _78.data[ix].linewidth_ix; + local[0].pathseg_ix = _78.data[ix].pathseg_ix; + local[0].path_ix = _78.data[ix].path_ix; + local[0].pathseg_offset = _78.data[ix].pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _109 = ix + i; + TagMonoid param = local[i - 1u]; + param_1.trans_ix = _78.data[_109].trans_ix; + param_1.linewidth_ix = _78.data[_109].linewidth_ix; + param_1.pathseg_ix = _78.data[_109].pathseg_ix; + param_1.path_ix = _78.data[_109].path_ix; + param_1.pathseg_offset = _78.data[_109].pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _211 = ix + i_2; + _78.data[_211].trans_ix = m.trans_ix; + _78.data[_211].linewidth_ix = m.linewidth_ix; + _78.data[_211].pathseg_ix = m.pathseg_ix; + _78.data[_211].path_ix = m.path_ix; + _78.data[_211].pathseg_offset = m.pathseg_offset; + } +} + diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index b3f417e..e3786fc 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -17,7 +17,7 @@ //! The path stage (includes substages). use piet_gpu_hal::{ - BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, ShaderCode, + BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, }; pub struct PathCode { @@ -55,7 +55,7 @@ const CLEAR_WG: u32 = 512; impl PathCode { pub unsafe fn new(session: &Session) -> PathCode { // TODO: add cross-compilation - let reduce_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_reduce.spv")); + let reduce_code = include_shader!(session, "../../shader/gen/pathtag_reduce"); let reduce_pipeline = session .create_compute_pipeline( reduce_code, @@ -67,15 +67,15 @@ impl PathCode { ], ) .unwrap(); - let tag_root_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_root.spv")); + let tag_root_code = include_shader!(session, "../../shader/gen/pathtag_root"); let tag_root_pipeline = session .create_compute_pipeline(tag_root_code, &[BindType::Buffer]) .unwrap(); - let clear_code = ShaderCode::Spv(include_bytes!("../../shader/gen/bbox_clear.spv")); + let clear_code = include_shader!(session, "../../shader/gen/bbox_clear"); let clear_pipeline = session .create_compute_pipeline(clear_code, &[BindType::Buffer, BindType::BufReadOnly]) .unwrap(); - let pathseg_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathseg.spv")); + let pathseg_code = include_shader!(session, "../../shader/gen/pathseg"); let pathseg_pipeline = session .create_compute_pipeline( pathseg_code,