From f6ea9308ba4b986164095dcb48ca28985071309a Mon Sep 17 00:00:00 2001 From: Commit by GitHub Action Date: Wed, 13 Jul 2022 19:27:07 +0000 Subject: [PATCH] commit compiled shaders --- .gitignore | 1 - piet-gpu-hal/examples/shader/gen/collatz.dxil | Bin 0 -> 3136 bytes piet-gpu-hal/examples/shader/gen/collatz.hlsl | 62 + piet-gpu-hal/examples/shader/gen/collatz.msl | 48 + piet-gpu-hal/examples/shader/gen/collatz.spv | Bin 0 -> 1616 bytes piet-gpu/shader/gen/backdrop.dxil | Bin 0 -> 4576 bytes piet-gpu/shader/gen/backdrop.hlsl | 244 +++ piet-gpu/shader/gen/backdrop.msl | 247 +++ piet-gpu/shader/gen/backdrop.spv | Bin 0 -> 11508 bytes piet-gpu/shader/gen/backdrop_lg.dxil | Bin 0 -> 4580 bytes piet-gpu/shader/gen/backdrop_lg.hlsl | 244 +++ piet-gpu/shader/gen/backdrop_lg.msl | 247 +++ piet-gpu/shader/gen/backdrop_lg.spv | Bin 0 -> 11540 bytes piet-gpu/shader/gen/bbox_clear.dxil | Bin 0 -> 3160 bytes piet-gpu/shader/gen/bbox_clear.hlsl | 66 + piet-gpu/shader/gen/bbox_clear.msl | 68 + piet-gpu/shader/gen/bbox_clear.spv | Bin 0 -> 3212 bytes piet-gpu/shader/gen/binning.dxil | Bin 0 -> 6336 bytes piet-gpu/shader/gen/binning.hlsl | 342 +++++ piet-gpu/shader/gen/binning.msl | 347 +++++ piet-gpu/shader/gen/binning.spv | Bin 0 -> 18536 bytes piet-gpu/shader/gen/clip_leaf.dxil | Bin 0 -> 7228 bytes piet-gpu/shader/gen/clip_leaf.hlsl | 371 +++++ piet-gpu/shader/gen/clip_leaf.msl | 370 +++++ piet-gpu/shader/gen/clip_leaf.spv | Bin 0 -> 19240 bytes piet-gpu/shader/gen/clip_reduce.dxil | Bin 0 -> 4624 bytes piet-gpu/shader/gen/clip_reduce.hlsl | 181 +++ piet-gpu/shader/gen/clip_reduce.msl | 177 +++ piet-gpu/shader/gen/clip_reduce.spv | Bin 0 -> 9696 bytes piet-gpu/shader/gen/coarse.dxil | Bin 0 -> 11972 bytes piet-gpu/shader/gen/coarse.hlsl | 1254 +++++++++++++++ piet-gpu/shader/gen/coarse.msl | 1266 ++++++++++++++++ piet-gpu/shader/gen/coarse.spv | Bin 0 -> 60516 bytes piet-gpu/shader/gen/draw_leaf.dxil | Bin 0 -> 6764 bytes piet-gpu/shader/gen/draw_leaf.hlsl | 268 ++++ piet-gpu/shader/gen/draw_leaf.msl | 316 ++++ piet-gpu/shader/gen/draw_leaf.spv | Bin 0 -> 20104 bytes piet-gpu/shader/gen/draw_reduce.dxil | Bin 0 -> 4260 bytes piet-gpu/shader/gen/draw_reduce.hlsl | 126 ++ piet-gpu/shader/gen/draw_reduce.msl | 140 ++ piet-gpu/shader/gen/draw_reduce.spv | Bin 0 -> 7140 bytes piet-gpu/shader/gen/draw_root.dxil | Bin 0 -> 4468 bytes piet-gpu/shader/gen/draw_root.hlsl | 108 ++ piet-gpu/shader/gen/draw_root.msl | 140 ++ piet-gpu/shader/gen/draw_root.spv | Bin 0 -> 5440 bytes piet-gpu/shader/gen/kernel4.dxil | Bin 0 -> 14484 bytes piet-gpu/shader/gen/kernel4.hlsl | 1303 ++++++++++++++++ piet-gpu/shader/gen/kernel4.msl | 1349 +++++++++++++++++ piet-gpu/shader/gen/kernel4.spv | Bin 0 -> 66224 bytes piet-gpu/shader/gen/kernel4_gray.dxil | Bin 0 -> 14564 bytes piet-gpu/shader/gen/kernel4_gray.hlsl | 1302 ++++++++++++++++ piet-gpu/shader/gen/kernel4_gray.msl | 1348 ++++++++++++++++ piet-gpu/shader/gen/kernel4_gray.spv | Bin 0 -> 65980 bytes piet-gpu/shader/gen/path_coarse.dxil | Bin 0 -> 7064 bytes piet-gpu/shader/gen/path_coarse.hlsl | 673 ++++++++ piet-gpu/shader/gen/path_coarse.msl | 717 +++++++++ piet-gpu/shader/gen/path_coarse.spv | Bin 0 -> 39788 bytes piet-gpu/shader/gen/pathseg.dxil | Bin 0 -> 9596 bytes piet-gpu/shader/gen/pathseg.hlsl | 661 ++++++++ piet-gpu/shader/gen/pathseg.msl | 717 +++++++++ piet-gpu/shader/gen/pathseg.spv | Bin 0 -> 35212 bytes piet-gpu/shader/gen/pathtag_reduce.dxil | Bin 0 -> 4644 bytes piet-gpu/shader/gen/pathtag_reduce.hlsl | 138 ++ piet-gpu/shader/gen/pathtag_reduce.msl | 154 ++ piet-gpu/shader/gen/pathtag_reduce.spv | Bin 0 -> 8300 bytes piet-gpu/shader/gen/pathtag_root.dxil | Bin 0 -> 4716 bytes piet-gpu/shader/gen/pathtag_root.hlsl | 115 ++ piet-gpu/shader/gen/pathtag_root.msl | 146 ++ piet-gpu/shader/gen/pathtag_root.spv | Bin 0 -> 5836 bytes piet-gpu/shader/gen/tile_alloc.dxil | Bin 0 -> 5132 bytes piet-gpu/shader/gen/tile_alloc.hlsl | 264 ++++ piet-gpu/shader/gen/tile_alloc.msl | 273 ++++ piet-gpu/shader/gen/tile_alloc.spv | Bin 0 -> 13360 bytes piet-gpu/shader/gen/transform_leaf.dxil | Bin 0 -> 5664 bytes piet-gpu/shader/gen/transform_leaf.hlsl | 234 +++ piet-gpu/shader/gen/transform_leaf.msl | 287 ++++ piet-gpu/shader/gen/transform_leaf.spv | Bin 0 -> 12972 bytes piet-gpu/shader/gen/transform_reduce.dxil | Bin 0 -> 4700 bytes piet-gpu/shader/gen/transform_reduce.hlsl | 140 ++ piet-gpu/shader/gen/transform_reduce.msl | 153 ++ piet-gpu/shader/gen/transform_reduce.spv | Bin 0 -> 8324 bytes piet-gpu/shader/gen/transform_root.dxil | Bin 0 -> 4824 bytes piet-gpu/shader/gen/transform_root.hlsl | 94 ++ piet-gpu/shader/gen/transform_root.msl | 129 ++ piet-gpu/shader/gen/transform_root.spv | Bin 0 -> 5336 bytes tests/shader/gen/clear.dxil | Bin 0 -> 3076 bytes tests/shader/gen/clear.hlsl | 26 + tests/shader/gen/clear.msl | 27 + tests/shader/gen/clear.spv | Bin 0 -> 1212 bytes tests/shader/gen/linkedlist.dxil | Bin 0 -> 3024 bytes tests/shader/gen/linkedlist.hlsl | 39 + tests/shader/gen/linkedlist.msl | 36 + tests/shader/gen/linkedlist.spv | Bin 0 -> 1936 bytes tests/shader/gen/message_passing.dxil | Bin 0 -> 3116 bytes tests/shader/gen/message_passing.hlsl | 54 + tests/shader/gen/message_passing.msl | 54 + tests/shader/gen/message_passing.spv | Bin 0 -> 2196 bytes tests/shader/gen/message_passing_vkmm.spv | Bin 0 -> 2300 bytes tests/shader/gen/prefix.dxil | Bin 0 -> 4876 bytes tests/shader/gen/prefix.hlsl | 225 +++ tests/shader/gen/prefix.msl | 264 ++++ tests/shader/gen/prefix.spv | Bin 0 -> 9828 bytes tests/shader/gen/prefix_atomic.dxil | Bin 0 -> 4884 bytes tests/shader/gen/prefix_atomic.hlsl | 229 +++ tests/shader/gen/prefix_atomic.msl | 265 ++++ tests/shader/gen/prefix_atomic.spv | Bin 0 -> 9852 bytes tests/shader/gen/prefix_reduce.dxil | Bin 0 -> 3764 bytes tests/shader/gen/prefix_reduce.hlsl | 72 + tests/shader/gen/prefix_reduce.msl | 68 + tests/shader/gen/prefix_reduce.spv | Bin 0 -> 3472 bytes tests/shader/gen/prefix_root.dxil | Bin 0 -> 3888 bytes tests/shader/gen/prefix_root.hlsl | 80 + tests/shader/gen/prefix_root.msl | 112 ++ tests/shader/gen/prefix_root.spv | Bin 0 -> 4072 bytes tests/shader/gen/prefix_scan.dxil | Bin 0 -> 4168 bytes tests/shader/gen/prefix_scan.hlsl | 92 ++ tests/shader/gen/prefix_scan.msl | 123 ++ tests/shader/gen/prefix_scan.spv | Bin 0 -> 4720 bytes tests/shader/gen/prefix_vkmm.spv | Bin 0 -> 10016 bytes 119 files changed, 18595 insertions(+), 1 deletion(-) create mode 100644 piet-gpu-hal/examples/shader/gen/collatz.dxil create mode 100644 piet-gpu-hal/examples/shader/gen/collatz.hlsl create mode 100644 piet-gpu-hal/examples/shader/gen/collatz.msl create mode 100644 piet-gpu-hal/examples/shader/gen/collatz.spv create mode 100644 piet-gpu/shader/gen/backdrop.dxil create mode 100644 piet-gpu/shader/gen/backdrop.hlsl create mode 100644 piet-gpu/shader/gen/backdrop.msl create mode 100644 piet-gpu/shader/gen/backdrop.spv create mode 100644 piet-gpu/shader/gen/backdrop_lg.dxil create mode 100644 piet-gpu/shader/gen/backdrop_lg.hlsl create mode 100644 piet-gpu/shader/gen/backdrop_lg.msl create mode 100644 piet-gpu/shader/gen/backdrop_lg.spv create mode 100644 piet-gpu/shader/gen/bbox_clear.dxil create mode 100644 piet-gpu/shader/gen/bbox_clear.hlsl create mode 100644 piet-gpu/shader/gen/bbox_clear.msl create mode 100644 piet-gpu/shader/gen/bbox_clear.spv create mode 100644 piet-gpu/shader/gen/binning.dxil create mode 100644 piet-gpu/shader/gen/binning.hlsl create mode 100644 piet-gpu/shader/gen/binning.msl create mode 100644 piet-gpu/shader/gen/binning.spv create mode 100644 piet-gpu/shader/gen/clip_leaf.dxil create mode 100644 piet-gpu/shader/gen/clip_leaf.hlsl create mode 100644 piet-gpu/shader/gen/clip_leaf.msl create mode 100644 piet-gpu/shader/gen/clip_leaf.spv create mode 100644 piet-gpu/shader/gen/clip_reduce.dxil create mode 100644 piet-gpu/shader/gen/clip_reduce.hlsl create mode 100644 piet-gpu/shader/gen/clip_reduce.msl create mode 100644 piet-gpu/shader/gen/clip_reduce.spv create mode 100644 piet-gpu/shader/gen/coarse.dxil create mode 100644 piet-gpu/shader/gen/coarse.hlsl create mode 100644 piet-gpu/shader/gen/coarse.msl create mode 100644 piet-gpu/shader/gen/coarse.spv create mode 100644 piet-gpu/shader/gen/draw_leaf.dxil create mode 100644 piet-gpu/shader/gen/draw_leaf.hlsl create mode 100644 piet-gpu/shader/gen/draw_leaf.msl create mode 100644 piet-gpu/shader/gen/draw_leaf.spv create mode 100644 piet-gpu/shader/gen/draw_reduce.dxil create mode 100644 piet-gpu/shader/gen/draw_reduce.hlsl create mode 100644 piet-gpu/shader/gen/draw_reduce.msl create mode 100644 piet-gpu/shader/gen/draw_reduce.spv create mode 100644 piet-gpu/shader/gen/draw_root.dxil create mode 100644 piet-gpu/shader/gen/draw_root.hlsl create mode 100644 piet-gpu/shader/gen/draw_root.msl create mode 100644 piet-gpu/shader/gen/draw_root.spv create mode 100644 piet-gpu/shader/gen/kernel4.dxil create mode 100644 piet-gpu/shader/gen/kernel4.hlsl create mode 100644 piet-gpu/shader/gen/kernel4.msl create mode 100644 piet-gpu/shader/gen/kernel4.spv create mode 100644 piet-gpu/shader/gen/kernel4_gray.dxil create mode 100644 piet-gpu/shader/gen/kernel4_gray.hlsl create mode 100644 piet-gpu/shader/gen/kernel4_gray.msl create mode 100644 piet-gpu/shader/gen/kernel4_gray.spv create mode 100644 piet-gpu/shader/gen/path_coarse.dxil create mode 100644 piet-gpu/shader/gen/path_coarse.hlsl create mode 100644 piet-gpu/shader/gen/path_coarse.msl create mode 100644 piet-gpu/shader/gen/path_coarse.spv create mode 100644 piet-gpu/shader/gen/pathseg.dxil create mode 100644 piet-gpu/shader/gen/pathseg.hlsl create mode 100644 piet-gpu/shader/gen/pathseg.msl create mode 100644 piet-gpu/shader/gen/pathseg.spv create mode 100644 piet-gpu/shader/gen/pathtag_reduce.dxil create mode 100644 piet-gpu/shader/gen/pathtag_reduce.hlsl create mode 100644 piet-gpu/shader/gen/pathtag_reduce.msl create mode 100644 piet-gpu/shader/gen/pathtag_reduce.spv create mode 100644 piet-gpu/shader/gen/pathtag_root.dxil create mode 100644 piet-gpu/shader/gen/pathtag_root.hlsl create mode 100644 piet-gpu/shader/gen/pathtag_root.msl create mode 100644 piet-gpu/shader/gen/pathtag_root.spv create mode 100644 piet-gpu/shader/gen/tile_alloc.dxil create mode 100644 piet-gpu/shader/gen/tile_alloc.hlsl create mode 100644 piet-gpu/shader/gen/tile_alloc.msl create mode 100644 piet-gpu/shader/gen/tile_alloc.spv create mode 100644 piet-gpu/shader/gen/transform_leaf.dxil create mode 100644 piet-gpu/shader/gen/transform_leaf.hlsl create mode 100644 piet-gpu/shader/gen/transform_leaf.msl create mode 100644 piet-gpu/shader/gen/transform_leaf.spv create mode 100644 piet-gpu/shader/gen/transform_reduce.dxil create mode 100644 piet-gpu/shader/gen/transform_reduce.hlsl create mode 100644 piet-gpu/shader/gen/transform_reduce.msl create mode 100644 piet-gpu/shader/gen/transform_reduce.spv create mode 100644 piet-gpu/shader/gen/transform_root.dxil create mode 100644 piet-gpu/shader/gen/transform_root.hlsl create mode 100644 piet-gpu/shader/gen/transform_root.msl create mode 100644 piet-gpu/shader/gen/transform_root.spv create mode 100644 tests/shader/gen/clear.dxil create mode 100644 tests/shader/gen/clear.hlsl create mode 100644 tests/shader/gen/clear.msl create mode 100644 tests/shader/gen/clear.spv create mode 100644 tests/shader/gen/linkedlist.dxil create mode 100644 tests/shader/gen/linkedlist.hlsl create mode 100644 tests/shader/gen/linkedlist.msl create mode 100644 tests/shader/gen/linkedlist.spv create mode 100644 tests/shader/gen/message_passing.dxil create mode 100644 tests/shader/gen/message_passing.hlsl create mode 100644 tests/shader/gen/message_passing.msl create mode 100644 tests/shader/gen/message_passing.spv create mode 100644 tests/shader/gen/message_passing_vkmm.spv create mode 100644 tests/shader/gen/prefix.dxil create mode 100644 tests/shader/gen/prefix.hlsl create mode 100644 tests/shader/gen/prefix.msl create mode 100644 tests/shader/gen/prefix.spv create mode 100644 tests/shader/gen/prefix_atomic.dxil create mode 100644 tests/shader/gen/prefix_atomic.hlsl create mode 100644 tests/shader/gen/prefix_atomic.msl create mode 100644 tests/shader/gen/prefix_atomic.spv create mode 100644 tests/shader/gen/prefix_reduce.dxil create mode 100644 tests/shader/gen/prefix_reduce.hlsl create mode 100644 tests/shader/gen/prefix_reduce.msl create mode 100644 tests/shader/gen/prefix_reduce.spv create mode 100644 tests/shader/gen/prefix_root.dxil create mode 100644 tests/shader/gen/prefix_root.hlsl create mode 100644 tests/shader/gen/prefix_root.msl create mode 100644 tests/shader/gen/prefix_root.spv create mode 100644 tests/shader/gen/prefix_scan.dxil create mode 100644 tests/shader/gen/prefix_scan.hlsl create mode 100644 tests/shader/gen/prefix_scan.msl create mode 100644 tests/shader/gen/prefix_scan.spv create mode 100644 tests/shader/gen/prefix_vkmm.spv diff --git a/.gitignore b/.gitignore index e0229c8..6853bbc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,3 @@ **/*.rs.bk .ninja_deps .ninja_log -**/shader/gen diff --git a/piet-gpu-hal/examples/shader/gen/collatz.dxil b/piet-gpu-hal/examples/shader/gen/collatz.dxil new file mode 100644 index 0000000000000000000000000000000000000000..a03f96a31f63a6f2055144854e947d6299f09e55 GIT binary patch literal 3136 zcmeHJdrVu`89(;D_}ch#xrtqRT?4(w0YQSYi+P0tnjb*SD=~p2s9}oDYfLrFV8VD* zn{ABkq((IjSxitP$b+?A#vqEiP^A-Kmw6Nx(xnq5ZUif#X_cic>w~OSwsQ$gsOo>4 zrd8UJzVkiKcfa$U?|k~*lT)tNa7RCWf&1t0o5Mcm{`Kom26{LE03{9p26+~wB1jdG zjF4_ZD(3;v45>6PpW#D&bL5xqP1@91zW*D#uJjP2hjwQ3gKzMEz!9>=#^$k=D5$eJ zZ5_f51>IsSEmM`f2m|Rs2sjD@yS!rAmu9m#cn7sJ=id#C!NuVW{GOOY0HSnmpG@d|%@0I$Mhl0gmQ zo9mM+<3C8Qcx=zMFFTdViD>A}NR`f=`X1%JRqVUFe_o?~T%g^R^e9k&zS5nPmvE=N zicAcpUT`N(X?I}UeJ6)p+v9%Lm2Ql=VsP>EZ@qSojuGQ;n9%{44iJf(#+}W2XIh=J ztqy^5*spa&{T$H_XAs`bU`+#r064Nxa=lMj1#cssTqvth4^x!?a%7gR zmh~23SM&inhqKv-$_PFPg+@iRS1j39vHXnaJp_ll_GqW~8rijH4P9X7+dV?5x}wdS>U}&x~2y zk_yejs=@^l71<}eH2yoDfllM2Duc1X#?Vp+B8%fk_?LdUQfp&&h|sBp5pr5hB7aN? z8fSHo4o{ui zJ=@*T(CIrC{m`nSToV*t&Gf1ccXiJW&&GCNn(1yRAj@Pv^Ttv8x0A(z)r$L*^Y^Eh zhtS+8Mb_zzE6ioZ*iiaq#hVG70}lLp?y&K}ul6-uGBy;fmc{tYgYS0k5;mDT&I`}O zYW4ST*Byf~ri1sE9}e7kFL-Zd2z~x(7-;l2&!xYm7#mAaNj7R;0(mmu66s%xtQB*z zU79_1pIdFg+FCKQw-H|f;eR2C&n!eTu8y9Zi2B1|WGxf2?$#-tL9zpyY&ViU)9P+E zVI6#|P3tt7uyzyD(&{wPPRYzR;=YGiV!|7k@DEpsRUMH``lIg1BIjjMlZNP8=6L3{ z&dfPY#uX&%T4$CW$(&ggx9G8sIxM};*#u{c)8uiQ^jJ$g*0L4?!~VqLJj!=AQqCr? zv(0Nwe6UJ9SR_6lC+;&yWneIJc`$NjeGVETr)80TYt*!X_a~1gBcRFfuFqF}dwMl{ zTu5fG?|X35uWgw(k<1Yv?w4UNnvnK*XN$!txuGG}7J17w@zEmh3zB#^PAqxCKh=eQ z#&lW(oM(edcB4r{6f3a(+eiZt0ePM>?DVHuKUF>zvh5VXtOe?is8TC_`?odDL2-_M z-@Y7Zxot^dE$SQ~+Qyw7AtPw^lgG_DDC6pUIryd^%Xc^P<^8Ej`=fX2rCD*ym-B-?QvZLiJ9{V@&xe+F`6`wC{!RI=-_MOG&>i;f2Gq7&| zQ~11b_mKTZ@L2!@L3}2^4WIpa;9dlOfrW{6Y<>>%e;1qM_{e1yg@;y|r93|!$3LXe zuLE=Kw*oVLBjV>PX{xJU!Q_CEav0{7r08IMe2l6r@q~DYP4A3QB=3?ZmKc=_*hd_ zjk(nJ3qo?XuuuAGMfA-hXQUqld&k*#3;QqnKQ4YZaM3`+9)@=&LUR+@wZiLXYa*09BEXp&O9%1i+@Bs}Lp?Uw2tK7Mdb&sSl0> zT^ufyl?w{=dYu~J>8OO7QIIO8kjBgKJ{p@)zM~W5nXCyy@gp}y0lsZQ#w*cV<IZ3*tt6U{6=Y_%D>DR4?iVdV) MMcGvjHYqs%2}h{p0ssI2 literal 0 HcmV?d00001 diff --git a/piet-gpu-hal/examples/shader/gen/collatz.hlsl b/piet-gpu-hal/examples/shader/gen/collatz.hlsl new file mode 100644 index 0000000..762f06d --- /dev/null +++ b/piet-gpu-hal/examples/shader/gen/collatz.hlsl @@ -0,0 +1,62 @@ +static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u); + +RWByteAddressBuffer _57 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +float mod(float x, float y) +{ + return x - y * floor(x / y); +} + +float2 mod(float2 x, float2 y) +{ + return x - y * floor(x / y); +} + +float3 mod(float3 x, float3 y) +{ + return x - y * floor(x / y); +} + +float4 mod(float4 x, float4 y) +{ + return x - y * floor(x / y); +} + +uint collatz_iterations(inout uint n) +{ + uint i = 0u; + while (n != 1u) + { + if (mod(float(n), 2.0f) == 0.0f) + { + n /= 2u; + } + else + { + n = (3u * n) + 1u; + } + i++; + } + return i; +} + +void comp_main() +{ + uint index = gl_GlobalInvocationID.x; + uint param = _57.Load(index * 4 + 0); + uint _65 = collatz_iterations(param); + _57.Store(index * 4 + 0, _65); +} + +[numthreads(1, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu-hal/examples/shader/gen/collatz.msl b/piet-gpu-hal/examples/shader/gen/collatz.msl new file mode 100644 index 0000000..1b75efe --- /dev/null +++ b/piet-gpu-hal/examples/shader/gen/collatz.msl @@ -0,0 +1,48 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +// Implementation of the GLSL mod() function, which is slightly different than Metal fmod() +template +inline Tx mod(Tx x, Ty y) +{ + return x - y * floor(x / y); +} + +struct PrimeIndices +{ + uint indices[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u); + +static inline __attribute__((always_inline)) +uint collatz_iterations(thread uint& n) +{ + uint i = 0u; + while (n != 1u) + { + if (mod(float(n), 2.0) == 0.0) + { + n /= 2u; + } + else + { + n = (3u * n) + 1u; + } + i++; + } + return i; +} + +kernel void main0(device PrimeIndices& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint index = gl_GlobalInvocationID.x; + uint param = _57.indices[index]; + uint _65 = collatz_iterations(param); + _57.indices[index] = _65; +} + diff --git a/piet-gpu-hal/examples/shader/gen/collatz.spv b/piet-gpu-hal/examples/shader/gen/collatz.spv new file mode 100644 index 0000000000000000000000000000000000000000..886797e6937b1918712237eaba6a9ff12daa1c67 GIT binary patch literal 1616 zcmYk6*-leY6o$8yB7+LZ`=hyTvq#o=R5ubzCGu)Z(iJSYi_nF+C_3);#TWPdjR~y^ScdflF)3!4f z>1`SQ@~D1L+ivdH_iD$8*(B;cN`*`C5~Pp#3dGqA*_l0&qYSpEkZ*wP(;n}@OOQ3RE8orK zF=QW>J?ifa^SMp^S}wb5+yXpz|YkZ}L_B-{v zXy45ra*TreA#)ikj@U7{0`%8)EVzR*ppm2?#jJJzFDyO{$UsUHvR)Zt7irP literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil new file mode 100644 index 0000000000000000000000000000000000000000..0fb9622eda673c439b74f7538c649b770edef248 GIT binary patch literal 4576 zcmeHKe^3)w9)HPhHk)J#8weN-*aU?FV!I#>Ab8#U08y|P6cwvIHy{)(U<67*J>7&z zqQz@a)Kq#lDD|jATTttX+FlZ`IJ4*Qd3{^EHFlYOrJ{m=Tw#Y?Fgh*OgRKWl%4MetJ$E`(bPXMAW1 zXeczUOyZJ>u79ofU4Ju)Px3i%ah^8MjRC*_0ce8>4EghOxI`ozb3k0D_gqyI0K5n> ziB;vFGZ>BuEz=vdz36<5OibO$vk_4OJZu{JYym6J;Ollubgg8fJOx)|ixU?)vYc|2 zKGX3M2uswBWJ#Tkkv&FjuBBnP?z8P^Y$lcqH^Hq<*)YUyS4i{Q0_w*&7x93LSm7Pc zVGIIef~|~Bm4nxdcs#I81;9**B}Qh6RI9h5yCPd{u=>>2;*F*v_sR4rs>96p^}}n!_;;T)^HA=f!|vd4kzfkDgbYkzbJ0y(f&Ke^_&7ekis@`noAxg z%W*b{qUW7K_53J`ZzPIxaY2@mrJ@#LZ^au+1lrT|=ni1q&vVa3Xp=u*ExZfmFn%hM zejta`?C9Qm{yaMD z)x`H8vL25i7cIqG18lQ-9<)WF0ZPnwM1y`%Z)&50%1mp@gVCG&RNx(y$7+a=sZy0= z$2S`JZp`>V3zgc{ar)ttN4t(c+?jszNSBCDel6OyOLFesfwbnQ?iEaLKG>`RL4lo+ zcq|bPQ!?2uNu^Rhv$?SGnr2>dvKGVyttqdn3`X6Nx@GnMck(jO+#JL_4RiDjj*i@NU*%SE2wgP=ChGj_oaB>__JI>gP5fbNg!=s?5TTxUe?MZZ-=yn33ulyIExq>z7GyIHki>U^x}|>6moP zD4k9AL|j#eUr|T+Ow)>~ZE2_L(gyUY$B^{Xb?G)Ft$$2j%?LM^2$M_f6|iRQW~bfE z2&-d+)prCi?K7->6>eXzuvfV3wJuBi%`xfCA?dds=?w}g=xqrfX$kN5*Mp|;K6QAH zC8E!Sz3P9AO(ZB-mVed)W_f zZs{)lZXKnTo%ltvl9==!%T`#DR=)YoolAZY z#%4OX*dMA4DCMZT!ZzyXLQq3f;3C`&09o3_S#rMzz`cCO^+!fmJhP&Dc&tKHAO}{D)%i$4`6I%Rh)v`ZJpqAagJh1s+ zYZ?4p`UNecH=*oR9DL1hfdW>Gu|AIOe+Mpv?SHFb(Kvlnt02UMl}fBf1vM;#`49bC zYaPo{hJLafFJM$SUxU7j`5FOZH4-yE@?9K^#s`zMV84hD$`l=@D>V83?y{IRu`5Xm zg=W*$j|MLeT^)rlbo$JIZ$z>1rPsZkyDh{U#4GKrQ6vzQ;ipCXDP@Y2frN%wsbIbJ zTA~LKhdVkye7&o+ad+3d3q{tr)in=Q7gbcQSy#T8e?Szr|Jf%L%}+c(*yL%`*MFX9 z56xT}{lV?brDrrzbf0s$t9SSLH{LtDWB2)mqSiSuwy3?h`f$8d?m*32s_^0AK5-kp%}v#0STJ4D3sGS0UmQcTnFJ_c$Up zx&b0?3-5Ew5yx&14(_)6iN*m&)lIZ1@DIpzOa!b9c-d=kS;APaL(6AX3i>Hp_9Tm$ zr~(G4a10C2TJvwNkwfwDtmzmIgQ+g>GB;1r5RH1w`T>lg2ALB)MSmh{pbA)yxhuSZ zcMDrh?IS^VKo*T1C46LqLIrTEH&0mH(c}$-vGgoP6@?nLe+g;j-5)$AEL920xUdY@ zBsL;kXB3z@>}k&=CKp$*f=WNuZVA;1^D~tt5!Gxosm?b!wl2C-AK;)BK)IbHlF>GR zbq5IJY+8Z!vE(+stev{-i?ew@uqBw~LZV`xaV^y&Mp7>8Xr3ZOMP-{| zUaLA}I)=gL<-Ws@d%+iv>}-egq>lKMlk;qgJq7B9>)jz9W{)=3Dnk&UrHnpPNS;3fBQ?_64R3gQoXCvX<-`BK1ESsoc^v}Sq4`fP^*?zualm~8M?=}j9Za6T=P>p|b*bL^3^B4e!)|ylE(C8864<;Bj|2 zrywEtE;-Osq67Uw26%9?gaKMGWk_kA58-V|x2>nmM7q!=27{>vHSMD|xeE-yukJ6)|nh9=DHGfKd3` zT!AIvJVbElp|kO YLo$-ik==HVe3v7eX7CeW+JBXQ0-yiGH~;_u literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl new file mode 100644 index 0000000..aba3cff --- /dev/null +++ b/piet-gpu/shader/gen/backdrop.hlsl @@ -0,0 +1,244 @@ +struct Alloc +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _67 : register(u0, space0); +ByteAddressBuffer _166 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +static uint gl_LocalInvocationIndex; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; + uint gl_LocalInvocationIndex : SV_GroupIndex; +}; + +groupshared uint sh_row_width[256]; +groupshared Alloc sh_row_alloc[256]; +groupshared uint sh_row_count[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _67.Load(offset * 4 + 8); + return v; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _134 = { raw2 }; + s.tiles = _134; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _67.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = _67.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.Load(0)) + { + PathRef _180 = { _166.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _180; + Alloc _185; + _185.offset = _166.Load(16); + Alloc param; + param.offset = _185.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + bool _216; + if (_210) + { + _216 = path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = read_mem(param_5, param_6); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11); + } + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl new file mode 100644 index 0000000..1c0a0bb --- /dev/null +++ b/piet-gpu/shader/gen/backdrop.msl @@ -0,0 +1,247 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_67.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_67); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_67); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_67); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_67.memory[offset] = val; +} + +kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = v_67.mem_error == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.conf.n_elements) + { + PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param; + param.offset = _166.conf.tile_alloc.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1, v_67); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + bool _216; + if (_210) + { + _216 = path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = read_mem(param_5, param_6, v_67); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8, v_67); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11, v_67); + } + } + } +} + diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv new file mode 100644 index 0000000000000000000000000000000000000000..2bd17d8069b3d234ab7866cc0dfca0df47d22062 GIT binary patch literal 11508 zcmbW6d4Szjb;p0nyqScAErfjulLUqk0uqY?253TpLm)^ZQ4x{Ho0&J0Co}I&X5J(L zTmnQv8x=*xmM#=1TBTOoYB%eW+FG~nJ6-5XX`u@(qEf}r=lAZ(+<8L(=ueO5d%ow~ zbI(2Z{_gut?3urEK~c;r78MJN56mmdXGt+1rYIH{eWf1Ua?O@=rrRUuocFSG4LH2$ zDdUVef^og*3Nle^G7~Myu+CHt( z(dl}-fLXvAilSNHU9B0nYG&;^{yVgG9llC&=$zO}aTxY=;})uMY=vI1?)pTvb(8j@ z;u!39Yi4*y6?WBhdunF5eRf&E*)wa)%$;#di7V$zd5%WU*0i`d348Zcqg{W|JQSJ6 z9*@p*oiHa?PjMpluG)B6Uq5)NUK@GQwD?`uGIXBvRP5RMD#Z%yt+n=!SJy|&zH0mA zF{fj1Z;aO?M%yQk8NfD=y!)f=Gc?rNqYln9(D3K`LCQ~_Y~J)?g4&=6nnE17*a(tu9&yk)JZ$F5CDXNU$+2wa{X7l%yHFox^QtZdaZdod07nXO# z*m!kIYq&PPxw#9~ZnT=4n-v>Y09v%-z%)zD4UX@ja>tM|b@zt8mRyZQpA@J>~T5~!%BA0inxvIXj z2Yn06buPv?!l9XHHCv65$T(ZK_v`R@V=_FE(|fk$hZ@7PzD`c>+|oDQt_|N5-r3%I zpLfRQ^>q4vBtGkzTWiPXZGQ^?b=v1ywM}?{n*29pT7aF<%P!id!TQm)~-cT z-?@k2JLme+nNhH7lJgnnysN#d9v1L8<`Hzc_gYVWis=8UZrPj=Z)&9VE6r?FG+cr~cHuXuKrkL#jb+jH>V zgRX4_FK@fmu8mt7@3dQmtIOV>*@M)&o^|yh-ba^H%?7m3%8ok7F|6eBIPX5=yk`dKsbjtK%)1f20`UoZ zdB-emzb(B}PDy@$6SPlG_I=4d32j{upndYX9_&y*z;)MZTwbgB2PSr)%xhTPb^DtlV+Tbz1UO(8ivg>}B`{!20FP zeFj?2-1gQ!oQ>{B<{@V#`y8;(ndGx>&*~}Q^=Ru@My?GBYp+LK_oe9hh`x2n-k7+^ zy(!to4<>95{pxLqPuOGNs7-$haSq4NpzXx~67hS$&p$uAmRqrXZ;ZbkZM;2+diH_M zum4W8e*2~UZnQqv@(eQ152E92jf)+cYgws)mm*xr@e?$rZm$1kJ14<@V~ zqW}LRG5-J3&i@eko!8ob20b5BhX3+6|Ee#H5FPga6CS_kJ}hrJhj4J&icy?2~{EXm7xPwa^~eXRXkFuiE! z^sMbi)M4M5;@;nSJD1nhTUj5q$XnXUiP%r@?cF8j&McK)X_ z+xX|Y>=pDN=2QFamfL>6<+k5%x$QSxZu<=vw)OfA7q;5(x7@zI%l3ON_xmjuw)y;? z3tR2?TyFb47q;=YWwyNEbh+Pey0F!L)8)3`blMlQ|DNslBHk&Na5_8-+8a0_-ZdXU zyg#-f5&yx&8}C`s7JE*NbL=4c!^lQt9};7KH1QuJ#`osO5IM)0!?Pt9^Zz*5`)Uk{ z96te;j~qV-`AwFye2I5^)~|>-%J? z`Dek7m3O|6qkWuD-^UO+=M!(B+vfiS68XKy@;QD!`D4ycVq4oj=5&vK0g?L@vNv(R z2zH#lt<1dw{Y!|vz4X1*c75KBzk=lRJC}FmuO{ENl=s&Xhbs4B0s7Yw-(zbwx3+QS zaNgfQoKK(cxAuk1JAnQivKn!`K4ZKi&9;(j-a8_VK;53Db0 z_MK1FG0az~1^&f)e%;`D%W3-RwOy3_Na-K7B)bl6cs5`IcPvP}F!ul7Y z{|u3{Z{H?IEV||f3eCnw4ufS1fzQ=zJ zuP@^MCW|v)^y_cI`l7#&fgLL!WB(2ubL4yY_wf26?jN$ayl4LiuP^5KC$MAXT$8-vURzzA~w69z=jDHsPyJ2S|2_2ch>x}SP3Ks3)G`ljj<~1i zW6QY*o>T4bg3aYv@ni6wJ;iUW_}!_1?}HmdB7ZNOe7v81U^#2{o$x!&$GP+^K;)cD z9C;1_=RI45ZQWl=chzES=av6NdRGqxJ5HbLkG$%5R}TX_uj?}2Ij!pmw2yTeb2uVr zUE-+gNN`@)670OLqp+<@KHi6=V8`i;JwF=kp6iR;$AIO-J{J5TV&b>#IBYrNJPZ2d zTx-;-ezE;J9$)u9JU&K-CDd4;>%do94)_W?p>ov~W zwT-d%$fwr!44Ypa`TOzN=jg?9Y&q|VcrWA}AMb@{_=S4m@7ig^T91D##2lxC<;}MO ztsTBIz^+f<09f0(R-vs`?K`*{eHP-Qe@*hMBiGqrbA){k*!kj}dq z5?hDg=a@yE^ADE*S41ttJb#9j#bCl%fa>|*1j1m z7iU+Woa1Bf>~oC25^R5Com;>Ms98Q@t^&*3%PY{@;d>QWF1~552FrO~jPZAKEB3wU zkEFYF8@6%s_URh5kA2ejYDCVpies+VfTLD_D_@K4Z>793{+_-LEN`vb(b_To_27Zb zz7A}@sQnFKxwxNh0Q*>%zBeMnh;@l0Zxw8wI6pUni*Z9>d278Mt?fQJ zuX8w8_~QKwpE~l4fE^$6*TJq;-tpI><=o>1)G!9NH~BZv4s7{&uNq*vO~l99dK0$1 z>x#K=!Zv?=ca4MP{5_AD32e_u#5BRi$w%%M*!=pec@!;Y4)=8mY|Z+1qO~J`8*IG3 zX|%R~`}jRH1MWvVMxQa0XnFB2wCl0&yTSHd?cH`W`WD1T|EZR0lU_? z>)r}BM&5Izy$3P3bBSZ@+rX|Pda@TRXD_X_^kExsPi_ZCPi_NiN3MO~=*b;m?dZwd z!TpSPjJ_E24zRt6`R@eF86SO*l z7VY@+&@tZqa<6=jLt^~#2|NCT44()#*U9Ku+xcMQ;@>_mOm@_M0k-*Em$BN>hgX2j zug`JXo^jW;4smUc(f3h$A%78~&)=O(&_2GG@)sj=)+lx##d0y{WnkwF`;}nV<=>ct z=*y9f$R@;CYm+lBKNsFj{fN&gNc3)5!uI4;#QB$_qhFiBdB3jgvad;Y^zRC6b6SUE zwWIbeVCT?h9_{GgRbcz)7=6*dSAq4}zpZE=`zL=jB4^EF&&#%C%g4Oi!Oj~!dkr{x zrZ3`N3)W|Su1h<7uLH|Dhd9pU0C+Xxa|RN9U6t@k^qGh?o`sHccO5wI>kXOhJ5kGQ zar&?+xVTCZTxg*&wzI&JKp&>V7m_A`RK{@VEOnBezBafY zF-D&;?xB3lw;Sx5znuOyxfv|C2Z@+lz{bQLy%{WbD-tnp0lSvS^H#82p66{TCi3jX zmNU;*PV#B!+YoDT4s+~9JU_Ri??60Lo*&nuZG6=5cChR7-?dLizXOqT{GI5#5XbLB U9ItJR;|I|1MD+W9ihV!74`1Evg#Z8m literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil new file mode 100644 index 0000000000000000000000000000000000000000..e24a6d3e8724ffe415eb3595008849f7310679c0 GIT binary patch literal 4580 zcmeHKdr%Zd8t>Veot@nsmRVTL;x4)if`}vzNEQ@K_64qrmqAeRk#m+;&;TxvBcNU! zmd9d(l_jD;AI;;alxE&-#LJLVj$JHKHxpp7V2h8thh9f@uvgxrK0i3Y)nGPk3Gb?6Em)rBau z172ff#Y3_!umyozXi=Fd-Ne_{J_0m2^&zPO!l z1A)eG0>Ehl1(^Q6157i7g#QroKtl>e`R@|%kkwHd08f;^DDPzH{b!8#oI(MR!9IRQ zt0GjM?raiAEk1{t1d+6WA@ef`p@re-$QAh6Sf)^DIHSL@6EFu5*CJGJ_2;XGXQ2ef zPe;?8X@X$$qsm-qc{18PcURl8t>x(6(jLUnE=_LR+8mP;BiR@8w|zY|)p5(prMb&K zps-+!-KzluxK*PgFoU(I#10Zv4X8GXSd;eg+;is&>>x=MWL(o|WeL_aluqe(`d%6w z#6~^3*j`lL>o#X#MMOtH-F(E2wJUW%g?o?du=g9RopeyKbz?~|c59yoyo({6jtomI z(lqY;TBq2JGc()i1wCD7=AAy#b8=pH%IV`hVgYqqyla>A!uJQ0+n)YDC!+08n+600 zc0=Op@o<=y%Xdl3Ri?RZxy{#gixU$KAUbGcNkw@u<_b5iZusljZ>Oc{3?MeDBEM=w zdBx_k{Maup4d1w~vsg8Ft{%^4@u%D_!}Lzi^e%(ylxG~zW9oMf9vh6@d30cB$xnp#1;o5?z<$dGpZ+frZ>>oRfrG~|)ytLqT`gnZL;l$(dC+8q58pwB3j}_f|^;acF zi%OQ#E5m)|?H6k2O3TW(9+Dn{*S@VSi2V)B(La1+?2hXizmmh)hKFFH?msL{JRRTD zGsmL>{Qb*-_isMIse4D&1qx(QqbaG**JR{G)m5VM)>8612>gnaLq;ZDV6ZTHK zx_}n?X|<{WbWxvaQ4_jopsK0DCfZ7fsx1ziO|->^R#rJ|8b|1WTz1nb8>ItF=)h0L zW#f!&KGhp`O{>1D4f9%O6wo`8&(tOlnHIc`rktrwsY8H&W{Y*qJb z)x9-g{Z{-vr)fc-X+f*MJ>^SkF(oyk$t}KlaMDS8@)b0>%SZNVMcZuXmS{&M=Lo%M zlHDD_$279fM(}Sb*&VlR)EW2{6F5QF*7PAWhxPnKUTYX{@bCUo4ZM!9j~o#`TRyS+ zA8&2<6@9yv#NKgvHaQ2MeWAwflCDBljwCML{^?iWKQnpkkDu#D$m!|0)Fqj`e&e;t z-~Mpza^Aw}#C}8Vc~@A1@6+TLW`c}5syUMJ+f1s#{0d4wR`!>}Ou$QDr97(-5z4!F z1^?G|EMrybYtyRuGzE*f;v47F2UUZU@>G`6n+bURBw(aan9iF`IC52*kb8r zUbA8)_cZKzD2fkim;&Nc8fJ%M{|_4WFx=-)YS^})m4>AwWcWQT{<&(|%TlOi_YV*3 z{?}RtKbL+&%kXkTU66NUPI)t8bYAMF5=#1z}gvd)`#9p!%@U=f&m;53qY~5>*=d{>b>2?(Vddk z1Q`^XZPz{=zBF>}27ICG&kcFUl*?av!_&RHhJ2HJ^(e=Pg~DRuj9BncvGQ~vsUz1b zIZuOu>;>eJuI>-s=;>(Q-E(od*dCKt^+;uYS^37zC7FVQ;?M)nKdEeca_?}9yVKP0 zS-fLf+N!AcC(>4((?#n0oufT{y9eL=^ZA{-2bYUG7Q)#4l8UlwW;3sRnSy{{^y3%( z_$mFcYT&;T6vtp}RV@Dr%m2S%1waz#(n}5!n_PNy>A^9{p~)Wt`x9Lis3gf1)W6*| ziAoq3KqZ~(ewPB3Q1)zb6L8M_f~bQ(OY@cfp5vt!MBrjzXThsT>V8jXou|2pax=hBkchlV zsR4xDlOW3D+=kGmD9mj`gPfk&S{l0_`++Akwt(YfnPp0g7|9yhDBJ{Bya2}hlSz={sT$clWugB(+C*2Y&U8Khb!9Wm+h5+pibC=gCJx1d`+^iwpu}B<) z+{N_d3GA$1NJk2wo3$gGiokKWd)Kfrnlla>ZN7j)s>!h1;5tE`lebJKySPfkfEeSw?xbeETrogSZ>$B-kL=Vni7Z9=*=pLve$bpt&` zDoWYWA7S@!p`Bn$j?qy+0h&f}t+MuWJd<*`1ew{=LIiFJek3A7 zmQ2)ml{1 zv0VBiM(L5%rtdfVQ*VM)q2>s|)+4jC!(DLY4em&<%c~t)(@-E;rK}J{NhEG%Nm8DG zEC`)3J34O`TR%Fxkf*M(KjjioYTZUjpuIC9a!BwpXLvNZG=y4V0=@v}6OEo&G(i}7 zrgtS3Oy_X?`MjJxWPS{LG;SXSPGA@HLYtXPV2~6!5|9O+=!jtgbV$f>?@H&t_-n__ z7HkpE4^h)OLwt^v>pHO|6f5V1*_?3cRnIg^SwcJp?~d~zyN8Wgoh^obP7PeHGNv`M zlh)wV;4`%jk0W~73qyE~vM6*yL)AWPipFxf@{{WI> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _134 = { raw2 }; + s.tiles = _134; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _67.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = _67.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.Load(0)) + { + PathRef _180 = { _166.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _180; + Alloc _185; + _185.offset = _166.Load(16); + Alloc param; + param.offset = _185.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + bool _216; + if (_210) + { + _216 = path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = read_mem(param_5, param_6); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11); + } + } + } +} + +[numthreads(256, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl new file mode 100644 index 0000000..de43ebe --- /dev/null +++ b/piet-gpu/shader/gen/backdrop_lg.msl @@ -0,0 +1,247 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_67.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_67); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_67); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_67); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_67.memory[offset] = val; +} + +kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = v_67.mem_error == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.conf.n_elements) + { + PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param; + param.offset = _166.conf.tile_alloc.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1, v_67); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + bool _216; + if (_210) + { + _216 = path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = read_mem(param_5, param_6, v_67); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8, v_67); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11, v_67); + } + } + } +} + diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv new file mode 100644 index 0000000000000000000000000000000000000000..ff2b1d72fea710d523ddc3d06141513d45abef9c GIT binary patch literal 11540 zcmbW6d4Szjb;p0nyqScAErfjulLUqk0uqY?253TpLm)^ZQ7IyiH#2W0PiEem%)Chg zxCDrTHY$jMmZ}I8tx~IP?P^<_+FHBs8(rv1X`u@(qEf}r=lAZ(+<8L(=ueO5d%ow~ zbI(2Z{_gut?3urEK~c;r78MJN`{otpv!s|0QxuDfzETfvxn|2b)9sOS&U@9l1{_}W zlySx!!MI*@1(~QdnmXQ(+>1Pb97zs+OAwQZ9oNHu=A#dk+@fM|OLg$-s|UASRvn(4 ztWLLY9n80* z#WC3J*39sZD(tH1_SDR9`|PrSvuDmxYv&-+^%;xVYYwYY-rPz;;-Lh21E-dee zvGMAb)^Kfnb8{D}-Dov8H%IDw@b?yX6IUOvPt=?3a*$*0g=5U%cx!0RpiO1Y-r_#= zRBLy2xHZ${)h=tiA6@QC`F<`e@1&QQ*A!*^L+J8#MY~tpKKYq_7<*%@IocTO?7y~8 zvzjVq(eA&z``@fia#E-3gu^Z-v>Y09v%-z%)zD4UX@ja>tM|b@zt8mRyZQpA@J>~T5~!%BA0inxvIXj z2Yn06buPv?!l9XHHCv65$T(ZK_v`R@V=_FE(|fk$hZ@7PzD`c>+|oDQt_|N5-r3%I zpLfRQ^>q4vBtGkzTWiPXZGQ^?b=v1ywM}?{n*29pT7aF<)y~?d!TQm)~-cT z-?>NNJLme+nNhH7lJgnnysN#d9u@F7<}q}+_gwin>N z2VL6=Ufy=AT^qMF-f7o90EY#x6z78L<8i0;6l<|NWBJVhPfoRlykm{M0y`FG?(Nv+ zYo8Wp%(ZaS^_}Uy>n%o84A)BHdWuP7W+uvzI- z@{U>Bep`B{oRa+hCTO3W><5y4656^RLi^-(J=~$bhwHA@xV%>L$@Qds-d(}DzLkjg zOK{%Fa^4BS*)uuc-Qc{7_C~Hh{k9WOqH7|%pM;?s}qc~In!Cd#uQ@BYhcFC&9{Z_IN3*EzIftg()9 zPvrD1=lr@Kjt}0k_Rm^^Gu~L=G4luSoUXM~Z>8*sv2w>T*J;UDK^uE|vX|i-0PB}C z_Zet8bK6_{a5lOhnTMQ}>~p|AXOhplJ*%gH*Q2dt8M!tjti2v_-It>0Bl^}Qdt>4v z_oieUKbWvN^sBcaK4Fi6qc;65#5o*4gSHm~NW|{}zxd+pT5iSmy)piFwDI;N>e&Z2 zzy3SX`t6tY`_THF*Y#+77s}cDd(iqkSH^!3txw)~ZSP9Cu)Qm_-K&Stj$cN1A5K_1 zMF0OsV*LN5o&OQ?JFm6>2zovuXRX?gCN6sQW7riWdi@2kcbwyY2JLw34gW8LUF^yD zzYMeuI(Ezy=ZIlEbd3tVc*?l`}bG)y?^e{?3LihGTZm%i~~^}H~veRt^e`N zHtwk|`{~Sf{%11V_!qkD6|6nxQ~Pa~+kW5Ww%>QT?KfU-`;8a2_4hwY`TX7sTkZE=Zu`9#w(+-Rw!Ghbx!-TTu+@I^<+k5^+849`p8XFX-Z__W zIy?*78#p1}MIS=EU$!9;|KY?N?^)3ndrpjV>>&Ch$VOxz5@SD__|FjId-X$zoa4;l z*^-O-e;Dk2HikrwkAdYQ$H&2P=6IeQ52Ag(#rpJp0(lVe+>0a6C&AXgfi76@r;tYx zf18wu`!rbJ=Tpr;3U;i#^L-ZW<9zx)gUC6bcmv%w|L2g%?>(2#@#D!KbABG%+V(N0 zd-M~C+!v6&iTg>gfjWdVy{vzUh`h35&FJ#^U^e-W+5y$H@#ydhj*88hq&z`k9-&YW0oUesM z+^>Q4Mcl80BhLMgxUYis-JaI;8(_!Ed$zuY{!Qcn;@sAxZCvF3ZE)oFcOi2B4p?8r z{caXFmc{)ZSYOoe`(Ve)JD=~1T;%-&uw0z$KLpE}({uL6Xdlm+zCS|bJZIvl=TE>< zcV5q*!s~mC^)E#K86s!jzDbVArGAcGFy_w@=kgpI?>yG>muMerG3GB2IcpKe`XYDu z)KTYOfuqiRkN+B8U&Q@Q7H7Wb*WZHmMSq_FJ61l%{v9~x$oKH?;q^t_KV)%v&;Ai! zU(E4OV8_bGyZO)HClGzX{R>$BNhId@SFpA0Lp;~9j(-E|vu97CwZ%`P&10N<@$ZQD zqrPX7eIfg_2>c(&5s2d*r{B28(fVSp{|4_*w&VT>F~)b|IrP_&XA$EZuWgLyRsOEC0Flt{w_@oIckddDZc*9tL(^*JZqO zTGtV1AL}yaa750!#8KCg;JmIS*m+$?VOy7cybnvkj?))=el*xU*B7~u0n3MdEciZR z;&bs|_U_UmM@kA2a15+Y|`#8K-h;Jh!( zu&poFdn&f;HO|_#jj{H~r`Glin_nII`|;W5=*4nuIq!*hFXS8_?}cahrF!A-+G)gE zkAF+V9H)cj&9?%r9lkTbu20_pSlhW)p{-TzJGdHs7UH9SP4cTF*V$lmgnbU!`Qn{? z1=tvQ`>uT^Vs7UWTZiB1?vvQ>^SBpYnc~#ebRPOuh>tbBI{DR6!}(xyL=6{!<(_1W zHE4eyVlHbC$Jp0^ttIS>z~(xT-qnl2{*5Ld`*#W0ar&&^zbE7($M=Kt9<9eVCf=zH zU^(BlIAfP$yQlgh&qlC(+!33==9c%pe=XX_HR`(zk#mjW$TJ8oYfOFDwwDpB*0#@% zRma%N!S*B8z8NeRXIGz`<74mabBw`(W+A;nIzyq0m9oT$P`k>!aD%d=6er^QI<>#k{?fKCc ztZVJ3i*GgI%k<x@NR&06K6?5N&ZT|S~8VAeydmb?p*q)JyX@ZTDkK8S=`Sn@zC|b@O?&}oT zn)U5OYe)Vz*m!-@Xl?)Y@q1_n+>dsQK4T`)^5R`+*JIyzgYCQ8yX|K5Er^f)Ta#ZM zbH5F2jT; zT>HS$lRLoL(UW(9`x);TeKF?UV0#nu-wBp8KKdpX^S=ii^WO#5j$A(oj``mU)(+qM zz|LdN{b+5!qxqUF-VZ(rZ9d28xA*RY{ZdDs4}k4`)O$BrZXe@gZ{_m6weQ+-zo@m( zLYrS5`9BCAKqCLWV7Yq`^Xrpy{3^8LSEIjO%;W#(nOFYXpZjzK;J{N3^@3VDj8)LlluSN8KhrHsb&Aa&+#OEl) z+MI7`!p?a#Vm-&A9e*A=#=BqcmCtcVj6XhM$Dfel6T#*>869goA8cIw+vkPJj@mE4 zHlOPnSIkFMigcxgWa>nK7!n>&-@i_&F-YrYmo}7v}|8jKnYcn|S*OguN zHOY?tU4d;*>u{`g)V>Am9Qw?o9sRosZ2ugiFZ%a7us-{@742jH`?fLF3 literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil new file mode 100644 index 0000000000000000000000000000000000000000..6655b7f9ca1149d804b413fba4d5c3313c10bf74 GIT binary patch literal 3160 zcmeHJYfu~46~1B@?MleYf>bP70xw`R7*lrt`A>D2)b_^)A!z8{MMfLjY^Fa%O) zC^W9D;xTudk+rUb5x0u3@;R_@KSin}TX1QKQ1!64$SVVa!9fN*Jyy=K9rsI@}EKdb# zK(dIB1x1yj6eo)Jw|ai}>@e%ULs~tZ+VTEW{D|8lOAVZ@dHar_6+d@CP?M%yKU%xn zmo=3hEw_q(w2g;!Wlc#vbJ{V>B}wVT!l5fh>m(U>bfk2f&X_LUu&evDH(vSk&OBSn zh{cN+UmZG0rAS04ooEm)4~Ql8V{8M%s;z9Z6@l@Z*UUui4ABB>5Zgjy^+C)@VYPmu zZZ&9H4eA(-Bh2tWzmL|a#abAwVTQj*VvTp13;{FTWiG<5AO<n9BkTaXQ2TEF< zNPpcZPd6A-Fwh#vsK^i=&-lmjQFn7zi8H3M<-f zti;Pg^CimaHky*h>mTu27@RT8n8*z8cN^)d4k- z^Cw2wTJ#BRqr6>TsU^L9^ZK}~QTnZ{k=OGRYV6Ji zr?|tC99S&O%hO4vDGgOk2W*Ed2jSu@`hA{Tx!Z3k_f4JLe4)#3N5uSgsgEb?am#+m zTj!{ucUvmo__4D%b(8bCfp~OJc)5J({KDd;x!KdGQLNIeeTP=RqZ-|)D`iE$uOhzd zd)f5r=Up%RYInGLk|&*kO#`hjb;~ym48)sy;n*8Dubpq3ot?WeaRa7UJdN&8RB4W> z^1eT+8G0)x@9gNt@sB!TqNP_ZcD-t{+jsc6mxX%NRZXnDpsg zSaTER?5h(kg4i!e;%_b@Tcl5#7*D)7sHmo+BeZt;G_o~p*y=;JPB-})99S!kH5aiC z2iD?18k<-L#mc7Rh$TNUPsiHn*iV;_?O_*t0B7r@Y<(Zw+~>~xa+&z@ z8u9lrVu?m7!h?#1LB({Wp0y~>Co4kk#Pb&ZhkiqT*pS~Bso$R5(qq`-MGAT*zXvD1 z(^ha9DHxvgg_E(LIFNdhbvh7kFAVDlvxkSUmcDV#f-BZe5jV}m{4l`Mzo{-O z@yRnfsYqDql=DNB67M!J4~p`O)ouPvdBPtb!7CUFk5Z%W@h1X+fj<=+u1;Kz#8Z#;&z4^u__)Vq)qAd2 ztevXfr_4*E|Ky(^ogVBA^q$N8b)fgmPJv%_uxWFBmGjUO^_5l5vZxmb*@=>V`7`B7 zmmfVY|75mDi~L^`zh#S5 zQJ#n=n^vKEy28aVx}>JxR(2^-t<2v;YNG>t$#Jg^fde#;4`=b%I>q&#dx&)`S9k1* r$8KG|ht$mEyAShET-R+g(iJmYzNgnU?OGljzxC7iGTkn2fw24w> uint(2)) + (6u * ix); + _45.Store(out_ix * 4 + 8, 65535u); + _45.Store((out_ix + 1u) * 4 + 8, 65535u); + _45.Store((out_ix + 2u) * 4 + 8, 0u); + _45.Store((out_ix + 3u) * 4 + 8, 0u); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl new file mode 100644 index 0000000..c278c68 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.msl @@ -0,0 +1,68 @@ +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +kernel void main0(device Memory& _45 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.conf.n_path) + { + uint out_ix = (_21.conf.path_bbox_alloc.offset >> uint(2)) + (6u * ix); + _45.memory[out_ix] = 65535u; + _45.memory[out_ix + 1u] = 65535u; + _45.memory[out_ix + 2u] = 0u; + _45.memory[out_ix + 3u] = 0u; + } +} + diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv new file mode 100644 index 0000000000000000000000000000000000000000..58a270e31e9ca8418dc37845943753f44fd4208d GIT binary patch literal 3212 zcmbW2Tay!25XaAMk|lz25fCrzI?F1e;EK8+ih!$FfvQ+o1rI)0HBKgz&6JawR5Huu z%?Ce_pMfvsrOLwpZ)Q3VH*Z($*r_*p=KFN|WOYB~b z$ipN{CP`k-mz=Za;WR48^F-#dK2+Ir?nmjc9EWKhmT8t$dukmwI!@BPak*4l9L|P$ z7}ZG{=iUc$)!;>5YvR)~&P-wPRcO~_Wvay9yQ&(jN+e7yS&?DrhhJ< z`fiU)M?TNzTI;Aax-=hoXYW+!pvEc`r`J8z>}18bMPk?NRK+;aGN038^}Sgap{vjG znsoGd-3z|H&gqQwL@1-Z(6n|(uQxh%*1vRbNXJ?JYu}rVZWxu3ZR{w2&QtgG*ZU)_ zITMT#tKa$Zo;VMK_Y&w^?5k0$_tkb6C4ZMpirEwPuT`>;n`RYvPS{r-$J?#A6BQDJ1((`^?Wa2W2dsAQffHR+zCwy&cH~U$imJN9Ynv6g)eaM8Lkxaj_ zW)FILZABA)y^)ClyC@l3`iy}+{e=#`AusetyV3bHMV?iXpO^5icGEPB^Z8~ zSCt#_=xGLf{}AE#j=Q9X`2FH*a?r@TvZ62eouPN=mmAGJ@H+#)DUa+&HXXk=uwCWE zUGr|K7d4>{X7`rX=zdciYOt=!Uik0w`?kJ}zZ4VQh9->sS??-q{GZb_|J08-7GtZ? zKUECw^Ic6Cbv2v!v?kwQ8=DJSpVx%{(y;fnW-mI<9Ow%(AKO~P?97+7rq;wk#~v8< zwmlzf9cWsNPql_oH}kWjwfRAhADHjwisV4k{CuW0jNUOn*R(c2=SFcb`?p(Bj1~6}E!G`= literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil new file mode 100644 index 0000000000000000000000000000000000000000..3050aa83bdb31b0655c4e6a2c8d80987840b6532 GIT binary patch literal 6336 zcmeHJe_T^Xwx65a+*}|DH;~}vr#C?;SfrPTh`(NM5=20tQK=Qh_9j-1Ezl^|idf&x z4?@&XLq$um?FOZleQRyB)|Req6G#;)RJ3;2sFurABZr(ri)@|P{jm>Bc@L;o-~EL%=*g?xH)#s4IJfkkvL3mU@P!dC&T z&?JDu%9ja#*w}E?)^T#Qo8V8#^UP%>+LAuFDHe#pT?mk~BEOIaBH-LRkgwDGuc;&e zdd*p-Sc6(AfZuYhW4MGO^VGo39)#9z{f< z+a2Zz77}fNIE0j_fnE>_?0AQVBm9&g!wnk!-a~MohNYMgm|sLva)g=EIcv?F)2oAO zJei9);^}jipCf-Ho}Vh{^${s;p#l&Er`2fJKcZW?^X#I^SS7zO^*{UqAlPjhg#m_XGBhr^NS;*qS7(tU zOCne0$3CpcaR|8XPg+wCI?g^eAU%LrkZRrYS^t&d!bRZaD%1;cK%`=mziu;Imu0T2 zHVYvweZwHD=#y=OEePL6i#K`2Rg}2GBilGZY@HxBvSMDK5BugGu`P^v8!O)2hg~Pc zTgD_=8AKS9ybhd{WRkUq(z0B-PZ|-Qz{c;XG#Y($j_iUykX0Dl>wj2N%B8s5=r*vAM9J-LF& zoLS_Wu&Q~&WB2bpu(+m5_~OQPMB5TJ??6q%bkp?E=IQ_4+-|MTSX>phX7OhP8i`WP z6ukaMDho%o#_~!VD2#OqRmu1|2AlT2>51}W9K5AcHP(`t6J(*iC?f6VI;Rx(pw>vc zH0%z8X3$F%NUUIkaG(&-TE->yJFUJW(!g6ra=JTb)r#gTs7l(w+CWJp>c32H?(|uy ztdFl4N@AF29UX&%9mDV6`1tze_k+VXy00hW{txdfEKX1Re)sM2qqla~mp=zhVlz8# zT)TYXbTZDI)~CVg*9)I>HI+48AyA=J>OeIaP9=|-3Ybh)XVtN=HE%+f zM<|}|8Y;Wq*U{BI*nF+4`_gM@UbJTZl=kE8Z_jN%cDy|`P_k2Ut~NdVXj)hAt@?B~ z4^dgbeF`yBs}Ak`$i8iH)u96iBDSxFK&P*Ld-X>5;Puzgrzg&O2_$CXTwHJM1I^8->T)kCX)wEG^9faQ^WOq5)Oo=X`BM|>_OWZnIG%99~?GY~M(l2le7xZj( zZ?=eQaB+2Boy8*FW)W`LT4$l^qI=}Bn;zK!9bQR?e>EfCm`msSu_J@A#M?yC2 zmkoHrzc+;6rR~-(Wa?FhSJYvQ=Lwi*z(xvyV%R4QiFSOM`Ch#9oYfzl*;E^1*HSw( zCCT=q3rgy?SHsqJ!>S4`k1?S z8m^qQuj4sFY8Ro_pypA!6;by*PL8!c7lVo9??$4LQA9p0i!0=62%Nbt~{EoA@PPP_N01u3X(qxyNmz`1M^a z2^)tRu4LKEd3A>q)tojih`HmwJfm?3 zx(?)Nm*pvV0+!se?9PYEUif9rM+tWi?JbzW=lDzEV>$mvm_+Z*`F_Zs$oaS6+`mN5 zKk~1Y^J(}Y{3p%%f3)VG5$=6XoG=_WcK7)9A5!#QfbhRdzW6=s5dp(fuyCq9LX;UL zVWUsLac*MZoF8%TUm)V%zd#uS;JDrT(#+&5^3EzL9imZCIml84m5XdrEd5MTT&-BoNEJ^pKVypJ0}tnE zm4yh_sMBI4IddA3jkB1GI1^%+naf?D`_wA|5^4VKRJCjkO5wJxvvYxs!uP;*3=**r zU}O8NI#RLSFWyX>Jb=Qn)hVvPdGQW|I1$6(wC9NJ>%Wsfh?Sfm&E%>z;7+NFQD9!| zL2jYsO*%B-L*$+gP-4Y=u9m0*6`4f|mXX}t3m_`vYDX&CiREyiLc`7&zuH-rSIb`F z@Dd|ui@JrVernZ6-iLcg}C#iA^vO)xs+7|Vve%J$k)-N~rm(n)v%X`L*Zh~aQg z0Bh$$qhnh;%sSAxB!#m2e5k9*rVrQl0EA7Te15lLkk2nP?kON(_M+W>GtI^qAIL{;*2oF0Zw$ zuUz@=B>xVOZ6X|!hlwvA{V+9cPdSzok+i9gM>a80v{9xl zB4#a#oOMk0HdrMr^t-e|Xn@OF+J(+VXREFTi>~I(dr45PydO!#q);)1nYEGM5`zv; ztO zS8iC{nqxtj-1GMBxe^IL1E6E~K0FVCmB9WqW1WPkDg%9(wi6wy` z4DOqRXQ#c8@V30fz`QDY5(13m?HLC+HVH5s-~k#}`60lK@IQ?MXeI$RO&#C?dPNlw zK>p$cz==tK5n}>x!K3{UAX`2%4q%!Dh){pW1Nyai@X48yBj z9x}QGZop~G@w58}N^R=Dew8bUf_JHZWy2SF^%1G^5&s@G*v~ubg!0$YPrLS1H!@&! z?FK!X>^(5fIzrqMe_CK>A7YSw`jba9e%I(kwI+5XL(Shc??CJ$Fod{W%MI#@_r@ho zI~c3owIcO-b?R_zk{h3RHiLOPZ{~W{u3c@eUEGM@7SMQ))AN1q3x1U5Rf+C81G*m)ClZv-{bYHt=Krg|H1?!43QF^w!XB z?R3&>2`p-Lz+BT8@*=7LhXkwp2AutcyzdU$zN{szYC(Jk4G7det$52c=$*P*h`kj1 zO=w|U0g5VHca}uLgL$xeFDGD2EdDg>ECYfob7mN+eRCR8K(5rXN!b)qLNlWE2Pa}h z&8FnP$bDz4eIW;YJ6$~yWhcK0!JVV9^`G&GbjKF_5NpA{j96 zrog%iOC;i#- zA%{zhnY^SY7YaBp;&8b)hF|q4k0~tYh1=?R;gh4nvw7j6QQ;w8m`dP8tAO|4p literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl new file mode 100644 index 0000000..986f42b --- /dev/null +++ b/piet-gpu/shader/gen/binning.hlsl @@ -0,0 +1,342 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _81 : register(u0, space0); +ByteAddressBuffer _156 : register(t1, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint bitmaps[8][256]; +groupshared bool sh_alloc_failed; +groupshared uint count[8][256]; +groupshared Alloc sh_chunk_alloc[256]; + +DrawMonoid load_draw_monoid(uint element_ix) +{ + uint base = (_156.Load(44) >> uint(2)) + (4u * element_ix); + uint path_ix = _81.Load(base * 4 + 8); + uint clip_ix = _81.Load((base + 1u) * 4 + 8); + uint scene_offset = _81.Load((base + 2u) * 4 + 8); + uint info_offset = _81.Load((base + 3u) * 4 + 8); + DrawMonoid _190 = { path_ix, clip_ix, scene_offset, info_offset }; + return _190; +} + +float4 load_clip_bbox(uint clip_ix) +{ + uint base = (_156.Load(60) >> uint(2)) + (4u * clip_ix); + float x0 = asfloat(_81.Load(base * 4 + 8)); + float y0 = asfloat(_81.Load((base + 1u) * 4 + 8)); + float x1 = asfloat(_81.Load((base + 2u) * 4 + 8)); + float y1 = asfloat(_81.Load((base + 3u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_156.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_81.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_81.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_81.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_81.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +float4 bbox_intersect(float4 a, float4 b) +{ + return float4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +void store_draw_bbox(uint draw_ix, float4 bbox) +{ + uint base = (_156.Load(64) >> uint(2)) + (4u * draw_ix); + _81.Store(base * 4 + 8, asuint(bbox.x)); + _81.Store((base + 1u) * 4 + 8, asuint(bbox.y)); + _81.Store((base + 2u) * 4 + 8, asuint(bbox.z)); + _81.Store((base + 3u) * 4 + 8, asuint(bbox.w)); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _87; + _81.InterlockedAdd(0, size, _87); + uint offset = _87; + uint _94; + _81.GetDimensions(_94); + _94 = (_94 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_94) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _116; + _81.InterlockedMax(4, 1u, _116); + return r; + } + return r; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _81.Store(offset * 4 + 8, val); +} + +void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.element_ix; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = false; + } + GroupMemoryBarrierWithGroupSync(); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if (element_ix < _156.Load(0)) + { + uint param = element_ix; + DrawMonoid draw_monoid = load_draw_monoid(param); + uint path_ix = draw_monoid.path_ix; + float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + uint clip_ix = draw_monoid.clip_ix; + if (clip_ix > 0u) + { + uint param_1 = clip_ix - 1u; + clip_bbox = load_clip_bbox(param_1); + } + uint param_2 = path_ix; + float4 path_bbox = load_path_bbox(param_2); + float4 param_3 = path_bbox; + float4 param_4 = clip_bbox; + float4 bbox = bbox_intersect(param_3, param_4); + float4 _417 = bbox; + float4 _419 = bbox; + float2 _421 = max(_417.xy, _419.zw); + bbox.z = _421.x; + bbox.w = _421.y; + uint param_5 = element_ix; + float4 param_6 = bbox; + store_draw_bbox(param_5, param_6); + x0 = int(floor(bbox.x * 0.00390625f)); + y0 = int(floor(bbox.y * 0.00390625f)); + x1 = int(ceil(bbox.z * 0.00390625f)); + y1 = int(ceil(bbox.w * 0.00390625f)); + } + uint width_in_bins = ((_156.Load(8) + 16u) - 1u) / 16u; + uint height_in_bins = ((_156.Load(12) + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); + while (y < y1) + { + uint _523; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _523); + x++; + if (x == x1) + { + x = x0; + y++; + } + } + GroupMemoryBarrierWithGroupSync(); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_7 = 0u; + uint param_8 = 0u; + bool param_9 = true; + Alloc chunk_alloc = new_alloc(param_7, param_8, param_9); + if (element_count != 0u) + { + uint param_10 = element_count * 4u; + MallocResult _573 = malloc(param_10); + MallocResult chunk = _573; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = true; + } + } + uint out_ix = (_156.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc _603; + _603.offset = _156.Load(20); + Alloc param_11; + param_11.offset = _603.offset; + uint param_12 = out_ix; + uint param_13 = element_count; + write_mem(param_11, param_12, param_13); + Alloc _615; + _615.offset = _156.Load(20); + Alloc param_14; + param_14.offset = _615.offset; + uint param_15 = out_ix + 1u; + uint param_16 = chunk_alloc.offset; + write_mem(param_14, param_15, param_16); + GroupMemoryBarrierWithGroupSync(); + bool _630; + if (!sh_alloc_failed) + { + _630 = _81.Load(4) != 0u; + } + else + { + _630 = sh_alloc_failed; + } + if (_630) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(countbits(out_mask & (my_mask - 1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + BinInstanceRef _692 = { out_offset }; + BinInstance _694 = { element_ix }; + Alloc param_17 = out_alloc; + BinInstanceRef param_18 = _692; + BinInstance param_19 = _694; + BinInstance_write(param_17, param_18, param_19); + } + x++; + if (x == x1) + { + x = x0; + y++; + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl new file mode 100644 index 0000000..2ee5168 --- /dev/null +++ b/piet-gpu/shader/gen/binning.msl @@ -0,0 +1,347 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = (v_156.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix); + uint path_ix = v_81.memory[base]; + uint clip_ix = v_81.memory[base + 1u]; + uint scene_offset = v_81.memory[base + 2u]; + uint info_offset = v_81.memory[base + 3u]; + return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset }; +} + +static inline __attribute__((always_inline)) +float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = (v_156.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix); + float x0 = as_type(v_81.memory[base]); + float y0 = as_type(v_81.memory[base + 1u]); + float x1 = as_type(v_81.memory[base + 2u]); + float y1 = as_type(v_81.memory[base + 3u]); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = (v_156.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_81.memory[base]) - 32768.0; + float bbox_t = float(v_81.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_81.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_81.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +float4 bbox_intersect(thread const float4& a, thread const float4& b) +{ + return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); +} + +static inline __attribute__((always_inline)) +void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = (v_156.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); + v_81.memory[base] = as_type(bbox.x); + v_81.memory[base + 1u] = as_type(bbox.y); + v_81.memory[base + 2u] = as_type(bbox.z); + v_81.memory[base + 3u] = as_type(bbox.w); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_81, constant uint& v_81BufferSize) +{ + uint _87 = atomic_fetch_add_explicit((device atomic_uint*)&v_81.mem_offset, size, memory_order_relaxed); + uint offset = _87; + MallocResult r; + r.failed = (offset + size) > uint(int((v_81BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _116 = atomic_fetch_max_explicit((device atomic_uint*)&v_81.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_81, constant uint& v_81BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_81.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_81, constant uint& v_81BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.element_ix; + write_mem(param, param_1, param_2, v_81, v_81BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_81 [[buffer(0)]], const device ConfigBuf& v_156 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint bitmaps[8][256]; + threadgroup short sh_alloc_failed; + threadgroup uint count[8][256]; + threadgroup Alloc sh_chunk_alloc[256]; + constant uint& v_81BufferSize = spvBufferSizeConstants[0]; + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = short(false); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if (element_ix < v_156.conf.n_elements) + { + uint param = element_ix; + DrawMonoid draw_monoid = load_draw_monoid(param, v_81, v_81BufferSize, v_156); + uint path_ix = draw_monoid.path_ix; + float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + uint clip_ix = draw_monoid.clip_ix; + if (clip_ix > 0u) + { + uint param_1 = clip_ix - 1u; + clip_bbox = load_clip_bbox(param_1, v_81, v_81BufferSize, v_156); + } + uint param_2 = path_ix; + float4 path_bbox = load_path_bbox(param_2, v_81, v_81BufferSize, v_156); + float4 param_3 = path_bbox; + float4 param_4 = clip_bbox; + float4 bbox = bbox_intersect(param_3, param_4); + float4 _417 = bbox; + float4 _419 = bbox; + float2 _421 = fast::max(_417.xy, _419.zw); + bbox.z = _421.x; + bbox.w = _421.y; + uint param_5 = element_ix; + float4 param_6 = bbox; + store_draw_bbox(param_5, param_6, v_81, v_81BufferSize, v_156); + x0 = int(floor(bbox.x * 0.00390625)); + y0 = int(floor(bbox.y * 0.00390625)); + x1 = int(ceil(bbox.z * 0.00390625)); + y1 = int(ceil(bbox.w * 0.00390625)); + } + uint width_in_bins = ((v_156.conf.width_in_tiles + 16u) - 1u) / 16u; + uint height_in_bins = ((v_156.conf.height_in_tiles + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); + while (y < y1) + { + uint _523 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + x++; + if (x == x1) + { + x = x0; + y++; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_7 = 0u; + uint param_8 = 0u; + bool param_9 = true; + Alloc chunk_alloc = new_alloc(param_7, param_8, param_9); + if (element_count != 0u) + { + uint param_10 = element_count * 4u; + MallocResult _573 = malloc(param_10, v_81, v_81BufferSize); + MallocResult chunk = _573; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = short(true); + } + } + uint out_ix = (v_156.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc param_11; + param_11.offset = v_156.conf.bin_alloc.offset; + uint param_12 = out_ix; + uint param_13 = element_count; + write_mem(param_11, param_12, param_13, v_81, v_81BufferSize); + Alloc param_14; + param_14.offset = v_156.conf.bin_alloc.offset; + uint param_15 = out_ix + 1u; + uint param_16 = chunk_alloc.offset; + write_mem(param_14, param_15, param_16, v_81, v_81BufferSize); + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _630; + if (!bool(sh_alloc_failed)) + { + _630 = v_81.mem_error != 0u; + } + else + { + _630 = bool(sh_alloc_failed); + } + if (_630) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(popcount(out_mask & (my_mask - 1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + Alloc param_17 = out_alloc; + BinInstanceRef param_18 = BinInstanceRef{ out_offset }; + BinInstance param_19 = BinInstance{ element_ix }; + BinInstance_write(param_17, param_18, param_19, v_81, v_81BufferSize); + } + x++; + if (x == x1) + { + x = x0; + y++; + } + } +} + diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv new file mode 100644 index 0000000000000000000000000000000000000000..30eacd6b33c706f9056c8d9762669211821c186c GIT binary patch literal 18536 zcmbW8cYt0+xyDb(?j{D5-XSK`5JCw(1ds$sKohDo1(wZblPuZn#@!7eNC%@LDuN0s zAgCx^Pz1pWqS$-ySnowFSg>4Cf4}pcCz*WeKlix9JkRsK^S(24X6Br;S;wxo9b)quIi-jpe*a@ zAJi}pLpNn3+N*D@2-n{-mj9XRQAh`w=g#Y%d)#qz=N;AEyL@@~$mptpzV3nk!M^T= z{lk5|qy6Xh={LUc>mTeLSh28l=)|M!gM0!xW(^Ds^-_1yOS#w3qD3QpqZP_H`d?Lp zeJi_r^qaC``b>TnOrMFZquO8{-;V0J@R9!W>5XkWm=(cTRk6s zZqGnH2Rndg_YWR3I5OHZ*xPqv-=fwWsC%t#*IDfZFMOH7u3FQtZ=i2k-{5F>|9SRd zt7_`HzTI`-|9$+;gcMQfK|M47ZmDH+b+R6JrS56)6zX9*uD%<1q}DsDY1Bs!_pF>h zG&t1HJ#$T|c`ff5U1DdWd2f3M`j@w~akZaoex$dLg`HV2MptcfNA(Xb8j7it^}ZL{ zz);V^?u8`nURGyyi6WD^v)Yq--FX^U?VmPv@&yZq&TF;sKJJGteKNj*^yuy%9PJw( zVXI9!chP<`E%n%EV9%T!tS`kqa~O5quX@ny<~S5v=E59Dz(+=hhS{qQBF;^Yj%pV6 z+QQt~$Jub5sOml(McwK{M>QXunbY?;`24W z`=stYL9b(YzfXXBiBG52*IY}em-h_!ENl2VZG51OU)aVkUWbpZu7tPNc1LyfI<}7L znl^rI8@~>|pl4*wIdna|J_4v6m)xtItGB?97#dvEzqmD5a<9SeW(BdxohxbP2Wu2%LhbfFJ+k8*C+ZCW9st z`ts4`aB+6&*0!t42ks|^wHs-!I8$0SUwBZEZ;S@@%1>@$L5-B z?YKl-+i#x>^*(Ako|rx#THCI_H?6wZ1J0?>irU7x*t;~=wR5{+?Dd9h1y0%ncb_fV>a$}_e>&@&n_%N!IC|!(rEOYkj@;ZW&F9Wwx4m22pUu%)^OjYltu>Y{ zu=#v$t%cIw>eH6k%`n=tIL8@nTehtloA)p+W7`^=WBE_*eC}ym*fZLbhMs|MMQ^R^ z*(-R|)MHZf>SeuKyO(beaJiRnrLOONbLQL*ukTIGyQ+KO?dP<)?nhtujP9r&L0h{Q z$5x+(pT}F0Illy6)!=WaPjBhpQ?GGH^#pkBdDl_>U0&?%PBvq`BsgH~`v&2oExu_u z2bK11!a1zvlN(OhnopIliQiQn17Eg^%|1NZKiWSu*z)VD-{BSybe}pjymapH(2C{9 z9BKQp)$#cEH?ht%R{!X-p5-Ih#@FwBw3#>5+cR*?;JH*rw54skssa2*mb4adck|-o zybhtS`y3lvor{)vTnWxRo~Pc-<0|zUcUG5z>pjxiv_$QwE<-EN>{ryh^_({EtKhBu z+IryF=BjAT+IZ7CF*>X35~I0kwz+;CpN{Hn@baGC&}P4>jo;D6?^=g=ehz)@ncFgWkDf&L0V;k9ysE~X)o;Mu>yd%}-ZjsTKeV)EJtIr&m|Jkj=U(&< ztr%pP)p52&&wZZ)UTf>D_LeVMF_>e`Ihl@D&iy`ZeBU-cBQffH9n~T0*t@FPaMn(l zhJJNa^U#M@y(ft!H;aPtJ@oUo^fh-H(Jg=$MGOqIbRR8@rMgO7yo0)<#W(RvYUB+w2R<%as_SRbI zN2pr!nK_p=Vq2fz#yx?02Z~qtb}BUA3)H3*n(qT@(+bVLs&Txp$?Nl9 zZNEbExvzFmq4}&=JEYJYr`q9#wsE5!S!h1f^*g%Ie5R|-D>Uaw?f63TnXdNYLi1U! zc3PqN{8l@o(41ejvkPtOMq5~DKBx6tQfnRc^BK72{^Z%LkC*FDZETG@>gRO1{%$~b z{kfK0kMjWD5A(?VJ%fDPhJT6$Cbw^!vf5m4lZkG8@#eH~%}k)*N7H@dN1xArxf{^s9- zy4ClNdXClAw7G_3+Zn7)&3L;|C!X`JeImN+**0ad&wHS0GsYIgo=!c9vL&{CsO23L zefOnS)7SAG*kCos=eifGThtDxwm-H}^PbG1o;GaG(FqiH{}!L~EL^~sdvlb=HI(%=1H|GanU&r87Zp94Od;ymm||GbCiQ{uZC zY=68r*HasRA|vs<6PHE%|EH8O{u?&G`^&tJ>uuEbOU-$byT{bh_Zz{k@66LJV8
n|T@w5iRyS_V(w2EpSgwp~H(+|FrYUD9y(ZszXQ zXj7IBelh&Y7C&|xE!~%|f$L+@W{K~8;HtW0W^z9cci!dZ{5(AO-SIpPpVZjp&%piN zVCKd5M(4$OJRIyjcU~RKtj4CUzub2|{k6|&aMp?W=cD;fp#5|(KlMJE#vOBA_$)BK z``mH>-uf8R%f9~w#Te#vF8v;|4)=SfK=D1bk`1_4Au0 z{?>l8gnL%{%@S^XRU7wvC3gM&UJ1AMTP0k(-zp{dTczZFtAtzMQE=PeRdDTotCaqq zE4cIZWE+2~;I{J{rEKr_Ny+^_DY@Sz;XW_@774fZJEY`(hlKmg_B$lpGvDu!aBIIq zO78bYxcU6v2)FioqvU57Tz|heO1s}1CHH%yhdfe7`rst^L*r*Y0;l z_>OSDGs1l@@H<03i(TPac`Rife@Z4)tsMk=&QkM zWxrktSJ&UOTQ2t8_F0>@uLaxIb(6ll4z8~MRn+p>UJq8gt63vggVnM|t^s>FUTtrn zs5xG7;#>+7>do>*_K z_0}^;yU!*y+nVb}u$nfXL-N?(4t5T;-9#;)!5XpM&ET2Tw$Y}K&o%Y5zXhyz9Cyxl z(072b7@bmPk^MH*Os&U(_pp7DLD^516KEbtmatMAFSRML*1I$nT zdkx!qUhCsnjH5lVzX|qUhJOoO=K40g%=I0(T5^3C%uk)mShm+EYeaiue-G>$%{aaf zcB~o455Q`P{X;N6b!^)@4tOPzQhuX_$m9}3})Lh5n#Qk5eeNIlfYd*f#@%g`y%g?op62B6Qc|4y-AF5 zaJ8Hdq+PVezfV708Z zoxy7H-v#VEWbN$=R?{zYq!$0(z-3>j!rS}0JDPghOaq%EecuD@@4$?uuX)t;bAFv` z=g2xS_5_`t-OEEwXyx(NAB}J1@?Cy=QBq$*gO6{Z9llbuhm~a?Y8%K zDcaJP{lSjs&gMKh0IX&X_rXD6FYmjy11V~bNt`$bgN>83VkTJa7TTKE-?^yy-LNs* zp^(+lA08HYUU>D6r$V%^d2vl-U&h zj49VgyT3;|8f+iDFUIh^JqE7s+`Nd|%ehg1Aw|u(5gT)Wb?treUdCoE&pcYs;=cK9 zd@MNcUGu>izx(nyxb{cspZs{R+G=p_!wK+X$eH_aB3$isw9M5>@b^`I;A&p>a~$=V@Y5*H zfwATKWvty`^JzPaS{_>u*!8IGY-+iCgQ{ARIp|E1LOjB5})iCo4vmiG2% zh2wM`4`Fj{-q!Hta5dNUFtwNKTK#1dHP^M+{<&61z$Z{=e|R59(bRLEtN^R!d6YQT zf8u`Fmvh19b|2_(4*PK)wU_4O!3lwNn^K8tV_W@zr!zsm+xhl!}U?O|MCkd#x|EYZC?R4R-S9C;cD8Q z(^rDa_pn#N^-)iZSA)yKaGC2Z zaC7Aw_^og?_i^&S4Q@MadB?Z`te$&uBUsJxT}LfX&YQr_S$==_cDP#lehb*kzH7Ug zqGsR4iSrJyanjd!!qx0+eBK2w@5in1DHMIR#qZr@dsaS5 z?d4k2_6SAIwI)uSPk_twQ6Af4V9z1v+&+I2u9iM(Q*(@oZ7si;+FaK13mSeAwdeFf z6#uQ(_2PS}mw%6NAblT4y+MKfn=5tS!Teh*<8DOlzu)?9cofAp_S>`SXo~H#=8ggD zsAEtY-QoI3x8JRfWy=C`(#Fpz zxZ@gVc%G#v!F`t6=Zwq0=`#0|WOAIRQuOm&@Ql&!`0R_ew0kL7E&L3yak7`r1nZ-I zB=^MFXHm@0?W)f{?|(N%-F+$VX|%Jcb3QI;^qhgcaO0~xHpjY<;{0SC$@Po9udv%^ z-w75`)boC_xY07Eez;?F{LZyJwxwXltIc_ri=A`hGN_lb@$^iwU_%*{bdw2*OAyb^*a>Ud3SDp+uY|-@CYUMInRNO(N<8* z>)I6?&$#wM`$|ge;;e1wbQ_A-mK4{vg?MK!TMzn`Tw7h+xH377g3DoeCsREJ-q~Md~LRq`)?kO>r#qi zvyHZ_najZ1ydPIkdwD<9FQ=&4M{(XI)spj-U~`7Q3hcOiC-UCCn&Q1%P0`oBsp(gq zdCuKbiq|fb%-yaH-jRArgLkIhjbgucr_Nkm33jeBmsb~D{hBs@ZNpt>+FwI4$6QLr zb5*0~d-An#HOJ#v%&8Xt*MZe?j=UbM=2@iA1nM_XcBR z>^H&lJ$NIy9Mjw3+B3&DgVmhl%-UpjwQ~J_3U0r%ejf+Be$}&8ss5Pf%;KoqoThsJS=9<=&VH_d1B;-f-Q{DDc5x_sJpDiTA6B)HiW{2ew~{^E6n^vL0nDCBDB0mpT6cSF^-7Isb@e zA9kh~Q{VjU$e+OW$t&73;Pf-vpV5rt740wJ#LoTtE1EHlVHb_%;RizE5s=&Z~c*-9VX3dA6yY<1X+&DcW;h zIf-hkDe+kk&3@;+8VesoQO{Y?0k*9+*REqyON=hCTF#Mi@a#Kn@f#1;w!GO}&jG7l z&U=}AZGEuX1d4f_E48$LF4#Qj*Yn_NCl|RlgeSN5wA~1-Ex9)at0ng)V727-zNn@B zX5cdS=5V$0T-ySk+}hK2OR%=&-U_Ui+*^aylG`<)miF6%%iP<+)ylJOdw6ndPuq!L zZOQ$7uv&6Y0;?sr>rKt}-jf}`-iPoV!Ol(iPGI{RzBAYu*-yKGjbS@|wR>+}$J%V? z`L`=r&AlehUh^zCg5q^JC3|gFgWVIeDek)?sk85<62pC#eKHMx1KfSGJJ{OX?vp*> z+Otpg1gn+%WH0zEe6vsXhTj5L&w88=wyieDXdl#)XCJVd@BN+y`-1KJDU{6LerW1G zo9x^EV72V61Hie@#xswae#vtn*gWaiL14AkK56Bi0XMf=avzMQp4>CRYRP>FIJu2! zZZ-Xq`%rM1`!H>k);?+FegWLvYRP>#ntF220;?tWY;bZL)7)zMCHE2FGWU^iwbnjq g4qI!%+BGo|BZ8uJ9kr~5)>`Xa7hBx0W$n)8rR(|EH! zflUk>1{?d#(EX?2Cc!5$xO|tRw7lxeUi4F#Fgp>IB#D z(s8zn*z}uG#RS_0Dgc&49Te(>7&Tc`K4In|2-rsfun^MZ(H0(6wigSD>1=?nPa4{6 zqTqb1)xW`$FSNNL5%F9*pe;i3^QC}{Cfy{%i58iNUYeukG&~~*fXR~xf&Dn^gE3J% zXfn2Ea2k&9GGBVBtI;+gtT69okd$RYQ3po`x^AD@)IGu5MO@s#+Y~FC)3db@s)1u( zJs+qN_?$^Z2c-#ht;snp^n-}hpc`~{!`!Wf1;QXc`1;X9$1TCKJ$utCqfcoq#PS=C zb}Cu~M3;l~?EZ#`b~qbrG>xehjrA2M>t*_Zwa}zQ<9nhXkkk77G&AEP-g2$0JJl0VUn^au*@cn z$;k%2h|`MdGhQa0FRa2fH-b?5j~1lPduTHL*tG+v?MZ|~O?Cm$ORsP+W*PwHi++TD z2ACHMgp(1)63F_fsqkWf_{(hO`?#iW_UI=Ao#kk}9SAaWTAa2F3(dMW~6uy=VysZeO z!}T5s2;VqqtRBYKQ?M6~=G$^PA=xS7Qvd2q^kl-Jqj|fk(dV}JAc{8s%%i&w#+StN zpO63g`5r@kQeL%xY2N!nECd73Q{XDDrBVq@p{=Yk0#$?=mC6VWg-8BIr;!oPQ&MTm zUNLS4sn`IfAba)hwFd?;7sn*JC?hcx}dT8kGjpA$GzTw-)F-53+@vOJxJw2z=-kgtIZ6L3l zUtk@0HNR?w`nuFz{hZQ2CgISQoiPceM}w(yudn!K->nbt-0bbUbsSq23w6&d?tYC* zdo%3}eFmE7nUAb#B(EInZ+>(^;GKsCnyac-_@07>B)eXz?tmM2G`JsTF#y=TEr8#J zgg(+U8o5z~EU_rl8{F0`J-5D&Tiv>yyafXPC=3|W2P`DBqRzQyUv7)sNc&BSnIihp zC0^wcE4rk=&RV16?jpGLsz#lTyHkhO)-~#=MoE8Az+GFwC>>Zu2Y%uU@Tmh93O%zw zltvCoXP?tXZKU^Qo;76-Dl<-@i_e-CH=vpQK2fcPyQ_kmR?)ZvzV(edTcb|Ht&QW> zj{8BgKhZaCAR4!e8+Wue*0&lGzwiZoF%od!8E}_IOT2B7H`^lnC*Q-`$UbSL$1uB3 zi@#@6W_Xnut&{I=cY2F5-GXMexF3dtP8c(W)}cG&8f*29lB>#q@e%wc z74X>z{%2vpm(GAuTi~D6fn&7E;6>&PGweX;w6oa`>&)%cR4rVs!3`fp{xx*}M|JQZ z3D)nKyQ@Bb&n3gk;7T0BpQv4HeWB`EedzeW)eYNslGtUt>zDKroMT#0Bq*9xeE9-1 zLWZBy> z`%kFLDR7jn zyXFRUSczE784@OkU`c~ABJ8wQ`r=*+$BmRmDcnj$@_>wJQnWvmIIz`70{P>r7CKBDSSK z8d^9X7v&m6D;~5IBYaW5VKoJx?2JOigE^vY%mcioifKFi9fRN^wQKbXBwDm4M-;U# zg?X1?chu@mtqyKeswEDopC)$hzgOb>I)3XU}xETOAO8fMvd-) z;>p{FsD774!O>IV;yokMT~2NdU7`ULf$yJRy!-FnpZ>l3G(Aj_W%MN>;j$pCVnqkd z^~>dU@DTO+h+-a62)OHAvz^n-U)Z?$_DQBEVup6Y$^FqFOQZ8BvJj{=Ax_4R!P1cv z+L66$^zYF=V?wqYZ(vAg7ZOCGbRG@GEWS`Q6h!Yn&-hVc=hj-qVn7%bhx|Jzeu}?q zwZ^4W0)IF$M&t^|cR2jC0yJaj9K)eipx5F;;7k|4L-F^i-JDRO07FBCNqK5$79SNr zvx+*m#lXrWe`=t(hV6DKk#I9rz+8l8}Paco3L^~)^t-$@rsVQRXG;t)XR)DI! zQ^fXZBABfaD3=dU5l_qz=V%0`6}~Cr>1m=9O1kqJvL^>}Qk3a)yq$=rc|~p}L>8n5 z)={&Zos}WZm_nwv1tWe=aTRfmAva0_=Ei+I=T&oxg^!7o7$C&Zt+L$qFt1^5ZV1P2 z-eu$WBk^Lp(Kg6eb8<-`Tr%@6@9^!tnpjqVu<3(P3@GTqo$awssczrZ2GDIE%F3OQ z_21u0H8L_NJ7O%^pTwIU#>R;ryRdN zv|lr%Ct4B59^Nuao`1n^d2&lh|RJgaeyZ_gJ_SO@-W~5*>Tqu5KO6QQ|Q; z7oJSvsCh$N89l`)s2V@7m|=Y0yFpk+=XF6BSUCO{nH)o0m+c?G^fW`FUTF(A<8L9R zQL{b8sX1a;4%h!NX+^}Tl-DlSLuZf{7M#_B=^g&2`FF(r_w>zhH^o(8za9ok_`K<@ zTe-N*l=y8)D!cu?AE&&^xuNId4v<W1__o)(w|mKOEa`b6m?bJVY;it$1cW={oFi8yPukm^Q7>-Ci!&Mc4UP^2-vYPl0P8^_Myy|Fym(q}fC-O|*1&|9*lHJqqF_0+V{#_p~$@T%k(J@ULoZSH*YW;h93O2r|BP3`=z{ zdc^+>DFUTO;2qW*+8PVsX6bzm+m1*{k3BB^-rV~ zR*{R6-(!6Zx#HWPl@|8uPkA&)>?DjnA7xsvG8vjUi@o-+YZ`ky2$VbaCW~Q|n#j#k z0rdF7DG@&Nn@N#w>EJB7wLBeNNWc@enT?NyaZ6pZ=0DBTNR)kJd29yj1BH&e`$b-? z%W5cRWH6jK8$nayL=OMQoT6TzJSK$AxeLKl@zR*shmpimo8%A=ngii5*|QN*QH*c+ zqP%8xG_<9;@CNkH zXw`L@-7a3;+1&C0gxYPxEIfRw`T7->yX__mf9q*(8DhCLcUd@{x$h4QCEu93ZDc>K zOl?Y?&h8)C2HKj%k?f5HTU4&Bcpa;x)dkcRCwKF|{8lnnP7-kASRA|pRWf4+8iIx> z44uA=4Senq42!C|$t&`1Ma0eC=^Iz4GEEkf_S8a{;@pDR5SuH}5J46eWkpbx^d%qu zl+Y@s;5-cGMRgtW+CypVJ#3iqM%I!3kJ`?7zDF;VJTlZvf`k>ko|#tR75L zyFlp8<^xm8VVP~pg@)T!8Rkq>M8aznCvyBo#I)ty%+Atylef=*WaboV@VM*`oDxxWa;rf$wXUR+%6r^<2F#YbT&y?J3bTT1iP+}{SuP`3d2 z0$N08UstHV(>$U^zE++55tJHx`KN3-_WMS4o`nD*^Wsl_^awL3-jal zOq)bw{o=faT<*6ccw8D6jolmNMRHga&mzCsUW1ZF3nqCHUL=b}bQ*WsllyEFwgZ3Y zzruR*_V|_GBb5U9$H>z4uD?1h3KU*Bb#XTzIT3O@ux;tr9ZO%U=69uZ2U^9+aF39e zvwWW{LcSv=+)F2vnQwx}&wn$clsI>3SIGF!MwWj2$ROW2#@BHj%yHCO2#Rn=^j)CYNg9%0&!X_h o@h4c}5r0;ALJEaV-K=o>v#jv_U??mtV}(zwq^5qRfFl6#H*als{r~^~ literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl new file mode 100644 index 0000000..ed45bf1 --- /dev/null +++ b/piet-gpu/shader/gen/clip_leaf.hlsl @@ -0,0 +1,371 @@ +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Bic _393 = { 0u, 0u }; + +ByteAddressBuffer _80 : register(t1, space0); +RWByteAddressBuffer _96 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Bic sh_bic[510]; +groupshared uint sh_stack[256]; +groupshared float4 sh_stack_bbox[256]; +groupshared uint sh_link[256]; +groupshared float4 sh_bbox[256]; + +Bic load_bic(uint ix) +{ + uint base = (_80.Load(52) >> uint(2)) + (2u * ix); + Bic _286 = { _96.Load(base * 4 + 8), _96.Load((base + 1u) * 4 + 8) }; + return _286; +} + +Bic bic_combine(Bic x, Bic y) +{ + uint m = min(x.b, y.a); + Bic _72 = { (x.a + y.a) - m, (x.b + y.b) - m }; + return _72; +} + +ClipEl load_clip_el(uint ix) +{ + uint base = (_80.Load(56) >> uint(2)) + (5u * ix); + uint parent_ix = _96.Load(base * 4 + 8); + float x0 = asfloat(_96.Load((base + 1u) * 4 + 8)); + float y0 = asfloat(_96.Load((base + 2u) * 4 + 8)); + float x1 = asfloat(_96.Load((base + 3u) * 4 + 8)); + float y1 = asfloat(_96.Load((base + 4u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + ClipEl _335 = { parent_ix, bbox }; + return _335; +} + +float4 bbox_intersect(float4 a, float4 b) +{ + return float4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +uint load_path_ix(uint ix) +{ + if (ix < _80.Load(80)) + { + return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8); + } + else + { + return 2147483648u; + } +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_80.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_96.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_96.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_96.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_96.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +uint search_link(inout Bic bic) +{ + uint ix = gl_LocalInvocationID.x; + uint j = 0u; + while (j < 8u) + { + uint base = 512u - (2u << (8u - j)); + if (((ix >> j) & 1u) != 0u) + { + Bic param = sh_bic[(base + (ix >> j)) - 1u]; + Bic param_1 = bic; + Bic test = bic_combine(param, param_1); + if (test.b > 0u) + { + break; + } + bic = test; + ix -= (1u << j); + } + j++; + } + if (ix > 0u) + { + while (j > 0u) + { + j--; + uint base_1 = 512u - (2u << (8u - j)); + Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; + Bic param_3 = bic; + Bic test_1 = bic_combine(param_2, param_3); + if (test_1.b == 0u) + { + bic = test_1; + ix -= (1u << j); + } + } + } + if (ix > 0u) + { + return ix - 1u; + } + else + { + return 4294967295u - bic.a; + } +} + +void store_clip_bbox(uint ix, float4 bbox) +{ + uint base = (_80.Load(60) >> uint(2)) + (4u * ix); + _96.Store(base * 4 + 8, asuint(bbox.x)); + _96.Store((base + 1u) * 4 + 8, asuint(bbox.y)); + _96.Store((base + 2u) * 4 + 8, asuint(bbox.z)); + _96.Store((base + 3u) * 4 + 8, asuint(bbox.w)); +} + +void comp_main() +{ + uint th = gl_LocalInvocationID.x; + Bic bic = _393; + if (th < gl_WorkGroupID.x) + { + uint param = th; + bic = load_bic(param); + } + sh_bic[th] = bic; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[th + (1u << i)]; + Bic param_1 = bic; + Bic param_2 = other; + bic = bic_combine(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_bic[th] = bic; + } + GroupMemoryBarrierWithGroupSync(); + uint stack_size = sh_bic[0].b; + uint sp = 255u - th; + uint ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = ix + (128u >> i_1); + if (sp < sh_bic[probe].b) + { + ix = probe; + } + } + uint b = sh_bic[ix].b; + float4 bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + if (sp < b) + { + uint param_3 = (((ix * 256u) + b) - sp) - 1u; + ClipEl el = load_clip_el(param_3); + sh_stack[th] = el.parent_ix; + bbox = el.bbox; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + sh_stack_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + if (th >= (1u << i_2)) + { + float4 param_4 = sh_stack_bbox[th - (1u << i_2)]; + float4 param_5 = bbox; + bbox = bbox_intersect(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + } + sh_stack_bbox[th] = bbox; + uint param_6 = gl_GlobalInvocationID.x; + uint inp = load_path_ix(param_6); + bool is_push = int(inp) >= 0; + Bic _559 = { 1u - uint(is_push), uint(is_push) }; + bic = _559; + sh_bic[th] = bic; + if (is_push) + { + uint param_7 = inp; + bbox = load_path_bbox(param_7); + } + else + { + bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + } + uint inbase = 0u; + for (uint i_3 = 0u; i_3 < 7u; i_3++) + { + uint outbase = 512u - (1u << (8u - i_3)); + GroupMemoryBarrierWithGroupSync(); + if (th < (1u << (7u - i_3))) + { + Bic param_8 = sh_bic[inbase + (th * 2u)]; + Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_8, param_9); + } + inbase = outbase; + } + GroupMemoryBarrierWithGroupSync(); + bic = _393; + Bic param_10 = bic; + uint _618 = search_link(param_10); + bic = param_10; + uint link = _618; + sh_link[th] = link; + GroupMemoryBarrierWithGroupSync(); + uint grandparent; + if (int(link) >= 0) + { + grandparent = sh_link[link]; + } + else + { + grandparent = link - 1u; + } + uint parent; + if (int(link) >= 0) + { + parent = (gl_WorkGroupID.x * 256u) + link; + } + else + { + if (int(link + stack_size) >= 0) + { + parent = sh_stack[256u + link]; + } + else + { + parent = 4294967295u; + } + } + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + if (i_4 != 0u) + { + sh_link[th] = link; + } + sh_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + if (int(link) >= 0) + { + float4 param_11 = sh_bbox[link]; + float4 param_12 = bbox; + bbox = bbox_intersect(param_11, param_12); + link = sh_link[link]; + } + GroupMemoryBarrierWithGroupSync(); + } + if (int(link + stack_size) >= 0) + { + float4 param_13 = sh_stack_bbox[256u + link]; + float4 param_14 = bbox; + bbox = bbox_intersect(param_13, param_14); + } + sh_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + uint path_ix = inp; + bool _717 = !is_push; + bool _725; + if (_717) + { + _725 = gl_GlobalInvocationID.x < _80.Load(80); + } + else + { + _725 = _717; + } + if (_725) + { + uint param_15 = parent; + path_ix = load_path_ix(param_15); + uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp)); + _96.Store(drawmonoid_out_base * 4 + 8, path_ix); + if (int(grandparent) >= 0) + { + bbox = sh_bbox[grandparent]; + } + else + { + if (int(grandparent + stack_size) >= 0) + { + bbox = sh_stack_bbox[256u + grandparent]; + } + else + { + bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + } + } + } + uint param_16 = gl_GlobalInvocationID.x; + float4 param_17 = bbox; + store_clip_bbox(param_16, param_17); +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl new file mode 100644 index 0000000..5f5e0a7 --- /dev/null +++ b/piet-gpu/shader/gen/clip_leaf.msl @@ -0,0 +1,370 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Bic load_bic(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); + return Bic{ v_96.memory[base], v_96.memory[base + 1u] }; +} + +static inline __attribute__((always_inline)) +Bic bic_combine(thread const Bic& x, thread const Bic& y) +{ + uint m = min(x.b, y.a); + return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; +} + +static inline __attribute__((always_inline)) +ClipEl load_clip_el(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); + uint parent_ix = v_96.memory[base]; + float x0 = as_type(v_96.memory[base + 1u]); + float y0 = as_type(v_96.memory[base + 2u]); + float x1 = as_type(v_96.memory[base + 3u]); + float y1 = as_type(v_96.memory[base + 4u]); + float4 bbox = float4(x0, y0, x1, y1); + return ClipEl{ parent_ix, bbox }; +} + +static inline __attribute__((always_inline)) +float4 bbox_intersect(thread const float4& a, thread const float4& b) +{ + return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); +} + +static inline __attribute__((always_inline)) +uint load_path_ix(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + if (ix < v_80.conf.n_clip) + { + return v_96.memory[(v_80.conf.clip_alloc.offset >> uint(2)) + ix]; + } + else + { + return 2147483648u; + } +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_96.memory[base]) - 32768.0; + float bbox_t = float(v_96.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_96.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_96.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[510]) +{ + uint ix = gl_LocalInvocationID.x; + uint j = 0u; + while (j < 8u) + { + uint base = 512u - (2u << (8u - j)); + if (((ix >> j) & 1u) != 0u) + { + Bic param = sh_bic[(base + (ix >> j)) - 1u]; + Bic param_1 = bic; + Bic test = bic_combine(param, param_1); + if (test.b > 0u) + { + break; + } + bic = test; + ix -= (1u << j); + } + j++; + } + if (ix > 0u) + { + while (j > 0u) + { + j--; + uint base_1 = 512u - (2u << (8u - j)); + Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; + Bic param_3 = bic; + Bic test_1 = bic_combine(param_2, param_3); + if (test_1.b == 0u) + { + bic = test_1; + ix -= (1u << j); + } + } + } + if (ix > 0u) + { + return ix - 1u; + } + else + { + return 4294967295u - bic.a; + } +} + +static inline __attribute__((always_inline)) +void store_clip_bbox(thread const uint& ix, thread const float4& bbox, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * ix); + v_96.memory[base] = as_type(bbox.x); + v_96.memory[base + 1u] = as_type(bbox.y); + v_96.memory[base + 2u] = as_type(bbox.z); + v_96.memory[base + 3u] = as_type(bbox.w); +} + +kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_80 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + threadgroup Bic sh_bic[510]; + threadgroup uint sh_stack[256]; + threadgroup float4 sh_stack_bbox[256]; + threadgroup uint sh_link[256]; + threadgroup float4 sh_bbox[256]; + uint th = gl_LocalInvocationID.x; + Bic bic = Bic{ 0u, 0u }; + if (th < gl_WorkGroupID.x) + { + uint param = th; + bic = load_bic(param, v_80, v_96); + } + sh_bic[th] = bic; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[th + (1u << i)]; + Bic param_1 = bic; + Bic param_2 = other; + bic = bic_combine(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_bic[th] = bic; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint stack_size = sh_bic[0].b; + uint sp = 255u - th; + uint ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = ix + (128u >> i_1); + if (sp < sh_bic[probe].b) + { + ix = probe; + } + } + uint b = sh_bic[ix].b; + float4 bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + if (sp < b) + { + uint param_3 = (((ix * 256u) + b) - sp) - 1u; + ClipEl el = load_clip_el(param_3, v_80, v_96); + sh_stack[th] = el.parent_ix; + bbox = el.bbox; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + sh_stack_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th >= (1u << i_2)) + { + float4 param_4 = sh_stack_bbox[th - (1u << i_2)]; + float4 param_5 = bbox; + bbox = bbox_intersect(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + sh_stack_bbox[th] = bbox; + uint param_6 = gl_GlobalInvocationID.x; + uint inp = load_path_ix(param_6, v_80, v_96); + bool is_push = int(inp) >= 0; + bic = Bic{ 1u - uint(is_push), uint(is_push) }; + sh_bic[th] = bic; + if (is_push) + { + uint param_7 = inp; + bbox = load_path_bbox(param_7, v_80, v_96); + } + else + { + bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + } + uint inbase = 0u; + for (uint i_3 = 0u; i_3 < 7u; i_3++) + { + uint outbase = 512u - (1u << (8u - i_3)); + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th < (1u << (7u - i_3))) + { + Bic param_8 = sh_bic[inbase + (th * 2u)]; + Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_8, param_9); + } + inbase = outbase; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + bic = Bic{ 0u, 0u }; + Bic param_10 = bic; + uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic); + bic = param_10; + uint link = _618; + sh_link[th] = link; + threadgroup_barrier(mem_flags::mem_threadgroup); + uint grandparent; + if (int(link) >= 0) + { + grandparent = sh_link[link]; + } + else + { + grandparent = link - 1u; + } + uint parent; + if (int(link) >= 0) + { + parent = (gl_WorkGroupID.x * 256u) + link; + } + else + { + if (int(link + stack_size) >= 0) + { + parent = sh_stack[256u + link]; + } + else + { + parent = 4294967295u; + } + } + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + if (i_4 != 0u) + { + sh_link[th] = link; + } + sh_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (int(link) >= 0) + { + float4 param_11 = sh_bbox[link]; + float4 param_12 = bbox; + bbox = bbox_intersect(param_11, param_12); + link = sh_link[link]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + if (int(link + stack_size) >= 0) + { + float4 param_13 = sh_stack_bbox[256u + link]; + float4 param_14 = bbox; + bbox = bbox_intersect(param_13, param_14); + } + sh_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = inp; + bool _717 = !is_push; + bool _725; + if (_717) + { + _725 = gl_GlobalInvocationID.x < v_80.conf.n_clip; + } + else + { + _725 = _717; + } + if (_725) + { + uint param_15 = parent; + path_ix = load_path_ix(param_15, v_80, v_96); + uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * (~inp)); + v_96.memory[drawmonoid_out_base] = path_ix; + if (int(grandparent) >= 0) + { + bbox = sh_bbox[grandparent]; + } + else + { + if (int(grandparent + stack_size) >= 0) + { + bbox = sh_stack_bbox[256u + grandparent]; + } + else + { + bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + } + } + } + uint param_16 = gl_GlobalInvocationID.x; + float4 param_17 = bbox; + store_clip_bbox(param_16, param_17, v_80, v_96); +} + diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv new file mode 100644 index 0000000000000000000000000000000000000000..beac64bdaa072ba89007bfba4e7c1aac3ffa7c77 GIT binary patch literal 19240 zcmbW7cVM1X*~Z^C2`y0BWsj6m%LZ98g#s3&KmidDG)dF64Wvm)QlN-ZmV&s&g)#(0 z%61Ei3W6XaZbd)^6-8X2?D_qE&-+~I;Z^_np3n20>$>jy-1phf^MY3BE zuxE6B*X-Ftr%YS4{qDADZ>)x`lKW4APi(9M*Q2X{aI|lDgkeuRY3`1@TdEvuVYlz* zTU*}>uh7_ldZe#sxOcu&cEW$@ceBmINGHfYDcA4^GM&dcQCV4sjK+C1nBA;D06Ivt(jv&V~hCJ6H)qY zKIS*Eu`PIHbZEG*s|uKj9Gjc1$9&rx+kkbTqAzWY?Wq@3xV>=@_3WMzc2jeVGpY9( z7#QlMb}VwAp}BKM`bL{R2gCOr8l2lduQ?BLpFzgew}7b_S*AM=gVkb>^x;m-2@U<- z>nHcmVGr~Vc8&HA^i^$|Hc!+1zW#ahnbR1ZZ`-+_b?@qFb~3tWj!haF@23B9=nF@C z2VzKm_qtO|uVNZ&GZ9ANNq?g_xjj&9 zHgkF>Y3m=HTa3M<8i!|-wQ%&zQ&aOSH`mtOO>K=zb41JfGa0QpZ`noKnqyfLoA+9C zFO>FXpFHEv?P$-uu2?gcwQHMe*lP3ca4i3=-MY1HPS0ph8g|hCL(rT1ditWdV8^89 z)60Ih_WaBe@OUR2M_o-bcY^%w@Pqpn3=MPPxo_k?3;Gr$)tK(tQ12Ta9vTLlPk+w_ zdKJTb-Ebe{dxolf^;sMWCxG^o!?_IQr`Mbc-qyH~x_Q@3tlmrW2D)Yr_4W)LICv7( zX#ddQf&0bxGQjawyY|Ml)a5;K6MVF9gkCiB+(JFVh0b+a<-Hxwz1Xv$w%^slAAy(g z9)p+ho@nuZvW2hAJ@Hp%m6RD zH`^O?rq3)C(IZgTC~s^buB)OzHCm~8lQmoGcIjc zfrm!t_qh+`H@wQXz43)t@s7q#@aFL!=|7d*+rHnT#<>&U3U01*TlGzmJa-#scxbj4 zzJB+^*(z1e--?U(#G~-OdS5?oyRr4`XgmpLy{j(Jyo&V#S~CwjnN1vfb=TB6)<9nt zr@gV(t9V;uBY6MdLgG!VzL%KC83RMJ|6_wXM^j$aw!Jacwj*5&7mduX#l`*G16dwXMki+upj_$wFhmPPP++@5oqo3iTP z8O=7lFq`vt0X+S=yxG254*PRm(|7DV-Ppo!ZQ-}K@VhJS>(+0p_Qrj$vQKC{059HC zOhwn&T)g-y-}c5U1wVjjGo1vUO`26h&3Cm=tHu_oG~eU#BV48V-i_8)Ym*AiHpW|} z(0uQ!tx;&c^VNJmB%bekHNW|ynNO{=(3Y>YO$*KUyMEIO&G)+6R)uC?)wov1a`+xs z+p*Amf2+N|(0pgB?NMlsQ*G};^Bt|ml`@vYceC07h330i&9%vRog=kbh330i?ahVe zJ6Y}MLi0VW=KC)3oL{x>Li4?=HmA^h->S{8wDz%gYSt(3SbcnEa5mN2D%@W6cVi7a zGuXTKe+tFDB!3&VYwrHAZ+Yh?UbNP|mv^mx?gjIzC2u#l%qusqk9qxF=RR1k=HB~q z$E&@QjV*W2$;T5|?i{VoTOKp<{N>kt=TFARadc9AHiWxx)tb7$_!^a_K83NmU&ZQ{ zPTIJ?(r!(nySF+iYhiOw$(^ey{D1e8nr&?Bez2{Y@!bnzb&G8rul+Hfd?RYx7{fk0 zpW2%-+8djK)wJtBjaqw?cT{J{xsW&Wv^QP{?xc*PY)KpW4lR6VxX*@+(S2^r4f*xw z-6JlG_Ww^QWBe~{{zHgwUgz*oYR99td(CInTE=o1+ebmYkPBCQ`z$&`5ENH&ff6 za&Etc_Da$I9{7@)oBMb0X(tTcYdrZwO@7E$^vE;*FkBytHjBCbOl_Y%`>w~o;O4bI z+W$>$U(_9o-0w;?{o7bqx+&U_a#U>2nLN z-QNkZ%l(}YuK!gn+}{eN-QNn~w)eL}$^D&B@>^TDe}gRT{$?n-zZt^4ul?N+?wUVZ zaP#}yA$GaH6T)rpZ-SEhn;=|&e-o74@BVPt)bIXq{r&DQx!?UI_q)I3e*1?zzkd6d z+;9JI@;oAMyFS+0OCHFf&-0}NuUvj_gOYXOQ$^EtuH=p13CHLFD5C0XZAg4>U*)H-M@0Nd)GTDe(wcqPptQWy`Rch z?}w}3NIuWv2f(&de}7+}b`!QS%&$lj|d3bJ-8ijC@yOI+l-ur%pK8w$ zBeAR#>jJQ`xQ>cyfKjo=?YT>`#L2r}fjsx{&(Q6d&!MsqNOu zeG%9gnU9OXYQFQF1No;Y#xl0}A#~r#7VrDSx)hvoxNe_?8~b*~A-@dlJ*B>+)-DIz zPFwo$Ik38Sn``oUY9ITf?MjN8{ShbsQgE67YIvFd8o2q@lmA+Z+Oj6s zgVmjL??8EMH-cR+ZQh6S*uDtXUz>NMd{<_}b^a3A_pfcV>0=yq*DGVR{$F$cWo#L< zF~0&gM#l10uw&8B_;P)W@7{0@tuu#T13PD#pRa?}{QkAQHnnoUxi?z-w+~~y8T}g+ z^ZUJQjO6)dZ8N6t2zhMZ20NxR>pk)vu-^&l`uh%16Z`&f|BG*{v3>h4*s)oA7Ve<_ z9>qudoweOMdG7)nBm40CV6~hD`Rx>A8C#sTKLFd8@E?MWbz41#d%^cm)HAm~0^3fT z{lA-9Eirx!cC6t)0qc{w{i$}wqh8MK&*0iJj-P{#rS5t9CAE*|N!u?dYK}>qe7^#l zFY_#S?&51LcV2C4owoOZ9e>99Yp`1Okv6q*t(@1ie*o-!W~>i_9jkgd*5AOj*~j~- zP4|#oxgCsmJHe2oQR`(8bjE{rWeDC<3`viC@wYv64snwF>DeyTI z?*`x9PlNUGY{usqxVk=`-)AXLQfzBXxjx!`e?JE{M%Kjd-{;}#S%-=_m+ir@G8*;lzft@m8wt$;1@eAlfAw{Lm(O@g~_>erLQZ;#i2ZKusWw=%VldrsR* z6gB%UPOeqJWv*4>Wv*g4YXce31f zne(>}*uO2=Mw>qRs=Iy}hjr^5YIm(2r*+1;F20T{=Vm>)TJ8{SYPQe*ajnySeQcR0 z`?vwTT;mPlj$c3fE7!;Q@mq9ba3`hwe%u7EuKyHjx#!O{-xRzBwYqkHJE)~UuLY}} z#`^kuei~TqhI&724p#F#+TPz5YPNU3YynnVN;dOt308Bh?PJz@t6I~Zwp)YCw%fqf z(smYo@cUs~G}~*p59czUY8k_J;LNH1+r#y7&iqz=9o&0bKikXoNk4V~+Yk5qPSif` zcl8}9yHeZ-;^f=~?7SJz-&bPTT##w#{0;5xkUK z>bXm1fL%9j@jC$Q-=nlSp8crRTF+YY9EdIJ=w5mgyv%zL-2SR3?@X|HwPkDvgN>!l zH9eHt$8l;qgrep+#mPSlY#+l91KXdR^~2%%sK@6Budr~_|J&g1fAgBpGxBz@y6bWrwLJdaVEv61 z{~oZq{$14a{AQR9_BVrZv?We2*f`Fc?d1BpM>GG{nS(iC=U-pvLawiS+qJTeZy(sb zk#WxjtK}Znre^z`7x#AS-ZPhX#d(yBwI4qq?Kv0)`*;qtjZhX- zoLh0?o&+xYbu!$3smJFOu)6(RL@jr&&0##_#`jdP@p3P`1FYsAO?$Pny>Zh1G;lf2 zcf!>&&Ub;;GR`GnAIGWfbjo`u_Dk%1`z}2bd_MDDzDv(SQ_uN58?07-JD&r$KiPZl z2J5Hpc+Q|!b3FFNyynk%&IQ{({5)J2v&~!w-Y~zSfp@9|5apd~(NUUUQf$w&iJC+Vm;sVoB|L0d?MK z9|If5+IQL~s6R>ZvF)d7yLI|>A=ntX_dg9*TS^<}LH==yv5YNF+s}Y)dkuTR@0g3g zYWByoatYYSINC0zs2N9`IG2LUGyPe(^P9Q64D8v_&-sz-CzoFLc=XGFtY+nF7E^Rl0<@s*^BG~VC+i25AU-jhw5;*z2JHHHfp4H>?6|lPP z^_4sC8IN`AylHpeIbQ3;`YPD{k~8`>uv*?X+SF{Hz3jeA`>%uTL-swXU)Gu_#4M(dCD#{^uKkkP+%JG*cIxT(8^Gqvcm3XA{nQhCA8_k=NkAUZnJq%o~m;V+~d+yk`fYrR) zG8adK?RW0fW5E6!DfP_ZTfw%~=3a43YKhSUR?8ha8{ADXkGA;rg0(GVF8uc|b0}(O zG3WBRwbn;%9_LCe?dOBdlYSi!R%`yvqWSJT0p3k9xAwFh0BcL`1z@%09t5i;x9g&o z_6xyf?qRsviIj|c1l&zAxAwFh1#3(0MPRk$J_)Rr-0lIjv_A!0<~|j!wwRLK?*MmG z%&k3bPXlX9?stOKlKWj?wd8icsoCB&Swe9g!p{IZH{oZ3?Qi&5U}MZ;*LzORrYxb@ zPG9Y=t@~J;?esf`qUKo>muGDqxX+pt&)Q__wFf{@`y5I?&IP}hqGs_K zM~UzGU}MDhePA_2f=EV@su`7d_M#(bAA}EW{Gcd zegw@vY(_DrzK7Lk=A&TykB)WUCX;dd3>c#jm^_K&sjCtCQEHTOMp z3HVaVMU;yvu8m{+Eag&)`_%r+#r9o4zbP)KcC6O^-Qjc8S5kbmf1X;qb>dzHHb%bR zuK}wqrT9H1zk*^c+ltfnda!Nt&b$F^JN4Tci~YWl;(eqoeqR7zN73dOoNu+n{UX>g zhJOicp1d=@4Aw{e&N|Om!0P>!{2k=0V0Gu`8S;Fc+Q)He`x-^fafy@fTVVSX{%vsC zpYOoS{@e=JM?LR>+rY-rmT`XpQ5wN4bTf&3x|5J1OeUjeAWV+xNllX>IO1 zdH!~NH+T}oHrn)Y@2ck;<_BQ49Vi*!Jz(`~>C@HJKcuK--hKr3ao)7uOHp&)#O|%^ z5B2o*Ct!2g5Bn@n&Yyyv!|aovft?ri`1~B~yl4J?0anx3nA+6LpLMiO`(J|1ku~`h zSk1B9UYnZjucUT7*7BSs$CY^Zfz6qD`ZZWB^Yj4N$9d9rKSj;BV>ob8FA&Z>fJr z@zMSewf@%0`!LwPChzaTYUVYU{5KS1*;bske*oJybNENFS~&-5@&6OJ^#3z_DMdYN z^(gp6inhf43)nNQ&AEMqS}k$^3RVmM8`!xF|2x>cx7FwQA7Fjdvwr^s8%JC6J`PrQ z@8nGV3+$QFX8Z}%|E8$hw)W_#~*^_;0^z-rEE#`7## z{p$KmJqK3HUU>oR+&+#c~FnfD`4k2^W{rh zO#DQ_HX*?#*rJxe!ap(GvHOr7tEm1(Pge*dfrp9QxctANY?tOnPfeZD$a?MQsn=QZG+@U!c)Fd6Q7Q_opg z6Kq>;uA_ZWON_O^YWWsl8{AC{^Jt6TI$&+#>w;~UJnMngn&+$eduDxjVrfs?4Zzy+ zdv!yw@ztGcdFFQ$`Z1N_vneI>yBXN|eJyq7cM9#C-|#I8ek}SnEquoozFQ05yWo>) zKcnVZzfIs-zm37=`fUo=p7q-dtX8hyYvGO~>o*NPm7<<`-CQ4vHrLNFs%0*>1gmBJ zwgPt((>&VZw>4OsF`O59Y}ZT<~zWpeHSRx^+DEsyU` z;4;t7a5eXA`o9Z2d9%rQRXIHSAd0a<%e0K+zdG>&-EiUrx2~QsFX}g!0l00t! ztC`3BA&>7q;4;s?aJA{yeAOOKzWu<-qdjf+2Wv~7H-gp7zp0{b}qvY1Uu&N zH-YV2_(5P}uXgjepS9U;Ir1C~c5ld6fgb|a$8)*~{7|qy>Yh`1&Z+IU zp!iIqojixrPCawJR^9LPqp5v-M`$~WGMi#P zan9*m!HItixICxF!nNm|9tTz{&*|IX_BZF_?Qs3nGlnj3H$Jx0o-uTTwK-p6`(_{I zy)}0ndjZe8Qa)Z0*uwH@uv$6p z6X52|{WJidPEpTXECAb9n|sPWsAc?vU^U;p-oZm)`|f*QANTh{GLJEjAtG-{gP({Y@YOM6s*>~!O7vjcIN*{gQhz zxXgVjT&;PpHFLiMZf>>YJ`GJhx!)t$jZC?MdNp$;bGwwpCj9JdOYWXbk`g*cL!4 zgR}us9i)#TMd848NEOAU90B{&rLr7%;Gf#R!y>;^1_O~| zkw^xKfrLSl%CtOTsvD2>HXCoI<dmp)A48#^+3iV+xmPuVol3#p&E7`w%agAtzGOu^&{4+NCnTGm$Ve;%adTE|PMz+Pj-ZYg(lj(L8!2b*1tm%~L!v88s#q~$7jSzltugv6pD zZ~@+W^eC8To zTMY{N#A9}hj36cRUqG6RB``X=l zRt;sj>D^39Qdyu=nYru?W|L)bGCNsaPRL8_BL93jB`<$kR$ZxSx18p$KMCy5BEdqG zcfiDjr48mt6KrRpWr1Q@a?!oari7+^^!S|pJu7xLp$8iW5KC`De$URm*;U!9L)m{h zG~jB>S<#fRX~p|AmW%}tvf!7vgEbSF#ZlMb1_h}ds3DVB|K`}ajxBD$r)sS0{SJW; zY$Ii~`Kh6qR1_Nw!U`%Jlif5N%14(Io5 z{i@2;^BlC)s*gj(6S+%_Rofj6n`)|?V8n+VXKv0Y&dalaxie}Tnwp_3X54!+Q5;$2L{fsRJyY_x6W^=-riKZt)W78Sef$d4<2E99yvJL73{Nh zT+8*%C|;BK-i_ilXUrM=*}&Mq@PW5}_RcS!Jn+^^<4{`+oju zPyUE)$qBUJw5Py}=0_*AErMcajUuncw;k4^&l&JJ1w~7iqUDwxx_v_QZ6th+jBk6l zudUlP_tOdL(-`&V5cM&KRt@*+$9whBcs=USpEc+suC%ib{M~?U$*^rncf3A1w7AQ* z*pKFS4K09^PP+3iqWS$pWW=D@5b za_Ge#{HKFA7=k=iC1)kondv|L(Fe_}6HmFQwdiaEod-%vt2($ve!V8Bb|GV!q%9x0yE;QVm4uvL`0kv>p zu#{LTO0-ifXgzQHYTe%_w{C0u%Sy$mk=YW~v8pNT^FuJb6Z;4#pT@r3aO_)QA2Wr0 zk*~3@>R+%g$#e(y<@|TCZvk9quF)UIzGM@`zGTyF?CXTs7b&B~upVLV4g$_klO~YC z__N@h+T7)UOI5*y;Lqw2YqW6acdIVhO1?)d8nOn12jlQZf&%P=MH~L+E%c5b^PbS` ze`?g64QCTqmxw4>q^kycX^(}zS;XS9L&oLZlZibx3CV)&2h5~N_ z8GBnzG^k~mc)J5^>GC_FlcquZNrS^=TP6#pKudzoN$<#{3^+z757WSO9LqdzOr? znkzj4@D~waQJ}5jXXDzU3~dcxsx9vA+F*Hy4{J-l#Jd`Jm(KE~b{pS&z!FZXkeR@Z zi{Novnu`NeY$UH3xLM+ zu-_%=u*U+5R<4dO3j;Eh-p7HPeKN8mRdzq{tjCQ+F-Za}916CZ*R_vzM5(B7(m_6i zkXuqzCME~Im6&JtjG|uhil8k{mIYOdPika+g7$=awV33=fraa<8`EB5OXQil)dt;o z5G$XSHz2yptX;lLS835*6@D0EayaM>-8^%-OG{8OSNlxVKLq~Gn*J1y(N zJT<@t0I3mkM%f7&6V1` z!BK;7b0c98)nu7)9GBFeWhWs@4OeBjLk`S z6uJMkTe41uh36MOqW;5#3d!%x#6t3o>-mI<*+iNU0SHf%M|X>ghlmdyiwt1?t+Vb8 z;;-f+3+> uint(2)) + (2u * ix); + _80.Store(base * 4 + 8, bic.a); + _80.Store((base + 1u) * 4 + 8, bic.b); +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_64.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_80.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_80.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_80.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_80.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +void store_clip_el(uint ix, ClipEl el) +{ + uint base = (_64.Load(56) >> uint(2)) + (5u * ix); + _80.Store(base * 4 + 8, el.parent_ix); + _80.Store((base + 1u) * 4 + 8, asuint(el.bbox.x)); + _80.Store((base + 2u) * 4 + 8, asuint(el.bbox.y)); + _80.Store((base + 3u) * 4 + 8, asuint(el.bbox.z)); + _80.Store((base + 4u) * 4 + 8, asuint(el.bbox.w)); +} + +void comp_main() +{ + uint th = gl_LocalInvocationID.x; + uint inp = _80.Load(((_64.Load(48) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 8); + bool is_push = int(inp) >= 0; + Bic _207 = { 1u - uint(is_push), uint(is_push) }; + Bic bic = _207; + sh_bic[gl_LocalInvocationID.x] = bic; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; + Bic param = bic; + Bic param_1 = other; + bic = bic_combine(param, param_1); + } + GroupMemoryBarrierWithGroupSync(); + sh_bic[th] = bic; + } + if (th == 0u) + { + uint param_2 = gl_WorkGroupID.x; + Bic param_3 = bic; + store_bic(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + uint size = sh_bic[0].b; + bic = _267; + if ((th + 1u) < 256u) + { + bic = sh_bic[th + 1u]; + } + bool _283; + if (is_push) + { + _283 = bic.a == 0u; + } + else + { + _283 = is_push; + } + if (_283) + { + uint local_ix = (size - bic.b) - 1u; + sh_parent[local_ix] = th; + sh_path_ix[local_ix] = inp; + } + GroupMemoryBarrierWithGroupSync(); + float4 bbox; + if (th < size) + { + uint path_ix = sh_path_ix[th]; + uint param_4 = path_ix; + bbox = load_path_bbox(param_4); + } + if (th < size) + { + uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); + ClipEl _331 = { parent_ix, bbox }; + ClipEl el = _331; + uint param_5 = gl_GlobalInvocationID.x; + ClipEl param_6 = el; + store_clip_el(param_5, param_6); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl new file mode 100644 index 0000000..26214f1 --- /dev/null +++ b/piet-gpu/shader/gen/clip_reduce.msl @@ -0,0 +1,177 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Bic bic_combine(thread const Bic& x, thread const Bic& y) +{ + uint m = min(x.b, y.a); + return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; +} + +static inline __attribute__((always_inline)) +void store_bic(thread const uint& ix, thread const Bic& bic, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); + v_80.memory[base] = bic.a; + v_80.memory[base + 1u] = bic.b; +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_80.memory[base]) - 32768.0; + float bbox_t = float(v_80.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_80.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_80.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +void store_clip_el(thread const uint& ix, thread const ClipEl& el, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); + v_80.memory[base] = el.parent_ix; + v_80.memory[base + 1u] = as_type(el.bbox.x); + v_80.memory[base + 2u] = as_type(el.bbox.y); + v_80.memory[base + 3u] = as_type(el.bbox.z); + v_80.memory[base + 4u] = as_type(el.bbox.w); +} + +kernel void main0(device Memory& v_80 [[buffer(0)]], const device ConfigBuf& v_64 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Bic sh_bic[256]; + threadgroup uint sh_parent[256]; + threadgroup uint sh_path_ix[256]; + threadgroup float4 sh_bbox[256]; + uint th = gl_LocalInvocationID.x; + uint inp = v_80.memory[(v_64.conf.clip_alloc.offset >> uint(2)) + gl_GlobalInvocationID.x]; + bool is_push = int(inp) >= 0; + Bic bic = Bic{ 1u - uint(is_push), uint(is_push) }; + sh_bic[gl_LocalInvocationID.x] = bic; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; + Bic param = bic; + Bic param_1 = other; + bic = bic_combine(param, param_1); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_bic[th] = bic; + } + if (th == 0u) + { + uint param_2 = gl_WorkGroupID.x; + Bic param_3 = bic; + store_bic(param_2, param_3, v_64, v_80); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint size = sh_bic[0].b; + bic = Bic{ 0u, 0u }; + if ((th + 1u) < 256u) + { + bic = sh_bic[th + 1u]; + } + bool _283; + if (is_push) + { + _283 = bic.a == 0u; + } + else + { + _283 = is_push; + } + if (_283) + { + uint local_ix = (size - bic.b) - 1u; + sh_parent[local_ix] = th; + sh_path_ix[local_ix] = inp; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + float4 bbox; + if (th < size) + { + uint path_ix = sh_path_ix[th]; + uint param_4 = path_ix; + bbox = load_path_bbox(param_4, v_64, v_80); + } + if (th < size) + { + uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); + ClipEl el = ClipEl{ parent_ix, bbox }; + uint param_5 = gl_GlobalInvocationID.x; + ClipEl param_6 = el; + store_clip_el(param_5, param_6, v_64, v_80); + } +} + diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv new file mode 100644 index 0000000000000000000000000000000000000000..ce0b9bb3caf196d74fc50dd627ba9bbf971c9794 GIT binary patch literal 9696 zcmbW5d4QB<9mfZjT@X<2L*+&h6%Z5!!(#z)L04ICDQg^d2X=CIX4@U`$ja0%t?Vwl z?@}wZw6wAewX*vzJI(I9%)6{UpLgEh@;K^0z4GJv{=UESIo@YpY#F(5bfGY!Ft#wJ zaOQ|Y_Dm{_geerp722|~qw}=RWkdCzWh)O~uESo1mdsC|3BU6{hyS1a`t2TJwc;?}LT3l{HKvBsED zg~`}5x^e;f*uqTop?YnwT%-%#o}!$!rsGFQR(A#%?=0Uw;Pri#fen4mudSf&Gf)~V zSL;PXU|Tajn>XEPZ)N6l85^~`$xtl!CG{IQU4xz5R?@sZ%*H=@m;)|TQ+__UA8~$6 zv#_-^#LhMJqZrrr_0_r=?L*zCwr$%`x!&+O8hw4Ox~;OkF$Z;@DlL`!na0p=-Z@mW zg&iv6PR`K<{XJLbS9)0QO0`(8^p&%ihRu`PTdr*HWj(^^S{dir(Y;t|G#R|-n*$Lv z-VOgY_C1TzkOR2TRqV+z`1uSHnkV*fu8X*K^i0 zSUSJIR;^WfBIEA$@jPd{IXs`n8tfUohke-A$hlY8hU%s6^TN7&Z6}1?qdeSCBkx3P z_LCj+CY$fSwn}wdGWXVO9=-?cg|la!8l3MwDn#n{7U+uqom%TvYq&S~o@)k~4ImHv-`Z=CCOJGOzH zlblaC=iSlwr(M9~`_svoEpzs6jVhdsep0!=Hpu1Txl#A&FZV~OVcvH`zC1Ws8w6WV zf8P!GEQj^F(0#0T7GqW~e;13PlR&%gSZ3$KRA(D9ogve`^V0YIiYMqTg&Wc9z50&H zej~Q`6+3I)rM^wooecF#t-9$rT(GwkZegrc2QuH=!1#9b)wVv#**@;lcc?h9W2iTa zy&FC5#P5QKxOllDGuu7-BF6oZu~zRbyFTjoGjdm!;{JNz2|8~(i%K77)dtV&7_99W zuv_Csz^8Gen)K1=LzRp2d$6@I2c286)YrI@$10uiUJJv!2|% zg!9go+b`j~Z{-#zocFBUl7zEYx#bDxy(+gd;k-}fRwtZysGMsVJv&di_Js4!l;fBU z`%TWdjS1&nsb6Qpxkhqb3FloYcY4BkN6MX@aNdh@=Omo#D|c?fc@N6b#G$y9PFFx-bw17G3_>}?)=qH&2`sk3b*b^#`#pUo_HEQ&SM_p zGY8#sE7#!t+AU<9{Cwu>c@@hm^N8_$Mcj0RS z@pRbwo;T|{zhfDlkKCGEZ_l}y%UX2r1^w6MHuHylL(ZwocOZ`XpUCL?xev3c9p`sW zes-fzdHpvt+K*?`e$+jSa^^SB83<2D)Z2nG0tvkY4uA9ed*`_`e~r<8++Y0{FkXas zcZGf>*fSCNt^#}ahJH2JC-m#Uo-xnd#~G~|^*#Z%H+koy?z<#sKR4&tn5cg%*!wm7 zeb@A#%B=2SH11AD=b;{bd<(4OEbQL~YM0ahJB;=yZ(eo34dk@{FvrG3zI(yG7op#$ zPX6f2TKvi4<;~?hmatmp83*TDDfYAk7{4VK z^PUDb5k2yrp2x_WN8R6X^Bh5Mo(-{&`}7Em`>U_}ej{T4q2H3|cY^Oqbl2zZL^uAP zMAzT6jw&+jJ@&$Rn|G;8P{e+rR5h81=_)z=cWj`7n-JF*D5H6J7HGw9lP zk=s0W+HdcNunXLN-qzf_E?MT$z1=eP-?zMW@z6h4P zn0YN={1PH}J#~EdzMON57>)OA$QiHwD~Q}R^w7rm)tuYHX#bby{eBHBr#)i74o+jg zfi4%ZUBtQ%-$abpZV#SExmd67ft&Z|`*8BsGRF@Pw^V0rJ5hneFa7=7$V+wT!M`w_?5{t4W? zwtt3`kDmSlPJ8+*x;@E9Pk#d&r!Ds2?_l>po4H0a{sWPBAG}x8!}d?Gd#KGjMm=l~ zg7w$-0HgYmoJey%gp5aw(WZ}i_Z&; zJ8}dHVqD~IL6^4=?`ri}=T@-u*59me4PJ@U?0~<+XO_;^$|zzN#Hd1 zWOO-m$KFptw|{N1PE*11(XYDe5j7n3y@R-EVDCZqZ3fuKzT~GPa`q**=3dxmf?WsK zEPCA=PCm}WK43Zb*4mNB@j>1UYwruTj{WFw4f~kQ=wl!H%tGYsLmV~dg58_wYaZCX z(tVr{Cm+8B3&6(72e%L`@4C8X{^rX?oddvfp)UrzFV^)89f- z`YlQPQgpw?^fO*vAJ;AB?bv*VBmXjN_7nOc=&nngzvWLbEAVAp<6#`yN14>mU5-~GVGr0?(k=Dv>%1|G=u_&3dZ^mfF3TAPnC&*RXw?~3;hEVls(pN(Mq zj(2ktxC4=oH9P@)Jfh9B;+*6n$0=aB`1j7K;4Z{E+QRQNu(ko#Vglpoh}>oLr+#M6 zoxzyBlQ|dhXQNvu`g$f0aEowg-EElz(1D1>0?u%T+p9@ZF zZ$+0YAu;c6aMacwu{~gIQM(M5i`v`3a#7neAQ$ny;I#I0(d8;g)IJX!wY5iVA6Q$| z?gz_7?J8I`Lv*n?cuc|BMz*69skdr$Y|jp*{RPHzIoIvH}wwTl8)u z>f2Y;e@||U`tL=Ti~8>aM}6~JUrxWM|9)^<{{!f9ac5cogGki3uc-f_+!pmej4l`T zuK`DW^IBg{zo>sLIIaH?bh+!0nEyw?QQy9z{>Q-DqW<+@xu|~wIO?0%`f~ckd-6Y2 CNZB6% literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil new file mode 100644 index 0000000000000000000000000000000000000000..f71cc0441f5852ebbb2a904e3dc8c03be2439447 GIT binary patch literal 11972 zcmeHtdsq|K+V4#6gb>0d5N;YkkgE;|$W-_+62%BMT>|QTXch>MQdBMXwj|qtO-fKeZJ>B`=9gA+0XgraWZS(_qX1) z*1O)@8VpV33;yhQR=)SKac6zab`JAY?hp(?P}d9yqJn2IxTC zJ1%SiLIX7YwIFW3m;N1pF8Xip!nk-u0qBUn!+(4J1upWx(!f~wT=1I+ZVtFf;KnaK z-Ei{tZ#n_I=N=~q{YMXXS+#WVU!rKIDBq4#Y4Cns3@V-$i zC2az4!U`;l$oq&lp>POV0Q%s1F_Ve;+$dUs0SrUCxDezIXxFEbbXLdLlWHAhL^$fd4RFgNHMTl9?$Vk!>krM&nT*k+WPE2eJhr95X*H%}tP zOdw$r`5ds6RDfAG!~}A5610>+<4}dYFL9ky#!V0V)!YU~ID6AAyAZRPFM=6kJO^i~6 z5zIn;IcbI)gNWzcOOHfu+ekdTp$+C$83rBMw%;w)jb7>Yp|VZ7)hlwNVOr!v7RijH ztmHxuC@EY&8i|*(Hd_WQvne3jJJZBUipRB7ak|)<23@qbFU#jp3ZTU8*Ly}rqFC5k zfwv3uHG=`KWm#v+xRf>MlgJx4GGyW^bwR$uS;pb)K7kqWTBJoDPzBfaCPmzdravq| ze?k}oVJAF2Zm*LelvM~1YBnG=t>dV~96wgxpZV$n!Y^1PgO=ajo&Rjd-D9^~cIni9a>SC|OcNrwkX@I+BjcYa$* zm9`h6Ou6ySB5d|-9@K%&tNh9K5kRnrrmM3zZ8Rf^8AZM2!#mDh&EF4Xc#HW71%LX^ z|I<0aj|bhv<}I(Z!QqJ9$ik{jYIPL0u9jLiz&6cNn|9c?OKRH#+x1KB2H`0qJhq?T z6~9A1=e=iNcI6548Q)+3CMtWzueaAlpg#7U8_yZ0T>v{`YW{_0Y}xbP4$aMx%Ey|U z1E)ygR^GvuZRguA1-6|#-{vY^yo>RD{tV-9JT$j|Etye_i^z^>x({TyZai6fOP(9K z@#K*srbRqU@8Cn{E(OlM?A_Mp%A0~47;XzO;5mzi*>(-vW|+evMFRi4&SE}B-MWRk zv0?+`Ib{4Ri#d)lXEFFwI@ES|s%$e5vKA({=^=(_1R=%5ke)5YntCS=ZHb=f_(w;iyz>qwcBf$R!umk0F<0-A}z z?egF}Vo=YR`DQV7TPAgOW?>Ha#R?^gLW!8V*@e1!f(*odi50G(6>i`Z=2R4Jt&n>B zHpcvIi1}8x``g;-UncM`A_kRp z&IE&glm+z@gPJ-SUG~)N5@N1P;byGR>aKt}F+>^RGJhVTykjxnsF=eF&m>A`Wg=D$ME9{HC%+x{x$@EO+h4K+ z@gAjW(^kT_0Ush4UAXm+faPC1ACju9d@2|ZF!sdlyDM@lmw$eIFFnwjc9bXY7Mn!r zrOuo}!RMdeL_eikzxQ+By;pB$8xyrN&NR6%d4K0Av9y!hQQ$NB)bY|fquV%zikjPp zrZaoB?Qmjf#lFFd3x%D}g7!@n&f|sF5E*lFehP~*I`n(+S16loa<0_O1W=sqamqE8 zi7GS?o#m0oi^99CXT=pFp^o;^jt}`h%dY(~5jG;+=RcuYXCv^#U+;#fXTZ0|ikBz__j~2Id2{M0(iXg@;?QZ@j)fmN~#z@6$44oZ< z$Nw`mh7oQv|E9*sw1r(*=ffb{QJA}df)^F^415`Q4v7WN|J!=PnWk#taTxTpjW(1n zE~qEQOe(n0KtZ0+B9+4-fcC?JbfHCu6^Agu_4%H}W2nS3@m3oPha_66c-jJjAL$;2 zZ~_v`Brel_?C2YGrls|QEWJ(!t+k=71N8#w;5r-nF9f%}f=U~^~~=Zg3|o(%!1;1*JsBOP8e8yy=OV+$nwL3WvW_1!Oz}>7Gd$uKa7XP-|%xn zZz_h{H08Ii-tR5})rLWBAn3a?dsEI<;d;Elh$ArnCnx_;PX7NRC#P_Mt%HNV8Og!K z5FGTUgBU{Ufyz=*dkSzr^DwFTDir4TqgG|10p^(~k7B+>P*oN?uZ$0~;cq3szzX=Tz!@0=h^925+>fXWg1p1t&;=nW+_ai9q94Q8nOe{Iax0+Uq(UnN5 z0(^F$B)C`bPS8)W=sI>=hn35jCi)2^RgANXJoD6U#T*VqlfgNLsl5P30aCC*{sYjY zfu{0$$5hr17X73FEaX|CL|<|DqVOS44Fl-i=?Q`r69j>U0?3Ab%B11@1~ zhA^5tkF!(?Ca0BEBJfDuh8W_qh*Ou52_lRxFv7dRzgEcg+VE;z^G_w8bvui#1u!bn78m znRn2Es)`9#{j5~lek3oN$aDP?mJb)GJXGR?QZLz(f*Y{hx(A9!+QX8Y z@X=^sw4q6Vka}B-?Je?*RW80wh80d*C~lDY963cvXd)L{hNv@ zAX>kC8i5+uq8R;wU_OHdk<{ov_q|i@843wABY>3_Pt0Ei3{{Fi7fAPtq%d%B7nj&6 zKgT#%O+`{AxNkFd4~22*Y-K30#7{iR1%`;tn9Zm`DV;{Oy%y<0Y10MLeKg-m_;J>4 zZ(2Muh|#H5J>JU_n`Bk5=)!`-CogOD&SGbh$^-Tj!Y+sQB?1TC z@Lu8DLnI|xrEgG_^E2p&?b0_X%6lCWcu6REW)8cERy^5%v!hdowR#ucUl&pWavU1G z99+FY^zp4IDg!3(c7k22$G>y*S?TR~*`!|s zJfPoeYEn_L*y?T77rnGfG*3(x9IrN5KdGqj;oWf$7u(cAG|!j7b;|lq zWOu0`l*KSlSW@ce8fTE*YCo6YU9obsRXHvwVlytu+<-CG4q;1V*}&ZVGi|&Od_6Vs zBYDvUMl>f5BuF9y+klT_Y7hA(vd*su>Bj~P>S(~JLZ@-%O9~bl$(9fh5KH=qMp2`V z7Vr&;oK>veBtP5}A(S3&AnzoA1wTu5*P1@6b1PZ!62k~G$({2cKLG>~ufZYQd4(Jv zx$(S3107gj*Pd3b!%68nzse#ktPGz(DY6+dUT9<qQGbw23=yHybjfMm2!*=v>IIMZA4Z9@1N!V z2N360a0UfwS^S5$8PN4e1)Wr}l0%#|v7gOIn>EW4M=Jb$OJl&m^g2{Z4}I>2C}rh$ z>G71XNhc@T9I7jaGJumSiw;Unh$LWUy*HdQBu*8F@8&@+2*<`Qn|8yQkzu7#GGyNJ zrZ)Nqk{jnpJ(eNvf?!|)%`M$l5`)bdKK4-YqC3wCP9!1x{ix36Bu{6Jm}0kqtAMER zhbhGp@`@BvK*~lbPyE2{?;pf?JU9WFxvZ_-&)+2TB03^;6)nS{S2;SoUfgQm zN(BZ9$$bE+Up#59Wk00YTspimQNCTlc3_KpOq)O&<;JgJQPzXBT6Et!Pi+H6PHmy2 zjN#LaZsc*=0Yn+6gA5JoAa``nU@f%MsN)x4#{wG_PZ;`2B@sY@V&oQPoXslh4UBa* z1F3zi>XBa+F(4}26jjTJ9s26uwUrk-#g<~gy*Z9?#1)niobK| z3Z2^3Wry)XFm(tf4H+p7#MJ%_5e4G3!kjMNg1dqRY9gna$jKveUJ*F`1Wr4FQ%&F$ z6FB2AXAtJ}cdmUTSo5$mysp!uq;qme&2n3fU5LgmShGA(v)o_vXohCFr{LkeXwGU48#GqIc;9!gw zi1GX}^$d*XiK*Q&o*Sli!FW?K^?$AM)!(apPML3OJ1$Xm?q|;Tx4k0i?|4n0?`C^a zGOpf0ogZTRr(~eBiuoNrJ?wECk4e%OsY=5G5O;Y>*_3E8@l^(JxpL!wCP1BB({-c2?nBhuEBKve>~g{SuR_W2BdI_2FdnXI@E%&Bq|aaHt!kM_sGPM5(!LUonzxS>cq zXZB?IoAl3D(F3Hbvp@psG)hy=0VcwtzGXFtM~}aYae;2o3rupS77)ZD@vK<*4oYs{ z$mR%n?=#8Ih`NgGh4F{>g~~a2KI(d}&qtpw%*iop{vOXq<#Q8ww^8yxH?o_Ib$QT_ z|5`p<{ZX6O{+4r?_370@@myR^9<7S(Pn27S(>zVK3fj>U*KI;dZ(KYv^*#th|GVyN`qRUYNNaTZRTTomYf>eaw?frc8j^R}WZI|Nn zWRjDn3*O2D9ewb^zsFdF@RP3p1-yh?r1l`%_4AO_CcK2`XYNw`GQN*dBcwm-_2Ajz zh2nhb98QngqtkAHCp;tJ3wiVxMLZ(Np_1S({&}-z7PCcY*wXu(xx11jYT^f`_TrL` z8n>~2CC%(_?=`$C{9Q!kBasp+(az%WL-=vG)SB`I(hG0d3|Ztr7I>4h8M4fQEWZFQ zOI?pJ=0{x?Z1+{ML|j4Abz|VxQ^sJgK@}yP1Uz?}lB(9tVAUqU)mcLdt6>)STK~RG zC`jTN>3OTm1mBvC_S%dpm@il#01wg4%=!guf)vzG32GJumCdOcVD*@e>pQJ!77TA3 zQ!rk$j9)800ETqK6UXBwata8`Ljy&HUDOv(TRWo%G2)j+W4UV;y}v zjxUkT=46Lcx8R98W7=jq$;nDa@mW*&q_6}ZiZDe+9|*5U z9YN(y@&fTXUkGMyfzRBb#`p5nSwA`k9B>p?I??i_^jm+S@D%x6kmblRw5%yZG&hB# zomVEFO}{C7nYoh?M0W~0?-=Azb*9i?MjzErGpHu$<#A}Ab#RAcuy<8fAJ9Qqin23S zjhc&)G2&MYeu|Pf7RsCR5uJlKkx*XfM>Le5a!_jiSX?VaYU;WBbotIF;mJu8`o~O= z?`9xh`a7WZPu#oqDG9>9*d&7ts|c7~G)!5pa6&3p`jg}IyHH;fn^dJP%j_MqNy z&5P-yhZ*R0=t@moRuxp93zrKG%GUy(epe+1<;z^cYnF%Ch$3oi&J>o34|3VMPPb}A zxmAL4bV0djLAfLoFruXP?~6G;c&&Ll@(f9!al7B=*4XFvOlVhI%~qP~c;7`fOcU_j zcM05EV%>l3HBvoLMCfgEW}B%I`RugzKKHA!#*1qrkVplcXoscI2V7~j!q%opT_CIU zS06AUpU2%f|0)oyJ;%pKU#J%FJWD_Uz3!a;oLN=s`BsQU;(_LCH?^)R_@yjTZ2qbb z#1Bzvq)akT-KR`6`fQgjcxN*MUbtD!u0jx6N`lK+|{10`Z4!0NT}}Q=&%50pUK5MP2Jm91&5Jz`c=|ZzGcWdxd0GF$GGLqi9&A!9 z5-Bv)E$O~!L1SM^?8o$2=asR|K7etm>A-fKRbOx|O?4WnPJLM&{jxf%j(MPu9!}Ts zI$g_hYPdDq9cp6TYO-HY#A{q!Ig`#GeOg36C@VLO&#H-cJ9N|AwZf@3U&>VNC$F_2 zCAn0#*6C{Q+ouyF4iXMogPOY|9thT+bPekdov-1Z!lTi}gECzatcf_4>@48=ZVbM@dz|~x4>jm%yt`)TYd!ShkFFc&xj6JC>Zg9@-x0{w2K+hh{O8_0Xs8ExS)+jkwe< zv{6a8*r&@1pIl#%97LeS8Ux{ASFx!ri-~zOAxe;ihgB zcPa6R-FmmOemi|Mf-$=Lo6(5sEQz~L&wHCkBaX{16XciA)L*u)bGQknpiMIFjhBBs zt@8WEdQjt)m#3k#KJ6Xu?cQ~P8a-X$ z+5YA<_iR)9&HDDhI&i$#$&YMo|8N>T>v0uCCVpMSS9*G!=T$YG)IH7Nd(MV2nE1Dnj>apob$z&1 z(O=TtTs#{cn!)*l)g?X%TjqeHd?KEq&RhyNJoS0^#rQQXIcpAbEh%mu{zeksTKUq7 zl>_AEwB7S!4c!twb@}YhrDc0g$$sc-{2{mThnICZCii9Zq)1&#Ce9ox+NuueF$639N^PmPk3RQHUuE@rj`w*!>ho-N8@|{q`$^Ax zu;x;S+fd`gkmutko-KfZ%0)EVTD^_OnE9P|9~ zjpsk!cwPiKo%&Q0F-I2|+77MX&wjJ?#oMLdzFoR*F?IcFay$4epbhH;tmyV{VITR$Ri+mmZ zQXMUp-tPq~s0Qj(Rmm6ZmoGThi55CtgaeD#p-dm!qksWQ$>{yW=wqE8^1ybfu(OCq zb~>d~;xK1-Tf8Fmb9t)k!PHBO^NV>c^t~XvYCnOAPc3aU6*Y|Fy5M!wcY#&a_tJqr zRKy^?KJu-0-?iBHdSK7hfn7R%_+&W5FGc)tePA3Bl$J!^qtk~Vs8z2I^UA9O*>wZ^ z9#)j>f9N3%0{S>*(@dN~eFYMu@$anu#$Jv{oTk&qvdU)z>&b!f8G)zL0t=GM-F+$S zp$eToE@cE+B*XiBgKwk-pGv;p45q8X^>LM$+(=Ar1NxYEMIF?e-NZa)+luR>U7pn_ z&uZ)R5CpeNL0K_<9fdM)oFf`bOQIYB2bo|E(QEQ3FnF|NNv=V<^Q!c)jCAJ}?7YMA ztPEW#w(MB>!(4L4#pxN~3R~fnXQi{eRjV>?B&!cU>ppz!;ocNuTlswMC$31!vi#ed{vAdzE~PFqz&yVP!gXg4T$c@ z1AID}J-Fv8rrt!XeH1Bte_ji_K*IkLGqhGod6$p>h2;(o9{Hr+Y#&zEolBHt zHE+&3y*bO*Z*WLe32LGVK{)Ac`@@pS8n0!|iRY;oGuG4{%+eWj_|+S$W}eEzBZ>5D z9{J3v#T;E}IA+W>(Upcy*>&-0)d$x!QPy1Zi-b&DRy0!B?Uj1p%RXO_rfzV(;J4<2 zMaBh^y%hEWf2WK9_fpCnnnrwUj0#(=wFfyKu1B1P6QI;HG&vxD5wmmN@vM7MzKLg(W!n5)Ot# zaBxF74z@D@V3i{d-VVOdddd%L!KFBOlPLh(FT%kS3jnwv1_vwLaB#8{KIcXpJW!5< zhjH-mGOgrDT=Etzc&2t5IGCux!Q8_**wqYxQ&!?&MKlin2?t|yaPY`<9Q=ut4%GDb G^}hg|0Gb{E literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl new file mode 100644 index 0000000..a7f769f --- /dev/null +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -0,0 +1,1254 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _266 : register(u0, space0); +ByteAddressBuffer _1020 : register(t1, space0); +ByteAddressBuffer _1399 : register(t2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint sh_bitmaps[8][256]; +groupshared Alloc sh_part_elements[256]; +groupshared uint sh_part_count[256]; +groupshared uint sh_elements[256]; +groupshared uint sh_tile_stride[256]; +groupshared uint sh_tile_width[256]; +groupshared uint sh_tile_x0[256]; +groupshared uint sh_tile_y0[256]; +groupshared uint sh_tile_base[256]; +groupshared uint sh_tile_count[256]; + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _343 = { a.offset + offset }; + return _343; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _266.Load(offset * 4 + 8); + return v; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) +{ + BinInstanceRef _361 = { ref.offset + (index * 4u) }; + return _361; +} + +BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + BinInstance s; + s.element_ix = raw0; + return s; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _424 = { raw2 }; + s.tiles = _424; + return s; +} + +void write_tile_alloc(uint el_ix, Alloc a) +{ +} + +Alloc read_tile_alloc(uint el_ix, bool mem_ok) +{ + uint _907; + _266.GetDimensions(_907); + _907 = (_907 - 8) / 4; + uint param = 0u; + uint param_1 = uint(int(_907) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +Tile Tile_read(Alloc a, TileRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + TileSegRef _449 = { raw0 }; + Tile s; + s.tile = _449; + s.backdrop = int(raw1); + return s; +} + +MallocResult malloc(uint size) +{ + uint _272; + _266.InterlockedAdd(0, size, _272); + uint offset = _272; + uint _279; + _266.GetDimensions(_279); + _279 = (_279 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_279) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _301; + _266.InterlockedMax(4, 1u, _301); + return r; + } + return r; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _266.Store(offset * 4 + 8, val); +} + +void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2); +} + +void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 11u; + write_mem(param, param_1, param_2); + CmdJumpRef _900 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdJumpRef param_4 = _900; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5); +} + +bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _928 = malloc(param); + MallocResult new_cmd = _928; + if (new_cmd.failed) + { + return false; + } + CmdJump _938 = { new_cmd.alloc.offset }; + CmdJump jump = _938; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3); + cmd_alloc = new_cmd.alloc; + CmdRef _950 = { cmd_alloc.offset }; + cmd_ref = _950; + cmd_limit = (cmd_alloc.offset + 1024u) - 144u; + return true; +} + +void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2); + CmdFillRef _757 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdFillRef param_4 = _757; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5); +} + +void Cmd_Solid_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2); +} + +void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.half_width); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2); + CmdStrokeRef _775 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdStrokeRef param_4 = _775; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5); +} + +void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) +{ + if (linewidth < 0.0f) + { + if (tile.tile.offset != 0u) + { + CmdFill _973 = { tile.tile.offset, tile.backdrop }; + CmdFill cmd_fill = _973; + Alloc param = alloc; + CmdRef param_1 = cmd_ref; + CmdFill param_2 = cmd_fill; + Cmd_Fill_write(param, param_1, param_2); + cmd_ref.offset += 12u; + } + else + { + Alloc param_3 = alloc; + CmdRef param_4 = cmd_ref; + Cmd_Solid_write(param_3, param_4); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke _1003 = { tile.tile.offset, 0.5f * linewidth }; + CmdStroke cmd_stroke = _1003; + Alloc param_5 = alloc; + CmdRef param_6 = cmd_ref; + CmdStroke param_7 = cmd_stroke; + Cmd_Stroke_write(param_5, param_6, param_7); + cmd_ref.offset += 12u; + } +} + +void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2); +} + +void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2); + CmdColorRef _801 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdColorRef param_4 = _801; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5); +} + +void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.line_x); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.line_y); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.line_c); + write_mem(param_9, param_10, param_11); +} + +void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2); + CmdLinGradRef _819 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdLinGradRef param_4 = _819; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5); +} + +void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.mat.x); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.mat.y); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.mat.z); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.mat.w); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.xlat.x); + write_mem(param_15, param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = asuint(s.xlat.y); + write_mem(param_18, param_19, param_20); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = asuint(s.c1.x); + write_mem(param_21, param_22, param_23); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = asuint(s.c1.y); + write_mem(param_24, param_25, param_26); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = asuint(s.ra); + write_mem(param_27, param_28, param_29); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = asuint(s.roff); + write_mem(param_30, param_31, param_32); +} + +void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2); + CmdRadGradRef _837 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdRadGradRef param_4 = _837; + CmdRadGrad param_5 = s; + CmdRadGrad_write(param_3, param_4, param_5); +} + +void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2); + CmdImageRef _855 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdImageRef param_4 = _855; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5); +} + +void Cmd_BeginClip_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2); +} + +void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.blend; + write_mem(param, param_1, param_2); +} + +void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2); + CmdEndClipRef _881 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdEndClipRef param_4 = _881; + CmdEndClip param_5 = s; + CmdEndClip_write(param_3, param_4, param_5); +} + +void Cmd_End_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2); +} + +void alloc_write(Alloc a, uint offset, Alloc alloc) +{ + Alloc param = a; + uint param_1 = offset >> uint(2); + uint param_2 = alloc.offset; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint width_in_bins = ((_1020.Load(8) + 16u) - 1u) / 16u; + uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1020.Load(0) + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.Load(8)) + bin_tile_x) + tile_x; + Alloc _1085; + _1085.offset = _1020.Load(24); + Alloc param; + param.offset = _1085.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1094 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1094; + uint cmd_limit = (cmd_ref.offset + 1024u) - 144u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + Alloc param_3 = cmd_alloc; + uint param_4 = 0u; + uint param_5 = 8u; + Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; + uint drawmonoid_start = _1020.Load(44) >> uint(2); + uint drawtag_start = _1020.Load(100) >> uint(2); + uint drawdata_start = _1020.Load(104) >> uint(2); + uint drawinfo_start = _1020.Load(68) >> uint(2); + bool mem_ok = _266.Load(4) == 0u; + Alloc param_6; + Alloc param_8; + uint _1331; + uint element_ix; + Alloc param_17; + uint tile_count; + uint _1632; + float linewidth; + CmdLinGrad cmd_lin; + CmdRadGrad cmd_rad; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1383; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1181 = th_ix < 256u; + bool _1189; + if (_1181) + { + _1189 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1189 = _1181; + } + if (_1189) + { + uint in_ix = (_1020.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + Alloc _1206; + _1206.offset = _1020.Load(20); + param_6.offset = _1206.offset; + uint param_7 = in_ix; + count = read_mem(param_6, param_7); + Alloc _1217; + _1217.offset = _1020.Load(20); + param_8.offset = _1217.offset; + uint param_9 = in_ix + 1u; + uint offset = read_mem(param_8, param_9); + uint param_10 = offset; + uint param_11 = count * 4u; + bool param_12 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + GroupMemoryBarrierWithGroupSync(); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + GroupMemoryBarrierWithGroupSync(); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + GroupMemoryBarrierWithGroupSync(); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1331 = sh_part_count[part_ix - 1u]; + } + else + { + _1331 = part_start_ix; + } + ix -= _1331; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef _1350 = { bin_alloc.offset }; + BinInstanceRef inst_ref = _1350; + BinInstanceRef param_13 = inst_ref; + uint param_14 = ix; + Alloc param_15 = bin_alloc; + BinInstanceRef param_16 = BinInstance_index(param_13, param_14); + BinInstance inst = BinInstance_read(param_15, param_16); + sh_elements[th_ix] = inst.element_ix; + } + GroupMemoryBarrierWithGroupSync(); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1373 = (wr_ix - rd_ix) < 256u; + if (_1373) + { + _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + } + else + { + _1383 = _1373; + } + if (_1383) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + tag = _1399.Load((drawtag_start + element_ix) * 4 + 0); + } + switch (tag) + { + case 68u: + case 72u: + case 276u: + case 732u: + case 5u: + case 37u: + { + uint drawmonoid_base = drawmonoid_start + (4u * element_ix); + uint path_ix = _266.Load(drawmonoid_base * 4 + 8); + PathRef _1424 = { _1020.Load(16) + (path_ix * 12u) }; + Alloc _1427; + _1427.offset = _1020.Load(16); + param_17.offset = _1427.offset; + PathRef param_18 = _1424; + Path path = Path_read(param_17, param_18); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_19 = path.tiles.offset; + uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_21 = mem_ok; + Alloc path_alloc = new_alloc(param_19, param_20, param_21); + uint param_22 = th_ix; + Alloc param_23 = path_alloc; + write_tile_alloc(param_22, param_23); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = tile_count; + } + GroupMemoryBarrierWithGroupSync(); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + uint element_ix_1 = sh_elements[el_ix]; + uint tag_1 = _1399.Load((drawtag_start + element_ix_1) * 4 + 0); + if (el_ix > 0u) + { + _1632 = sh_tile_count[el_ix - 1u]; + } + else + { + _1632 = 0u; + } + uint seq_ix = ix_1 - _1632; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if (mem_ok) + { + uint param_24 = el_ix; + bool param_25 = mem_ok; + TileRef _1684 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Alloc param_26 = read_tile_alloc(param_24, param_25); + TileRef param_27 = _1684; + Tile tile = Tile_read(param_26, param_27); + bool is_clip = (tag_1 & 1u) != 0u; + bool is_blend = false; + if (is_clip) + { + uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); + uint scene_offset = _266.Load((drawmonoid_base_1 + 2u) * 4 + 8); + uint dd = drawdata_start + (scene_offset >> uint(2)); + uint blend = _1399.Load(dd * 4 + 0); + is_blend = blend != 32771u; + } + bool _1720 = tile.tile.offset != 0u; + bool _1729; + if (!_1720) + { + _1729 = (tile.backdrop == 0) == is_clip; + } + else + { + _1729 = _1720; + } + include_tile = _1729 || is_blend; + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + uint el_mask = 1u << (el_ix & 31u); + uint _1751; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1751); + } + } + GroupMemoryBarrierWithGroupSync(); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); + uint element_ix_2 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + uint drawtag = _1399.Load((drawtag_start + element_ix_2) * 4 + 0); + if (clip_zero_depth == 0u) + { + uint param_28 = element_ref_ix; + bool param_29 = mem_ok; + TileRef _1828 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_30 = read_tile_alloc(param_28, param_29); + TileRef param_31 = _1828; + Tile tile_1 = Tile_read(param_30, param_31); + uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); + uint scene_offset_1 = _266.Load((drawmonoid_base_2 + 2u) * 4 + 8); + uint info_offset = _266.Load((drawmonoid_base_2 + 3u) * 4 + 8); + uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); + uint di = drawinfo_start + (info_offset >> uint(2)); + switch (drawtag) + { + case 68u: + { + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_32 = cmd_alloc; + CmdRef param_33 = cmd_ref; + uint param_34 = cmd_limit; + bool _1876 = alloc_cmd(param_32, param_33, param_34); + cmd_alloc = param_32; + cmd_ref = param_33; + cmd_limit = param_34; + if (!_1876) + { + break; + } + Alloc param_35 = cmd_alloc; + CmdRef param_36 = cmd_ref; + Tile param_37 = tile_1; + float param_38 = linewidth; + write_fill(param_35, param_36, param_37, param_38); + cmd_ref = param_36; + uint rgba = _1399.Load(dd_1 * 4 + 0); + CmdColor _1899 = { rgba }; + Alloc param_39 = cmd_alloc; + CmdRef param_40 = cmd_ref; + CmdColor param_41 = _1899; + Cmd_Color_write(param_39, param_40, param_41); + cmd_ref.offset += 8u; + break; + } + case 276u: + { + Alloc param_42 = cmd_alloc; + CmdRef param_43 = cmd_ref; + uint param_44 = cmd_limit; + bool _1917 = alloc_cmd(param_42, param_43, param_44); + cmd_alloc = param_42; + cmd_ref = param_43; + cmd_limit = param_44; + if (!_1917) + { + break; + } + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + Tile param_47 = tile_1; + float param_48 = linewidth; + write_fill(param_45, param_46, param_47, param_48); + cmd_ref = param_46; + cmd_lin.index = _1399.Load(dd_1 * 4 + 0); + cmd_lin.line_x = asfloat(_266.Load((di + 1u) * 4 + 8)); + cmd_lin.line_y = asfloat(_266.Load((di + 2u) * 4 + 8)); + cmd_lin.line_c = asfloat(_266.Load((di + 3u) * 4 + 8)); + Alloc param_49 = cmd_alloc; + CmdRef param_50 = cmd_ref; + CmdLinGrad param_51 = cmd_lin; + Cmd_LinGrad_write(param_49, param_50, param_51); + cmd_ref.offset += 20u; + break; + } + case 732u: + { + Alloc param_52 = cmd_alloc; + CmdRef param_53 = cmd_ref; + uint param_54 = cmd_limit; + bool _1981 = alloc_cmd(param_52, param_53, param_54); + cmd_alloc = param_52; + cmd_ref = param_53; + cmd_limit = param_54; + if (!_1981) + { + break; + } + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_55 = cmd_alloc; + CmdRef param_56 = cmd_ref; + Tile param_57 = tile_1; + float param_58 = linewidth; + write_fill(param_55, param_56, param_57, param_58); + cmd_ref = param_56; + cmd_rad.index = _1399.Load(dd_1 * 4 + 0); + cmd_rad.mat = asfloat(uint4(_266.Load((di + 1u) * 4 + 8), _266.Load((di + 2u) * 4 + 8), _266.Load((di + 3u) * 4 + 8), _266.Load((di + 4u) * 4 + 8))); + cmd_rad.xlat = asfloat(uint2(_266.Load((di + 5u) * 4 + 8), _266.Load((di + 6u) * 4 + 8))); + cmd_rad.c1 = asfloat(uint2(_266.Load((di + 7u) * 4 + 8), _266.Load((di + 8u) * 4 + 8))); + cmd_rad.ra = asfloat(_266.Load((di + 9u) * 4 + 8)); + cmd_rad.roff = asfloat(_266.Load((di + 10u) * 4 + 8)); + Alloc param_59 = cmd_alloc; + CmdRef param_60 = cmd_ref; + CmdRadGrad param_61 = cmd_rad; + Cmd_RadGrad_write(param_59, param_60, param_61); + cmd_ref.offset += 48u; + break; + } + case 72u: + { + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + uint param_64 = cmd_limit; + bool _2087 = alloc_cmd(param_62, param_63, param_64); + cmd_alloc = param_62; + cmd_ref = param_63; + cmd_limit = param_64; + if (!_2087) + { + break; + } + Alloc param_65 = cmd_alloc; + CmdRef param_66 = cmd_ref; + Tile param_67 = tile_1; + float param_68 = linewidth; + write_fill(param_65, param_66, param_67, param_68); + cmd_ref = param_66; + uint index = _1399.Load(dd_1 * 4 + 0); + uint raw1 = _1399.Load((dd_1 + 1u) * 4 + 0); + int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + CmdImage _2126 = { index, offset_1 }; + Alloc param_69 = cmd_alloc; + CmdRef param_70 = cmd_ref; + CmdImage param_71 = _2126; + Cmd_Image_write(param_69, param_70, param_71); + cmd_ref.offset += 12u; + break; + } + case 5u: + { + bool _2140 = tile_1.tile.offset == 0u; + bool _2146; + if (_2140) + { + _2146 = tile_1.backdrop == 0; + } + else + { + _2146 = _2140; + } + if (_2146) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + Alloc param_72 = cmd_alloc; + CmdRef param_73 = cmd_ref; + uint param_74 = cmd_limit; + bool _2158 = alloc_cmd(param_72, param_73, param_74); + cmd_alloc = param_72; + cmd_ref = param_73; + cmd_limit = param_74; + if (!_2158) + { + break; + } + Alloc param_75 = cmd_alloc; + CmdRef param_76 = cmd_ref; + Cmd_BeginClip_write(param_75, param_76); + cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); + } + clip_depth++; + break; + } + case 37u: + { + clip_depth--; + Alloc param_77 = cmd_alloc; + CmdRef param_78 = cmd_ref; + uint param_79 = cmd_limit; + bool _2191 = alloc_cmd(param_77, param_78, param_79); + cmd_alloc = param_77; + cmd_ref = param_78; + cmd_limit = param_79; + if (!_2191) + { + break; + } + Alloc param_80 = cmd_alloc; + CmdRef param_81 = cmd_ref; + Tile param_82 = tile_1; + float param_83 = -1.0f; + write_fill(param_80, param_81, param_82, param_83); + cmd_ref = param_81; + uint blend_1 = _1399.Load(dd_1 * 4 + 0); + CmdEndClip _2214 = { blend_1 }; + Alloc param_84 = cmd_alloc; + CmdRef param_85 = cmd_ref; + CmdEndClip param_86 = _2214; + Cmd_EndClip_write(param_84, param_85, param_86); + cmd_ref.offset += 8u; + render_blend_depth--; + break; + } + } + } + else + { + switch (drawtag) + { + case 5u: + { + clip_depth++; + break; + } + case 37u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + GroupMemoryBarrierWithGroupSync(); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2263 = (bin_tile_x + tile_x) < _1020.Load(8); + bool _2272; + if (_2263) + { + _2272 = (bin_tile_y + tile_y) < _1020.Load(12); + } + else + { + _2272 = _2263; + } + if (_2272) + { + Alloc param_87 = cmd_alloc; + CmdRef param_88 = cmd_ref; + Cmd_End_write(param_87, param_88); + if (max_blend_depth > 4u) + { + uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; + uint param_89 = scratch_size; + MallocResult _2293 = malloc(param_89); + MallocResult scratch = _2293; + Alloc param_90 = scratch_alloc; + uint param_91 = scratch_alloc.offset; + Alloc param_92 = scratch.alloc; + alloc_write(param_90, param_91, param_92); + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl new file mode 100644 index 0000000..d84add1 --- /dev/null +++ b/piet-gpu/shader/gen/coarse.msl @@ -0,0 +1,1266 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +// Implementation of the GLSL findLSB() function +template +inline T spvFindLSB(T x) +{ + return select(ctz(x), T(-1), x == T(0)); +} + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_266.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index) +{ + return BinInstanceRef{ ref.offset + (index * 4u) }; +} + +static inline __attribute__((always_inline)) +BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + BinInstance s; + s.element_ix = raw0; + return s; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_266, v_266BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) +{ +} + +static inline __attribute__((always_inline)) +Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint param = 0u; + uint param_1 = uint(int((v_266BufferSize - 8) / 4) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +static inline __attribute__((always_inline)) +Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); + Tile s; + s.tile = TileSegRef{ raw0 }; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint _272 = atomic_fetch_add_explicit((device atomic_uint*)&v_266.mem_offset, size, memory_order_relaxed); + uint offset = _272; + MallocResult r; + r.failed = (offset + size) > uint(int((v_266BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _301 = atomic_fetch_max_explicit((device atomic_uint*)&v_266.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_266.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 11u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_266, constant uint& v_266BufferSize) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _928 = malloc(param, v_266, v_266BufferSize); + MallocResult new_cmd = _928; + if (new_cmd.failed) + { + return false; + } + CmdJump jump = CmdJump{ new_cmd.alloc.offset }; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3, v_266, v_266BufferSize); + cmd_alloc = new_cmd.alloc; + cmd_ref = CmdRef{ cmd_alloc.offset }; + cmd_limit = (cmd_alloc.offset + 1024u) - 144u; + return true; +} + +static inline __attribute__((always_inline)) +void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.half_width); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_266, constant uint& v_266BufferSize) +{ + if (linewidth < 0.0) + { + if (tile.tile.offset != 0u) + { + CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop }; + Alloc param = alloc; + CmdRef param_1 = cmd_ref; + CmdFill param_2 = cmd_fill; + Cmd_Fill_write(param, param_1, param_2, v_266, v_266BufferSize); + cmd_ref.offset += 12u; + } + else + { + Alloc param_3 = alloc; + CmdRef param_4 = cmd_ref; + Cmd_Solid_write(param_3, param_4, v_266, v_266BufferSize); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth }; + Alloc param_5 = alloc; + CmdRef param_6 = cmd_ref; + CmdStroke param_7 = cmd_stroke; + Cmd_Stroke_write(param_5, param_6, param_7, v_266, v_266BufferSize); + cmd_ref.offset += 12u; + } +} + +static inline __attribute__((always_inline)) +void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.line_x); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.line_y); + write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.line_c); + write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.mat.x); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.mat.y); + write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.mat.z); + write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.mat.w); + write_mem(param_12, param_13, param_14, v_266, v_266BufferSize); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.xlat.x); + write_mem(param_15, param_16, param_17, v_266, v_266BufferSize); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = as_type(s.xlat.y); + write_mem(param_18, param_19, param_20, v_266, v_266BufferSize); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = as_type(s.c1.x); + write_mem(param_21, param_22, param_23, v_266, v_266BufferSize); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = as_type(s.c1.y); + write_mem(param_24, param_25, param_26, v_266, v_266BufferSize); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = as_type(s.ra); + write_mem(param_27, param_28, param_29, v_266, v_266BufferSize); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = as_type(s.roff); + write_mem(param_30, param_31, param_32, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u }; + CmdRadGrad param_5 = s; + CmdRadGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.blend; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u }; + CmdEndClip param_5 = s; + CmdEndClip_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void alloc_write(thread const Alloc& a, thread const uint& offset, thread const Alloc& alloc, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = offset >> uint(2); + uint param_2 = alloc.offset; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_266 [[buffer(0)]], const device ConfigBuf& _1020 [[buffer(1)]], const device SceneBuf& _1399 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_bitmaps[8][256]; + threadgroup Alloc sh_part_elements[256]; + threadgroup uint sh_part_count[256]; + threadgroup uint sh_elements[256]; + threadgroup uint sh_tile_stride[256]; + threadgroup uint sh_tile_width[256]; + threadgroup uint sh_tile_x0[256]; + threadgroup uint sh_tile_y0[256]; + threadgroup uint sh_tile_base[256]; + threadgroup uint sh_tile_count[256]; + constant uint& v_266BufferSize = spvBufferSizeConstants[0]; + uint width_in_bins = ((_1020.conf.width_in_tiles + 16u) - 1u) / 16u; + uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1020.conf.n_elements + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.conf.width_in_tiles) + bin_tile_x) + tile_x; + Alloc param; + param.offset = _1020.conf.ptcl_alloc.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint cmd_limit = (cmd_ref.offset + 1024u) - 144u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + Alloc param_3 = cmd_alloc; + uint param_4 = 0u; + uint param_5 = 8u; + Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; + uint drawmonoid_start = _1020.conf.drawmonoid_alloc.offset >> uint(2); + uint drawtag_start = _1020.conf.drawtag_offset >> uint(2); + uint drawdata_start = _1020.conf.drawdata_offset >> uint(2); + uint drawinfo_start = _1020.conf.drawinfo_alloc.offset >> uint(2); + bool mem_ok = v_266.mem_error == 0u; + Alloc param_6; + Alloc param_8; + uint _1331; + uint element_ix; + Alloc param_17; + uint tile_count; + uint _1632; + float linewidth; + CmdLinGrad cmd_lin; + CmdRadGrad cmd_rad; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1383; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1181 = th_ix < 256u; + bool _1189; + if (_1181) + { + _1189 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1189 = _1181; + } + if (_1189) + { + uint in_ix = (_1020.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + param_6.offset = _1020.conf.bin_alloc.offset; + uint param_7 = in_ix; + count = read_mem(param_6, param_7, v_266, v_266BufferSize); + param_8.offset = _1020.conf.bin_alloc.offset; + uint param_9 = in_ix + 1u; + uint offset = read_mem(param_8, param_9, v_266, v_266BufferSize); + uint param_10 = offset; + uint param_11 = count * 4u; + bool param_12 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1331 = sh_part_count[part_ix - 1u]; + } + else + { + _1331 = part_start_ix; + } + ix -= _1331; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; + BinInstanceRef param_13 = inst_ref; + uint param_14 = ix; + Alloc param_15 = bin_alloc; + BinInstanceRef param_16 = BinInstance_index(param_13, param_14); + BinInstance inst = BinInstance_read(param_15, param_16, v_266, v_266BufferSize); + sh_elements[th_ix] = inst.element_ix; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1373 = (wr_ix - rd_ix) < 256u; + if (_1373) + { + _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + } + else + { + _1383 = _1373; + } + if (_1383) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + tag = _1399.scene[drawtag_start + element_ix]; + } + switch (tag) + { + case 68u: + case 72u: + case 276u: + case 732u: + case 5u: + case 37u: + { + uint drawmonoid_base = drawmonoid_start + (4u * element_ix); + uint path_ix = v_266.memory[drawmonoid_base]; + param_17.offset = _1020.conf.tile_alloc.offset; + PathRef param_18 = PathRef{ _1020.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_17, param_18, v_266, v_266BufferSize); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_19 = path.tiles.offset; + uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_21 = mem_ok; + Alloc path_alloc = new_alloc(param_19, param_20, param_21); + uint param_22 = th_ix; + Alloc param_23 = path_alloc; + write_tile_alloc(param_22, param_23); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = tile_count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + uint element_ix_1 = sh_elements[el_ix]; + uint tag_1 = _1399.scene[drawtag_start + element_ix_1]; + if (el_ix > 0u) + { + _1632 = sh_tile_count[el_ix - 1u]; + } + else + { + _1632 = 0u; + } + uint seq_ix = ix_1 - _1632; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if (mem_ok) + { + uint param_24 = el_ix; + bool param_25 = mem_ok; + Alloc param_26 = read_tile_alloc(param_24, param_25, v_266, v_266BufferSize); + TileRef param_27 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Tile tile = Tile_read(param_26, param_27, v_266, v_266BufferSize); + bool is_clip = (tag_1 & 1u) != 0u; + bool is_blend = false; + if (is_clip) + { + uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); + uint scene_offset = v_266.memory[drawmonoid_base_1 + 2u]; + uint dd = drawdata_start + (scene_offset >> uint(2)); + uint blend = _1399.scene[dd]; + is_blend = blend != 32771u; + } + bool _1720 = tile.tile.offset != 0u; + bool _1729; + if (!_1720) + { + _1729 = (tile.backdrop == 0) == is_clip; + } + else + { + _1729 = _1720; + } + include_tile = _1729 || is_blend; + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + uint el_mask = 1u << (el_ix & 31u); + uint _1751 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); + uint element_ix_2 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + uint drawtag = _1399.scene[drawtag_start + element_ix_2]; + if (clip_zero_depth == 0u) + { + uint param_28 = element_ref_ix; + bool param_29 = mem_ok; + Alloc param_30 = read_tile_alloc(param_28, param_29, v_266, v_266BufferSize); + TileRef param_31 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Tile tile_1 = Tile_read(param_30, param_31, v_266, v_266BufferSize); + uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); + uint scene_offset_1 = v_266.memory[drawmonoid_base_2 + 2u]; + uint info_offset = v_266.memory[drawmonoid_base_2 + 3u]; + uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); + uint di = drawinfo_start + (info_offset >> uint(2)); + switch (drawtag) + { + case 68u: + { + linewidth = as_type(v_266.memory[di]); + Alloc param_32 = cmd_alloc; + CmdRef param_33 = cmd_ref; + uint param_34 = cmd_limit; + bool _1876 = alloc_cmd(param_32, param_33, param_34, v_266, v_266BufferSize); + cmd_alloc = param_32; + cmd_ref = param_33; + cmd_limit = param_34; + if (!_1876) + { + break; + } + Alloc param_35 = cmd_alloc; + CmdRef param_36 = cmd_ref; + Tile param_37 = tile_1; + float param_38 = linewidth; + write_fill(param_35, param_36, param_37, param_38, v_266, v_266BufferSize); + cmd_ref = param_36; + uint rgba = _1399.scene[dd_1]; + Alloc param_39 = cmd_alloc; + CmdRef param_40 = cmd_ref; + CmdColor param_41 = CmdColor{ rgba }; + Cmd_Color_write(param_39, param_40, param_41, v_266, v_266BufferSize); + cmd_ref.offset += 8u; + break; + } + case 276u: + { + Alloc param_42 = cmd_alloc; + CmdRef param_43 = cmd_ref; + uint param_44 = cmd_limit; + bool _1917 = alloc_cmd(param_42, param_43, param_44, v_266, v_266BufferSize); + cmd_alloc = param_42; + cmd_ref = param_43; + cmd_limit = param_44; + if (!_1917) + { + break; + } + linewidth = as_type(v_266.memory[di]); + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + Tile param_47 = tile_1; + float param_48 = linewidth; + write_fill(param_45, param_46, param_47, param_48, v_266, v_266BufferSize); + cmd_ref = param_46; + cmd_lin.index = _1399.scene[dd_1]; + cmd_lin.line_x = as_type(v_266.memory[di + 1u]); + cmd_lin.line_y = as_type(v_266.memory[di + 2u]); + cmd_lin.line_c = as_type(v_266.memory[di + 3u]); + Alloc param_49 = cmd_alloc; + CmdRef param_50 = cmd_ref; + CmdLinGrad param_51 = cmd_lin; + Cmd_LinGrad_write(param_49, param_50, param_51, v_266, v_266BufferSize); + cmd_ref.offset += 20u; + break; + } + case 732u: + { + Alloc param_52 = cmd_alloc; + CmdRef param_53 = cmd_ref; + uint param_54 = cmd_limit; + bool _1981 = alloc_cmd(param_52, param_53, param_54, v_266, v_266BufferSize); + cmd_alloc = param_52; + cmd_ref = param_53; + cmd_limit = param_54; + if (!_1981) + { + break; + } + linewidth = as_type(v_266.memory[di]); + Alloc param_55 = cmd_alloc; + CmdRef param_56 = cmd_ref; + Tile param_57 = tile_1; + float param_58 = linewidth; + write_fill(param_55, param_56, param_57, param_58, v_266, v_266BufferSize); + cmd_ref = param_56; + cmd_rad.index = _1399.scene[dd_1]; + cmd_rad.mat = as_type(uint4(v_266.memory[di + 1u], v_266.memory[di + 2u], v_266.memory[di + 3u], v_266.memory[di + 4u])); + cmd_rad.xlat = as_type(uint2(v_266.memory[di + 5u], v_266.memory[di + 6u])); + cmd_rad.c1 = as_type(uint2(v_266.memory[di + 7u], v_266.memory[di + 8u])); + cmd_rad.ra = as_type(v_266.memory[di + 9u]); + cmd_rad.roff = as_type(v_266.memory[di + 10u]); + Alloc param_59 = cmd_alloc; + CmdRef param_60 = cmd_ref; + CmdRadGrad param_61 = cmd_rad; + Cmd_RadGrad_write(param_59, param_60, param_61, v_266, v_266BufferSize); + cmd_ref.offset += 48u; + break; + } + case 72u: + { + linewidth = as_type(v_266.memory[di]); + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + uint param_64 = cmd_limit; + bool _2087 = alloc_cmd(param_62, param_63, param_64, v_266, v_266BufferSize); + cmd_alloc = param_62; + cmd_ref = param_63; + cmd_limit = param_64; + if (!_2087) + { + break; + } + Alloc param_65 = cmd_alloc; + CmdRef param_66 = cmd_ref; + Tile param_67 = tile_1; + float param_68 = linewidth; + write_fill(param_65, param_66, param_67, param_68, v_266, v_266BufferSize); + cmd_ref = param_66; + uint index = _1399.scene[dd_1]; + uint raw1 = _1399.scene[dd_1 + 1u]; + int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + Alloc param_69 = cmd_alloc; + CmdRef param_70 = cmd_ref; + CmdImage param_71 = CmdImage{ index, offset_1 }; + Cmd_Image_write(param_69, param_70, param_71, v_266, v_266BufferSize); + cmd_ref.offset += 12u; + break; + } + case 5u: + { + bool _2140 = tile_1.tile.offset == 0u; + bool _2146; + if (_2140) + { + _2146 = tile_1.backdrop == 0; + } + else + { + _2146 = _2140; + } + if (_2146) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + Alloc param_72 = cmd_alloc; + CmdRef param_73 = cmd_ref; + uint param_74 = cmd_limit; + bool _2158 = alloc_cmd(param_72, param_73, param_74, v_266, v_266BufferSize); + cmd_alloc = param_72; + cmd_ref = param_73; + cmd_limit = param_74; + if (!_2158) + { + break; + } + Alloc param_75 = cmd_alloc; + CmdRef param_76 = cmd_ref; + Cmd_BeginClip_write(param_75, param_76, v_266, v_266BufferSize); + cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); + } + clip_depth++; + break; + } + case 37u: + { + clip_depth--; + Alloc param_77 = cmd_alloc; + CmdRef param_78 = cmd_ref; + uint param_79 = cmd_limit; + bool _2191 = alloc_cmd(param_77, param_78, param_79, v_266, v_266BufferSize); + cmd_alloc = param_77; + cmd_ref = param_78; + cmd_limit = param_79; + if (!_2191) + { + break; + } + Alloc param_80 = cmd_alloc; + CmdRef param_81 = cmd_ref; + Tile param_82 = tile_1; + float param_83 = -1.0; + write_fill(param_80, param_81, param_82, param_83, v_266, v_266BufferSize); + cmd_ref = param_81; + uint blend_1 = _1399.scene[dd_1]; + Alloc param_84 = cmd_alloc; + CmdRef param_85 = cmd_ref; + CmdEndClip param_86 = CmdEndClip{ blend_1 }; + Cmd_EndClip_write(param_84, param_85, param_86, v_266, v_266BufferSize); + cmd_ref.offset += 8u; + render_blend_depth--; + break; + } + } + } + else + { + switch (drawtag) + { + case 5u: + { + clip_depth++; + break; + } + case 37u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2263 = (bin_tile_x + tile_x) < _1020.conf.width_in_tiles; + bool _2272; + if (_2263) + { + _2272 = (bin_tile_y + tile_y) < _1020.conf.height_in_tiles; + } + else + { + _2272 = _2263; + } + if (_2272) + { + Alloc param_87 = cmd_alloc; + CmdRef param_88 = cmd_ref; + Cmd_End_write(param_87, param_88, v_266, v_266BufferSize); + if (max_blend_depth > 4u) + { + uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; + uint param_89 = scratch_size; + MallocResult _2293 = malloc(param_89, v_266, v_266BufferSize); + MallocResult scratch = _2293; + Alloc param_90 = scratch_alloc; + uint param_91 = scratch_alloc.offset; + Alloc param_92 = scratch.alloc; + alloc_write(param_90, param_91, param_92, v_266, v_266BufferSize); + } + } +} + diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv new file mode 100644 index 0000000000000000000000000000000000000000..fe5eeee188be90b85271b37aad93d5f1f23c237f GIT binary patch literal 60516 zcmbWg1)wFx)wSJl?+oq^2@LK83lbp60E4?E*l-zPE{rjP2X}Y(0E1g_5+HbjTOe4F z5S&0DXbAuFoYQM=-A?l5{d3;8Q){igYuBz_)qVP$K6mJtbpFYzYLaToYVxX@qH4Wn zs3t|Js;R2Jt#-ghTW++}*l~lGTK>mN>o9HA(el%0x@xkj4{ayMh_3FD8YY9x&M`gx z)n`z|S$~fX{%=y+`yic+JYb`N12)}sz((r~95j0Lz_H`@89rp-@a~aA1`h5XGi1=X z?mdU-H+kXLJ#x_S@qjT&?^DeTA2BhJ_82pv^^7@XH79)BsPTh_4P@j+#*P~^ ze$coj*EIPh$1l^$zTIzj{4$=o&H?Y4_f%Cs_+DeW#|`<9aVP{n_WdDk8EbBMPppn= z9{8SJ!||Q6S`a*DNY~*17#6!R=cg@WE&%U|IaRedeC+Vdt-@2&5Q)lYl-4s`u8{F)QDR$Y~UlaDdYXI0vEeLAYu zz{zb5@Mc}(hFJv9L%FY>8gy3u;ahPk#7FL{^jRBj9ChQ!eeJT#C}z^5O#&WAtg*Cx zI7`=~wcr1HVs(bC3b*I8F8wl}0pRw0`cxZ&&2`HmyP9q*7d2mHz8k~!Q#W6^FLSeX zP1MYHm##s(4<0jWw7NNOPHW%)cg}i{LXq>P^h?g0f!jIvt+oXBA2E2#abrg9-rkMv zoYZ~w0GO~)I`5b zrKvN&C5Li#Bc}W8=d_vQcHs6LJF4x$?KSUH?F1(7y4}Nv6SX~8HD5h*?W}f&*M6;e z%6&EGxii|ouG1`)g{b@g_gU82w3e%9j;hYE+Zj5)LB!1bcx#wAKQ~S0=Sjf0{YMQS zHO2|Lzgi~-de8j&RJ#>^z}6hqe2v+4m#%?>j9^`7soZ>t`QIm#p&PSl$t|rlml4Fw zT)a8RT8#p?=hC-gc*otSd*pyIU4vW8)tZmGuW~-VsrLA`*R}OVi=OMBwNP`tTK5rL z9in;P(bD$m(K@10_&&T-?hEf3G=BuY}ZOyv%^qsXj8J@oXea1R+ntd*9kKccvQP1FwIn{=i+KSy#okg2Doej>M&H-o5&IdF82IEJxC*)eF`RbWZ zXLTXG_G_(|+?V(9?YOg4m%(eF|2~m=#KtUCGX*l~s4gL9=I~2!=5Q&vJ%`R}LVMiS zf|$SDS5N-aRoB4DZ6Ia(@3|2vNtnY`^i2*|gEQZo>X^Qn{NIEAhc_W&Io9>XST{@@ ztE0LR+@9~O)$L&CvSsVW;(r;r-IF=yExtAOEydWk7GvKwaqQ`;UuW!5!@K|gRMNTK zS;W6YehO<|XfURNucJaTA`XAnD+Uxir z@v~I+xa^@qfwW-&|9l zF2;SP828zU<4#|_SdY8bkX^e+dRhIykKNj{e<;RyVd5CGRIk->>&xQ*J!;SC+3Skx z{(ZR^>y?RPbyTl{+xu$f>d*DKUdaBJLCXuEW4=+0`DQWZTNB5ex_S@X{$8PV2i^Ke z?AI#wRC+#6;^*hqcW8$UZ+#SP)$m<-d#_GjeGG1Wm(V+uU(dAqe%PyTXY~=h=lhuH zs?Uwp`m8@_#DD%k%bAyJZ2Kz@V)v;&r7ykO-wT1w^>4M-zUKC+KCAig?h)PIJoTw& zufC&g&wnUa;{W(5x)`*s%vOA`TC%|H50_s0TOSODw(bo&tFMXQnp11M&gvUl^K9qT zS$zw)Tr{nD{DXEl-=)RNwq z=I25E`2UtZWXu>pLbTqC)xDp@zZJvrrqq#_ct)_k)^lF)=&mtcenybb-{1>2_zDfa zQiJzz@O66ej%p+Lp7mHe^s;qUJ2m*gUc93k4sU;7(pilLkLlWLX+pK~9^1>evl`!v zcT{`B+vj9QbpSYV4gr^O4)4{kvpTZDk8b-flW~u0*iUHiliL2v#{aa2{fu6`qdFU& zJg)^O&l|yIo;UaE*IC`#;J3H^C+2xq!~UCIyra4Yo^c-qXWXZI`E^#$Hu!T5{%V8& zsli|G#XG7u;Th}QUbfEa{RaP_!9Q>CFB<&IUc96F8lJJf?`7+(CgFrB&x1*O@s6r5 zJbkC`W#c>O2A{6MXK(O18+@(?pRd6eXz&Fae5nRsy1|#}#XG9y;F-(Hy=2ZSau|KDxp8Xz+a+e7^?Yzrhb{ z@FN=h$Ob>L!B1}RQyTo720yRC&u{R{8~lm}zp@wasIG=*Z`|0+)>-|k!EbKx`y2eh z27jm*@2DPuXRIfB**dGI8vN;AyrX))==*XnTW9rZgTL0`Z#Vcm4gPL}f7IZAZSYST z{EG(vs=>c*@b4RZ62AE9buDV}$s2si2A`_IXK3)58hqvkpQFL&YVaR4_CUD>V3u4Zd1~uioHmH2As=zFvc`-{6}z_~s41MT7sW!MAJh?HjzS z!3Q_W2N=2EV=+?^8Vj_kP|_4~g+aFCVT?4gO4nzue%j_Tqi2 zx8aV#rwtwkweMj+?B&y0eca%m^x}Q1FX64v6g;gQxJ&oQ_WfID^%Z*SLmtWd{g>QV z&-Y(^hC%B;YUI%FUG?eP;`)pnSU;I0Safcc&`0z1ZtM`;(I;#9bJ^+{-w!;kK5d&r z4jao6X1jvUxFCSE5tm3**^-Upp&M4mW)U-Z%A1`UrP<4>mE&MKyfeJ01&HFD&r z7&3-GuNXI`YvkD4kXTdJu?G4ebE0kX);y=eHkh055u-+q;!Cc4^qV-=)Yu34RYh${ z%xR+U(mklh){Z%CY&@GDw0o>QwVf{ZUiq|ZJ3Th@>78raHbZUe9yzp_d!N=kX2ix? zID2w$YcsVptLWHmZRVEdjC%5!1+87T>>_RLxy*`f;^*6erM;cgY}g%PT-UA>t*O!M zwT+$iFSa?bInRIBZ_e5_xNBTjI{KZ#Hxu{uTH}2%;+)ic4Pw9B`Wg5iwdQBu4mPCw zu6-{;X7F8y58QUtnB51A88v?N`fJ;#Z#5k{@5|%5$90ctd$;xJ(LFzEi>D#}oz-mU zJn=RxeDc=safrLos6k!B*B`kjje(4=ohLZ>n@8*1pp)7>rN;LGt8thHu4M0o`j(Hn6dNf>z#ZtkK|WHhBL!M!tU;Q1hPeU;0!Vsd+vO z9x}T9EShKR(e@oOW>jr({7ulu3{H(Yt4-m1jS25tZ3efpTqynDx|V&a&F#l;B=%`b z%&%{?1=`p_W4gxi%baqKTef3N{ElPm2H&O^pRC#eK4u6%02wo|^=PJ?$Bx)WbnV^C z;BT0ALNBfnt!&%t+P4~1+uX-7nD4M!Z|&jQpgyA3Tj%gZy|WsF-t*mBN3|ckyWTrT zfX5EwU0~dZuF>p3YkMx*pi$#Tw!atatgg_%-niOthL>x9M}z+wp0ip1dm8o!dhyAs zN8sepTCSdVk#;VR_44hko@nqVd-0CyIe2?+eXBRXiQ6-uj%sq=FxuxV=Nh;*FoNgz z?Altj>8$!`8#8K`A$6@*hnMd&o@f2_=Z8Z1U4`=LCQC%r+U6(ql8^C+>jh*xU6?mT-->S|xRqA)Ew{YL;A#mdNe0SQXdc4rv@%mKH z6k0oOpXvp))^)PpG|u@Av{pgtT76XX>(Tk{8NF4j`a)vgf3~!q`b@%&Nx2^;ZSXz~ zK3Ol`x0(i?`IY;62K4scovc~{JZ{vuuHo%6$E{VXAL&=W2doS(zr+8@#2B5`THtd3 zZ_)BCe}C3dZG+a{hn>|9;ITvYXkA&I_ipf>`gc?#;SAf#VNWp23+`JT0M4&Mt$X&> zA?PJP6kgWt@CHAP?Md0eIc zVCT_UT@Ow$<39|iwymE+W=@Z`wGmxocgLeo^%A`OTS-@>9p`1^v@Xc)`h4B$)2=)3 zpLKoNRNTn+KJR%3fL7o8v~rpeUe<6Xcv-_)8hpV9U#J)FTP*_rclB8k{a@?Tr&<=h z^?p>pA$YE?XpC0P_}!tHO=~UI2T!~otZT6eTAA-=4ZcN#@6g~oHTcdAzDt7-h5Os8 zti|w#ePn}=Zt%Ste4hs2ufdOQ@M9bN_y#|*!B2&IZ${l(>wQ(jeocd4*WkA`_#F*? zSA+kq!5?k##~b{~27eZAzGJ%DpTVc+16R4eGdB3l4L)y!&)?t+Hu#Ske3=GcuEBrY z;48!PTO!Z#)!=-znD~68KfJux*|gDr^9J9t!G|^YZVf)X!S`(Ny&L@K20ymJk8kj^ z8vL9FKd-?rXz+{SJ=Z*bL({Nd+2B_<_*)JBb}!zy`VgLb^Pb;=^E|glZ|(cfdiCq9 zzG(2T8vNUW4`5v<1vKqkk&Axt2G_KE2 z=L7o+pSMk0G4mZuIga14_-bm+vGps*Hg7e%+t=?GqM5In-vLD99i*lCxjmXS&~Nra zSy+ZT54YiF5%^Im~R%m{=p|(|_`JIN^&kD`&G1PV_ zH0!IjbD_;qYl92T?<@2h*3$URKnH+V)+byaU+zCO-P~_}Us3Kqx%d@%wT3>IZi}dZ^Eg7a>}@`%hgyHxb->`pnD!I)*%b^_`no?g@4K%nNrvsOjfe zuBTX^c{#saOMRmo+w;O)qv@~je6)@qUEhV_#){VV?OQF0R>sio9G0VP&%d*E9_X)T z{2$lY^Fv+#pU}4F+^6EFHZ5)}eYCGZn|N!2?WZ;yb6ty88(W~|+rQ@4X>G7}HOF3u zR?XPPu~r+<8c!d`+K|?GuA%2rKaNQ_jJGkZc6H-zLaSyxARmH}?LtiTw+(ebn3w2hbYRy&ylZ=Ej+uGvFXv`|77HcKeva!L-IZh{GHX zp-m2srQIBkqD>A*gYBc{n8(ms|Mr-jt>1R*tL_-b(dvIJhhrR1n=y=~-7!w3%^0VF z?W3mc^ul&VZBsMenY4*#KkfE8pEmI>02@zD+l7VgBCs|!<6TUfc=o%9!#)#e6YmPJ zebhWJuB1&LZN|B-p>@^z%?;gsuK(Q)-Myv$Ktms2>%VX4u9f~zHFWE%{!&A?PU^2W z^ov@0pXx8Q?mTC$wGU{gr$;tAiaoV-2l+jlCDx*uDqEeoF1H{jA!) zc40pk?fdV0;$Hyw-bw$2UW)_(gE{jok4rftT_3BbliR{|iix;x1O#_A+>4-3dOa_H{4a zqc5-6pNCIfbM5c8?W=nZdmryS=4MxU4=yeb?f=VB#`sUz@xAABT-WVeTFTJU+_Q4; z`_wX*@8Pbq*V#VQ!`Miz{(a#rZA-Vme6m_I{^T{bPvXslW~cajA88EjuFHyW{q4UB z*j%hf>;uuVPWlgqdrug9H{5&F@R4xOp&t4Pc!2Vvda!>BMd#^2b+&yWntD61f-fycVj^Di-C*1GKOYV2%CHMRB zlKZ`P$^9-`?mV2=&HPWtK|TC;@P38;arit1zmb!DW_UmPzYq4@_nyLi`8mXx=CA$B z+I}EBhw7TEsB)OOF&Grf3J2i>y-k=*Bg-f&&LMtiKl)x!AB!I$U2S13%g zRU_m1dkI@>X4`>t$$h?Ca-Zv#+~>I^_jzr}eO6oY`5S!U2KPB_{C#iq`E1F3HXClc zMuYooHufv9_b<5mfCl%uZ2WC~CR=i!#g^P>u_gCeY{`8F8*ctSe=Yfz1y}dEYwWf@ zcP+WkT}$q>){^^-wd8{ud{~3~95w#F*ZBN2+}7u(CHHx0$*(B5^YeLWY4>?)$$efL zuD{Pq!)<+DT5_L>mfUBe;pXcz(QsRziI&{wpCzBI!F>i=+WR&5as@YEpNqzSdAQF+ z!@Vc*d1$zHpNEFq`aCq;cpDd7f1iiOZtJtplKcF#DIbz9uj#zS^A(q_dh$Y{- z!F{e++I_BAa-S`RJAa=qhJOL~xnj7j&lSV<_qk%Y_3^o4_y%yFD~9XubH$STTru4C zq6YU_VrlmoV#$4uSaP2umfUBECHEO($$frUa-ScTTvBqM8HQUQpBaYR`phug`TEQ- z-1_*;Fx>k1%rM;6XND#BSz*b2Mp$y650>0#gyDXU^EqL-*q5odY| z{e4CluD{O+!_C)cgyH7vGs5tL;XWe_H=fT3!;R-N!f^e4Mi{QY&j`cypTFShJ|m1> zf1eSC>+dtdaP##UVYvDFj4<5RXM`p98DY5hr2Px-cs?VH{R+6x2*Zzq`;0JLf1eSC z>+dtda9f`dhU@P$!f^e4Mi{Q$XN2L}eMT5=>odY|{e4CluD{O+!@q$0j4<5#_>3^z z)@Ov_#`76r_!V%U5r&@u_ZeZh{%;jrf1eS?ZtF9`aKB&i*&j!Pd=d;0ZTb~Vv zYxmipd`&(x`2Egz-2bm>;XUXsU^lD;AV_P(bTh#76Yp##=>B=to0ILa|>S*tWVZy zDX>23S>qppv&Q9KUm9IoYOoC0{_2UpEZBM`{&Hab)Z?=}*ty2%$6)=`t)-uB)#ASr zSS{nO3|8}dJab+Jyeda>_IoD#`rXoMwQcjd=0DZO;mbOEj;{{)^RoNd@#Xr7*PwMg z@tU-bxiY={o@Xui;eYlGFw^JN{l`ex)g0n@r*HT(Ccm8buDV70RU z`f&Bsa09TK`Q?1u5U%cbn#LUfRugYT>p3OfxWJnfc+&#=`91!d*Vr0chpphw&sba1 z%EepLrq4EDzl%%lwuM_e{q@sspDk#$W$r%%H|PFyH1*7VJFuGHgQws2@bojTTtD#+ zw8?A78n4NkdY#`Ho_#bBT-Lk`uKm7xFYN+W^BghmPGGg1M?=7gH5i=zJQS=gadri( zCC(tQTH@>mPMmIV;_MFAmN>(~YKb!ptd_Zr0w>N$a5=ZpaP5h;2Usn!Mu63_ZMXdfyU#!N z0_$f#u{Gb1*7)|>zqZ*L^8nfdITG`r+HRYi4+b0KhC1g%z-sPQ$Cdwr!&r_hPTwQJ z_65`V1sgMa`#7+gbTou#FKevrEDkmE7iX+EOp7-7B!RppDac%&2 zaHKvrg7s5Rznj47K8K9oui)mT&Go#3RxR)6w}8v}IiBz5x1wvy`}u8PHD9^*-wwZ7 ztv+Yu`X%Q(!H(y+chJgL=A8BZ{Vwo=wD#AQy{4Z2zX97nbGaL==6!AQzZd@R^8YQm zw&Z^wSj|`RzaM^&TAjaKKXZA2*71_>gJAR3UzXMg)?caG1{KFje4hq)P7uD|C? z_PedN=V)`yd>-t%VCyyR1=<%meA)MpwcR#zc?oQcTpL~ntK}Lm{{x4yj4e*zSHRXT zeP0Eu&BWpP`X^dnUdOb(#-U~$apJrQF4y2KxNBfNj;8%H#~U2_IcK>(&MEP1UuPW8 zBagS?iT@XHnddujwdDC8IC*M&mqX1w#fkGixXkkdxV3wLJUu@?1iR1GuS0VWd<3?i zHf!=%T3^;g+s7PgjwQ}mpMcA;K82TK{SEF|>W9?he+IUnw$$!(u(i{c*nbDBhkpUK z-Z^K#1gq(1y|t-X@8oJLPn~R=_1EtHb8fa7?<=r#$#`Fb)$%^BP0jw<(=Wi&{~NHi z$-es*td{ls4xAcm`v-@bd59C|pJ3yJe-GZQ;FPsjT_%C+qn`Rs3bww+vQ~2a%spe; zHfxjqoppboZzqpFV0GhdPG5O!eZkYArFN6S)ttNj+SDBXY_M~-P5&vdZBpIj@r-GYv)|FKj83XEq_?s zZ8NXA!N$lu<^ika+9{ut!&v4ZPTzUK<`O&GmK;KL-19 z4%(LI_z8!3h!bZ8uyInu6~Ss{4OfC&Lv87|GFaWY`#e}~u8!*%juqckz+QLaD=&TR z6W>+A#tUB!?3yRPpK9aK&v@Ee8uRo$Y7Mya@%rPv#+qR7HPnsS`tC+Mhy8tay_OvR z$>_f}Sk3#d-fy#(-hOtjO`id@>KT6vu-C%ObxXKDO! z`tfKda+trZbsUE7bs~9m!^``G-QeY#?GD#RJ$;6Qoquv40oPC6>(3}!U*@50B!`-L zh!bZIaQ3A8Z4BJ~rXHWMV0CjJO)HOWZ}1@ch3^B`C;M$*us-VfPHjK%UL0;R?diKe zSX*i&x8}*)Hom_ATSw=55UsCo`NX4sAjjJ918CQ)+nE_l{tBPhZTbIJyI|c~7yCK( z3$%IPHnukVWb8x0I~8+26z-h!neH&KpXbz5hr_}4)8-fl)2b!rkzlpd=qRw?@%g=h zw)h%(gggnX zp3mB+fXgwB?{@{KqHD`%?bE<&zHC8(6!}ybw1d>>W+67tykZ<>0y4f3Z)1tC_d{SAgxK?zorGs%6}( zz!}&3jKsYfU0dQ_16GUuTJSu@xYxneGVb+Y`=~qam9%R1H?JGP<}x)`R{2d}hHw4t zl{SgLf4A=qaP8@PGuU|PdkZ*ywUvE;1=pVW-UiNm%YAV>y0*;s4zOD0dndT8^IdSY z%=gz|`=}?c-++^s_A;+q;o6heJ>cZ!@Afn9z3AGK*KfgUvEK(S*WrG+TJm}TY#;UH z^&mKTX)p7-8?HU^9|mV%yH1b5&0V|vcVO2i;D0ka2fA8xcegUo(HQX-XFlvaM;gya(y!A z7r|u>{s=E?@Df}f^^EZ{SUoXb0bA?vSHb3#I=%+hM?EqA1Xj;=`gO2z)y?w-TD9z{ zH^J^HKgYPnsmEJr+A{W^!JbbU=WVcB{Qm+zizDN{1J*}9G2R7V!jTy7fz|T<@jlpo z>WT9KSl!RC&fC56A>915Ij1*h)soA{;H;(oe}(IlwfhA85r=;Em+O<7{0(ePtVenN ze}<+lexDb9<@NaQ=-QI+7hwCUr|*~Gj8UG;U!iM@-`9m-dF}lMU0cTZ7HnVj)b1Z( z$JQ3@JFvR-%oyLxIhuRGVr#QT|D=_R{k^Jjthe{P9dN$~(dKVg<@sK@6YTfO_P3vQ z{XT_j%bN8CyJp(_y{lZCpZ_KY>#xnENCV*%Q;l{o4rr97nE?*x%RY{%%IFv2FccW@g%1ID9$wthL=XW6uUQM((|52dm|K z8Tm{c#xl0}KYc!%16w(dIpM~BpguR|0=rMt^L@z=!1mMT{qNkgzP#7h_CpRe#}+5= zdBEjZ{ov(T^THiVJ^OGzu>G{9w)2Cnt+vEo0IVLqAlUWDwO}E*ntrZ?XOFm^&HPf6C9yev zN3lnjf~%X4zu}f={H4LpN1MOrmOIumwB~B-eq4@rc@AINe_Y#b6Ke&qF~U~_J6`tZ zN??7|-6Qg4IgIUCV)O8K_)|^(O{V7LsTD`X0CWke^#z+opg4J@4 z$XDesmN~RAv7uFe{>NMrYi+Qx?yJvU>xUA87aDB4R z*9WV)|FfnWz+F>qiL)VCJ?G*^U}LL$E^b2W%Q|b@m_yAuiW6s3u;Zuha@RM$wsPxi zU)%KE3~W6z_szj-*}K}*?4Px>&gs7;*!pIkTY)#Hp6c=08mw-9ThPj5+ZJ4Y=KmR7 zpXBp%us-U(C;6L|?ZA66hVQ}J6Ki|0Hpkh9Rvz1q;Bw7&f}3}~a~%j)bDvuKUBJGq zy|ylnT{*0uIB^Gojgz$>3|1@GehA#P*Oq=m!RqF^Gp*bjJFa6mR%|(UVzW)0VPN}b z{BE#wRk!~Rv}*C+9c;c?ui;?trPbpz0<3QT-Du^pjRKebN5j2$R*%mfVD<1ZVDktc z3pQ?kKRFJpk9yX9JlHte%wr_2T4L-4F7wzM?mf49eD(pWhwlq6^Vkn=9{COC{$PF7 zlgBT>#?fXTd(x_j51{pLj*aiV&Vg|6b=*G(gMGPw)DPlNb8W@R=`gVKFy5iG^29p= zoO`Fk!E*0yyze;@{4uS4v?bnA`fy}@j|O``s~(?Y!0O@0g3CU~!7reXdVG!ttB0Qe zwidZ}JQ1vqdVEd-yKbrF$zc7|)9)0py4Ujfohs+h=66N?ndfO>b?bCGt@9V3L95NW z_-_cD3ErGmA8q;^LaUy<&IWJ8@o@bfa1L0_bM|U9*YaGrdfvm%0;?JCJlb;I&xdPw z-7lo|<+`h1z@g^4i<85};PQLFOW<2@q%OY%`@Nrf>T)UAe%j3AB3iY?yd10+{|R6< zKd0o{c?CS*1!_;Gv-yaYKeIR zSS@+o2+lQKTl{VUYct06wDQ<~1y;+N-2zs-nL~fqNzLnO*1*;={f_TeuzSPz5s$>Z z4NY6(-VRoCFI>U8xu0Ik0}}?i-(L zJP$sC*7(}f_YYum_>|p6Q@z0d=xdJd9qraM^|6&NPiwtw6YE9rk2sRYAHiz*?SwWp z`!7Rl|K(_#pDC=%OX!ZD*ss9-ypZ|43U@y4Ic@QK4Xn)=FVo5$?@zRjXX`!u8?iaoIUoK4Rx^M1+q+<2#?kf;hnjK3iSr)V zd8dBwgVl0ht4%HbAAr>|?uTGC-FR-{ESh(U;)ls_hF7HFFi4tLr6Km#2od8Rr|Yx#xWO7Odtu z8=rrG&1GtKyw|huz${g@K8Ln^pZ8C&wyekZVB@Jf$FFJC5_b|V&}zAlnH28l3ibGO zfYt5q1zjFnC)m7lU)%?-=D8T3zO_%TF_Xd7^iAE>;y(qr{H!u1+;KCfslfWE=Y4Hz zaNgInC)PAzZI0vL?8;-C7Q8WgB71QubdWsWgX0SDM4FB#| zuFdmtR&eeu{TpC;?k#5ndv9qUZHYHKxLlJt;NDxR$7fEkdiY%6vd<6T-dn22=Z9eR z@VUVYGym+ndEok}$EP27etc5F_43|wez-Ym%f00SV0G)XAg%KkFGQ;? z_m&HTy|>gyn?C-1wR-Ye6kOidEe3awB&Wr}-q)!o_a(sg(`HQn##${gmjbKB|3_f8 z^1g0qc<$@8r|&XgZN^=aRvz1O;4<#=@RnA8|LbS%AEPC%_Vir=tj)Oo&9^+ZmB8iw z;>vKx&i&#~z-s0F;wo_aX*1@Ev}%dD8h8Zj9sX0e<0j_nV6`&l8gToyY{c|$&s!Sj z0H;N@Cfsw+>-&1NzI^VlE?*nI7Ol_w`_uZoe{F5(>(H`|t95A?q#Z!(`KNz!+yLyD zxfW~)PA#>?ZzHfaua*8gHyeZ1ti$@Wa((^Ww<&l9>Y!~Cuw2^&Y@37o*S5{T@{F-1 zctCCQZ|LQI5AU^LEAZ#E##xmmEKGm)A1&pL3{LOR?`wu7zAZdF=>J9d-cAlh;6S>aY`7uFcPfUEmox%xh;_xi;%Q z2yDHz?LsSGnF_J&)nM>#>nRq=QS2hJ=djiU^TC|={FvpevT>EuUuEpt;}O@aOSZWSe|+83(h?D0n4?S z%l_c(@BP5??C)QI-QUKupZ1J*09f7J99N$44g~A3&2i-!?;x;ov^lOk`}<(9W4gZ& zrS;|hRzHM8&HXGc>!6mTSjRU^Vk{FU$3Hj~oL|ZI1@a zwN1cw95}T-7A)81IdcLy=gje7xi-(4lfaJScqh`zbIzO$_MCAnW7=1rjCo3}r@nG+ zuIFjs@|-yxo^wW9{LTPt%YHi(d@6_g)VOke;(r#{x@OOw4Oa8MFtt7ho?2^9-*ds* zQtR`;YL4x{m2f`TvGW;2u21@304~SA5Uy6fk6#4O*xJ+gVz9Q1eF<1CWB(HD*uMWc zZ@E6{e<`>e`!cwi&zCdr%i$SYd-_fQYs=VIfYmbgm0-tSj988>*T??uyQ{(3cUOVs z+9qJT7My)|4Op&i9JcGh#x>@3wDMf5ZU8@mkMZoMJ$v9LurbSP)vwUhv+g&8)o$cS zzgxiR=a_Q+%5&8B|IF)laOQOzSe|*^3C_Ik0L!(xw|@=JwdyXgJlCq->SABURPDsfo{^~8S= z?B2>gdI+3-q%D3AgSBNJJpxuUFRvkTecdy^2dB2b1Ix8d!1fq8wS5#U*XFhA3Gj>@ z#(bPsuFY%JQ((u@_9U%5*Q%$%UaK6-nD*5tV?I;s_H}HzHrMkx@QNIn|MTEntF*=M z4`6NCZ!dtK<;b;4u21}51efRRAK_}{wdy5!YOOteUj}One+68|dKIo#UVC1HCzkf~ z{S#PQ`0L;@)*EoO@>=pHJh8N=?^|GP;eQ5~vEGKO`57to{0ls>w5RVoU~QXKlePXm z^LN3{+k72IyL-)lgYrGFc5DAW?FSszKG$lwK8gJyxXkM#xSIdwZSwjUp1icD?_a^% z!ao6*u|9>X`S0Z>*5BZXr9FK=18duiyg#S?JBRVrotyuD<`=c@+`a^V#o^rYULe;e zF~0_v`FsOc^WW=DKHtKVkM{Ka2UuJ9ci=MCKjCWad%*Vd+VA0sr9FN9@S!bpn*?rr z^~`NjuzKd!0d{VAuaWDMn4RDVEB}2lxwZ+|767MK^MmCXV*!OMY*-v}MTLi3bE{-eDc#DGd z*XFqLjJFurINBUnp7-s=!H((s_L8)|eBV}If36Ut37?!1Zy*{|0bL~wzb6^ z4{&an>pEaHe}C?N^n3Aj;p+F}BOd@((T?K~vA#YztQNZ%ESbXYllMOu2sL{$7{$^;$twZHLW$>Nw?C z+oNmC_v<@=)$;xNPGDc2Puh0mQ1g5en~%?P2ZD`XKF{45O+B%@z-o?fo{3|dXSKV4 z9mm}DcMQ+#!L+{2L!Ut$yK7xGl%`r)U#H<0IRj{ciZo+2f)+MxytoRj03?tF#p6j2(FfU*_N&FKN!3t z{^@@RT%Y`n=Am#;kwrQ5)$X|V(N^vo_fO&+1}^vA;c&I&p-ru<_W<^I`X5>M&ojEC z;A&-mHT}IV9Rtq#91WIhn}F>&aMtHousq|O0M0%=9xTs3JrV3aHJ<&nXS|cZ>ej+> zRT{SDx`s0UJk~T=Jr{c11>dEVDaO!XtSf0Gj z1*Z<@faThJ-##Cl_wDn*a&x?Z_DqhvPhSZ3ecIT@&{v;~aZ#;j?s9Fu_gw-m-=}{G z&-37N@VCEy+H>As3pPgio^c(Tdd|7)!D{7q-8aC~&oSltWnMReJ@2yLZ-T33ziU$~ z*L^+C7x@gd?u%QnC*G~#$}S{zJqn|DO6`xLSUfqfIUTkJSF*zk~a? z<@wF`@8SBW$LCRS`M2ed!TsCv^m!bvkNN~wnWlPz|1p*}YyFfWFKeysNe(q@Eq1

2rb@;z&=Uvhf}U;mCV{8{+?@SGRV!SzwkXSV0T`MX{}_onY3(6!xA&*25I z{neerA89iOZ7*`DnX@=?UIP0!jNvcC7cO#o1+I^Ja(We)BgixwZ+|{sPYR>}{|-_Z{zm zuffMY+A@cC!M=Crx9#u2)x5_ruMfb!+#A~7=TI{*apHUgHco!K@G)G?-^)e&D_p(Q zK7p%y{~5ne;p*n#dHx|-En|HSR@3$wt$bznh5uckzk_F{RoDJES~dGxzc0aCkf*jU z!17$nzpicNwfq}2b^X7hmFw@n7yS>g@#Am&@8IhCe@iRR`^Z1R%i!a<#?tOSNj+`l zS$Es-2)+`B|GgUH`!{gDrs7zUF}OT&Z`OmSsj>fF|Fj&glmES^{G0Xuw}$MKT24ZZ z?VlP<3fIRuIS08usb5EJ|0i+9$<5kZcVGUuG?Uv*U~`y-Hvb)+zVtH(x&LnWj0IPp zx54Kxxbs@D=H~40PNwGYd!DR=_ZHK@9s7Mol22dzxzBvh(C+-qMO*6O?>E%KX8{{0 z*Q{CL`lzqX_{N?M?D*c_=kxgNaCP^zpW){OtNDGPKK7dnte$i42Vmo?J2&V0L$LM9 zdzM_k*yk?n=FFC<=0Q`>ePlndTIMt#*f~0XYb}p$0kHGcX5HmtYi=C(vioo$uzS(| z`e`>O&x?h@=InadSFV3@UId&uCFe!a)V<~`PV38Sj{0I8YVISkaa{K$IIO$1`D?w- zOV)bU+14@KpG$!q*S#w?o^fMey0DA0x2>uF4KiP|ak#gg=j=6h&-&jiGoJtbT;nfS z^mkwMP>mSDxJ0 z1v`c|Mn zDOj8NIWKu^n}O9FL!3SBp7Gz}_O&oa>bgjct(~9o)y0c(nBNk#o_}gv5>J1Yf8uys zfz>nq)?nl0`ne5UA9X*ona8$Zb#oLaN6#DIFMTb^ksOz*u{r#x(Eabhnd7pw*#|oi zPd~ZsP7OY==E-+ExH0ov!R_H{?o)ltcSo=}Sa0LY#m3e@_1PJ0PGx<%(A4#JY#;oT3N?0qD{@$am1&d5AmZp#<}nyeJ@Xy{wr}pi zc7>}A<;dC%1KUrVYiB;)V0G6{oVD}cCi^LeFaLYgS-VwhY~HJJnCt4aS)bv=DA#Lr z!M)~=Ex3N;3%(b4@0w@sBjAox&V3}Bde(RpSj{zdUCco(wb=u#mO6|9t4+s|JuwdK z-0W|Ta0uG-8B$r4c5lC z9*3`WII;%o)_6_Y0UWNu`m|Yt1Bg+s!65~A4Gu53en&R=(FNE4xPt3{LW7@FaQ#m! zxc+Ap{4DU4T;qe#)U(D1gVnOe&QUEjKNPH%H9iciR<7|8aBJrpSOdA( z`RkuGJ__s_>!VE{Yp9;_jschB9Sc{Rw)XKlb38cXSwp$8te^fF??kZU>7z{_*FZhv zoeVC=I~A^WO6}u#r-L(|Yaln4_18b+oe6e4eYEN08mec!bHL6ye=l(!SnXVn_?!>c zCu?*eSnUFi_*?{bE{St7Sgnk6N$rz3zl5t9C%^Z&6l@NTVGMoTKbO&7&f%V_IGl@I z|KupVtz@^CJAlf_whlTyXtvZSdO*uK!&H*Z(&K{{ntb&2zq82RAS4l^R_S zR?iyT09MQSX3lEyzX`0C^X*q)wdpwYaX;PywiedZImyM&NB``{+raKeeYELgP1G~q z9pG}jJK<{6);^B+YjDQ1CURq05B)RV-C)PlN1Hy@R6XO}3pNM$%!c^>7OZ9s@1wn+ z!@6eA$@NM92f)V6=g9}bYW9DK_F)eDXV1y?vA_H3k=pK_6KBtPPqhPwub**b&;7i{ zThVS;<85iT=Wze+NSpokDDkrY+&@njT>aSwf3D#Azglqp|J2~G*F5{}F}OKdtJLRl zuzL2}6JWLMH*-{r|C3;~?6;@DYUO@=25uc(Pv;~TJ0Ja%&+}mGQs(mqGxQliOhkIvN+U%V-iCONQ zcMGomeuIBdaQDvV1=s(J2LH0=S*y3;=2X$9=6?pOXYafXR?FToN45C>1+136^A1?8 z+&k~Vt%G%SPI9sH(Leco2(~U|J|Cf}>+gK!S(A^!{cty?v9u?zzk2{BWt`n?I;fS&K|VcJ6{vC+&kYFTzwKYXzaF=);zWU27kvWYyT~pdiKIUz-rkG zj;WTKe+O2}Uic?it=tQwT{$o7WR7yN`RSj0I^fQ;%%>AgU4QeJr^bE2{cty?v9vd9 zyszf*0uyrrHd|Qn|s$W#O_=BCC_ES=BdAa z+MDP45!ihl!jW_R&>H`O_OKcsOnW$o=lYSfIoFpXrst5{b8F>-xAX>Iwcz@%Q*iwU zH28W2*MHN3>%Vz}Z&7gle_n9?w{P$r3ahDDY*Xo zHTeDo*Z;7B>wiRpA6anyPb|3pCpY*h1=s(ag6n@?gP&h;{Vy-L{#P{kl{L?`YI(SO zsH)G&zQ7-&sps0c0$45APWO~r&Z!l_YPoi<1Xe4roj-xQZ#@s(gL3gIwE8EX)xhrK zGM}HKsq63ll;<2;9o!FhV;W0)@>&C2=CvkVt$pq6+qx#L1$R8Joyphpy+69PobPLc z)pEYC3(kBzN95+``QbQX&kg$}*Y&~Xs=t2Plj{cHGS>~^YVGT1JJ*fi$<;a7*IYM7 z*Opv20jnj~&A`dk^HXkqo|}#%_Pn%Ta@`VauKMezJ-KcLE_2-)uC@q=&oR8#Zv#%Q z&LO#Oi>@uX{tT>^T(<)!SFa6n^YgmkIN}{>?U!741e>e=`e{$DJAunw2g22cl2dZs z8J=97Lvrl`YfG-XfYp-gU~qEvx+FJ0uSt$09!hJ!ORgipYRPpJIJtUFl$)Q|L&p(&Ewo>99RoI3{q@tHT*rdTT*t%J z#&IOqJ;BMf+^>6~YfG+sgVmDjzTo7#4_Iz~`_Vd%cz;^^CD&ho%~gN>v?tdCz-6un z!PO4rNUjHi&2=%_vaW}~wI$a>!D`9%aBy-x3@kUlBWN8*?0bOylIu}mbJbrz?aB3M zaGC3|aJ6GNlIwBcM1*<2|%fM>Ma{@SdUJjO<+ZD8qA-aBAo}%dLrP?A*kzuYFUWhr#_ALx26WH}~wh*nOSJkv)4> zjZdRJyT)hGp2OjuJ&!hf_7P&1d-jQft3TD?PuDza{yVs1lxzNbH1+JkN5N{@gN~_| zwSEk&mOc15SgqWHPr{v-buve}*!=WQKF@%iXPM8lXzKc#zdSX54%`oSV;W0)v&NSh zkJrTzJT_ZHNJ@UQVwf;Ic;kEJTc1}zg%$jR~!7bnrBXbfICJxrx(!FQ{xxG zYN@efs-?z%1goXSFM-v{8ovU+gkh|cIm*T6r+@PK6WDo{`Mi#%uD|)qQ{y+l{cty? zv9vd9e2wvVUCEIeUsdBPXs_n5uGiA$`ua9;Qdiq|8vNawC+|1m#wqiD3r#(B{4-e1 zI_BK{3*20sn`6qwj;Fuav8-k0{2sg?ee~B)dvne=VfS?%N9KHejh*uiHSVIlk;D1_ ziZ=8Ah?wR4|JvZ66x{chFADDSgs%$j7+=>s^?Dy}4rRSQKvPe>J_M_k_4*j@Jgkd3 z%EjiVzu4To&Zb_U!u!!jfBm#4kH3MNd3=VZp8fneSk2EDJ{R1HSbvABXC1!;CogLu zHy7)muYNnz?;Ei7)@DuQ$@5$AP;Ba3jUGJdH zx_(d0a$P6s=*=fBxa&H3!F|R!Wx*X|s+y;s{~*3Ol|nLBUUR~ohjlSWx!C;l7n@smQLi7u`_V^#{j?{Kxxx2hEAyBKO+9!7cGyU}kUu=UnvP2^eEg~6k-+24NJo9p@rc3%&0 zWL+PuvFrL!jmOhI%;CEJjyCJMI5Eq0U9!QKD!A*qe8GKIwnD)jW5t@Mo{PZEp{(bk zXzE$l#lULix-J2C9@fPil4_0{hlN1`e=<^ z*T-snFzw?UuIrPuS=ZHwS+48q4ZcRfUDtIB?sJj#3ho%|*F5$73EUjYbzKEbJ?pwE zSgl;wpTeDobumY|*!=Vto7;iKx~>WDM<4z5)1Exm0yp#MkEWh=T^p>Hb={v>>%i5s zt^>fy%Ua0I#X9J#-$C@-0BpUrSrd8IbwltG*z9jV?ag(44!f_XII^x!*VuJ^rpCw7 zKFi^{K2Mu<-ISQ+x^CX!TNK=N{aL|}hHqDJ$JoB+spm#;b12t!V>I=w>n32ea$Prr zI}ht(j&ia2=`S|7V~Ta%65fwK`s=4Xd29u4=CL)Jde(Ituv*sjC}M33SI@fs9GtwY zh1^`MgTDG5OTQh!)?1r3k!M|Z1fPt}{`S+}T-O(|`zr4#{s4E4UDp>lT+2VwW-Ysj zQ?BKp1|MAW)MO{PamuwBh^C&k*%_=>uFWoRb8&8tDHl7Q{$j^Ey;z$e@P72sUq9{5 zIlqS8*GnAP%P-g1IloflGiYDsaQ=Uy&HTHGS-t8GUDr2jd=c$i9Iorzv{~2viCM1e0S$g&!ClwG3-0$YM;81d_)#@aJ@-r10^RO=FC>NWb{$g|6k0bRu2;7f8`s=4Xc^nLG=5Yv`de-$&uv*sj zd}192SI@d00Zv}lLT)bBL0|nYq~Fotw`jFl6M6DH27DPd``b@@b6wxV?&~icuB&VG zPK{lMcR5_k_i4+%Q^G&y@bv*l`hHmBDQG|9ua8kkTb8^k|H=@VF zr<|nqQ?SV}b9GZImM)Y{Fn$JV>UG52Rb?f4oet#o&JZ;u*a$=nVHcxGiEw_Ho z?^N(~wDz~3c7GSFp5Fx<<8uyQpK#>d__W5J8-J^@^Zty(n*5#Cn#@36=M%52$%O^K z2)_vhcTKJ?xbd$oxc=AIJbUsCxH**P#+hj9)}b$^v%uz-zsWxvu8;bc)aG1TU;Ztg zwsSbtoQv4HEXQ2r>ekbFT)>gPF~0~rGcnY)J8w1fbnPz&tK~OW*7g#(x;b1*>&qO} zf61Zdm|}Cdtmf)||8HHd02|l&mo>N&O`G2-c`jWARx>wiF3-5yr?lW4)dzxUrrpxc<)+T>qB~uK%kA z-xmIM!S(;J;QD`DaQ#22xofpG{BF2;-%gz-r@a@hb`OVp`94}-?qzMi<@hs)^AunP7T z^c?sme2iy5?b*|R02{MBCtg5P&z^n}toA%d`u!1{evT>E&wpdWx>`ePnR&g^vbFPb zU&%AC*T9*Vdra=%!+1XY3H)nX`)JGfuY)tbaXio7KvPfNZ-Ujlhe*G-;OS@Ha{bD@ z&0p^Ra~|%w?1jI8|J`1A2VI-FxVPT}ySEQ#-R1AY-{nwuFKmPM0oZ=p%*FFbEj9ZH zoW0<=B~Q)%3Qo;D&*a*S`zhF1#`K(%XD|E>?DzA=v!C|V<}+|=V=Qa>IhuNE`ggEe Vxfi~Gr=MfW^($*=Epu=5{{g6=_aguR literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil new file mode 100644 index 0000000000000000000000000000000000000000..200f16962e12a1076fc8d560bcb58523676956d1 GIT binary patch literal 6764 zcmeG=3sh6rmM6J4HHQ1mSL1~+SQ0#yLQl6-t<^v&O zC=t=(r+opm!>YCqXB-Pw?FAwt1O#jQP^kq))IPP8+7?^uymOJb+PCJ-%vv*R=FM8~ ztaa|$|Fic#`<{K1gfzJ#;>3H&Zyf_OAF40C5q;;x5*h#?Zx#SdxGsiM31al1nJ`E8ovep4GzYyNpLU97iyw#3a3}N zJg2)-2pklj8uaP;mvjWusp+IsvHf~D=}tQoZ%wK!wGIku00WFcClb<@E2)inkf(!m zxq`eQVgv9cq`4Z1sl7cQkEsvXBkMw!V?4>C8d@1Uh=F_gqPsJ7!liV1jjP+kimNiL%G^_4)MD~ONK@tGVp$-+Rhe0DJl(@--MgzNuXL*=J#{A!zXatE@l?lm|F>EzLi_(1hzSWLJgA= z=*RB;N~}c6+^%63_hVx`X2~;24HsH?COHOKHYSwGALc?k(-x*EFS-ROgIzYV&)X;Qg0UA|1)F7=F0G*fuVKIr^=%{WQ0gCq+ z_;bXxn8TYYc8bd`4JH6KEhn5vlLAbBha&s0HkulR*bNkQoAe9(M6k3U5gi($N?dB)KQhfTB?NW zEJSHdq@!on+-tfZ9+uB_+eqeWcLBc959usW@HM5bmCi2BqBlvj&Ls8 zV(bxORC;qImXl)uO7EIwyYNjKT+`7n{S0mMM%-VO#6)_abJ$T~3E6x*nej(P9^TAB z_t8kSo`?e8nBi6w`lU%bxY#{kyJ@pWnHxJ82az)H0jERA?!XK<5>6O7=P^vD#w_il zqwN#t25)q>51zA6-0dC<#&t_Wl&j{4eY5*PewgLK?y9`Eq3aoC&m`~mja_pJg3cG7 z2|mpG$SJpRu^kFt?jALpFP67f)$i*a5NGXhZ9H^{0JC;4{CLsnRu_8no}{f)6d(R+ z$(aSGJ{JG83%x(Qes0Il2OD?noeO1scb{GuP05bulYM}5S<3lB$GJvwZb%JCq#0RM zhrHN?#I_)dyO6j+B)+M=Gv|+EHHqr6#ONUW1M5iZ;J%B!N6)qqMTKjLe?PIjIo*gJ3 zfF^pAUCz7+ZDemQE-F=R*|fQM>l%aPuH1A-@`L(L;=17I=%8Wh&Y|ih1+Ede&c{CD zlOv$TnH4c*9TvHcS-Oq6wXTpo2HYO-xQ}$)2)5j}y=``1y-$vqF(qd7b|7(GiZ}}r z=h$W`E@1A!nWae;1qICQ1xU%ZiUOj-%faW~w{l0sZkxq!x5l~SDsBYNKKq8qXHYb| zUF(}8-W7Ys7~8K{bP`#7#<;i)iFJ$%N;J$Jxy+IL|>{H6t?zt*qW7vlESp)EhgCS!EtM`HYZUcY_hV?49FSJ28H!@zd{4ELQq8>PfGKX+hl`KaO4h zWz}9iY5Rv9ytm+S-z}z8AWGQCzA^=;rtnhk0b}*hsmRoh+p*PtDtLY0A5AEOlzB-M zWkCWe=8O8-u3Ae@2*H6#fg?$_CQ0xN1hJo=|L~IFRLQCD z{J)s}W}F+=bq5LXxk>MB;-K~UUz+GWx}IbIwdO@{{Lh+K z^VhKP)wT{?ye&I>v~>VBF6t-D-}}*4|GEqb&tT$A%Hr7AX7RA z6C|_=67_;r^|cv6f?VnHE7CcFSG1mK_3na)av3H_6?txv-ope%(#Ke8Qg&*Bj49Ya z`|7iU@lfn$4p z%V-j2(J(g6exMfyZj&;%OtU@wEP4a86fgrY3~FmEhJ8-ZY&gn{;A=okqu7In=^|La zLDt@Hs}d3bH|Qf-TXlKG;)6Wk`GlsHN*VP90T@mhtEcd|&pau8Kf5)$xzYe4Vh_l9U)Bnq?x7VHMi%=0XrH! z&BJDb6vcvE{s*?Tqja^DwVr{jjZrbsrAf`Ch|@KaK5WHqgLwr>$X4@=m?LeAk~%O{ zPwgGX>nR#2rAMw6&$s)|EGc+|z@u5cAiTydB}iu!kj5Am3>H0aa{_gWtddR(K~ z{T{9sH^f#595%bL7FET8Qcnc#P`X*&;(NBwBElmb06 zj$-SZF>%wo<2L$@7CwT)9h*C|qAvT}ss_q{a5Mfo^>&qA21a+o?9GG{VTbZcTH0bb z;wI_f2sRo?+UVNXybjM5hD)ic?TOo&H9z%MbfO2{cIJlFo!&%5pQMlLrcK{9E}rxc zsfMOoq*q7Gx`n1c9MQ~WmFyqcWsr9viEJMA_Fk+0URb4zgElmFK$G&=$Dii94Un84 zg~_;X)}UeqZlgcyXAL{$XYF~MR;7!l_B2mAw^G{UGXh$ut(vmpopuBls|WE%SqnvA zMZMd!%e$9O8lwzPDcfsm;}Wh$)W*rLW+IV;=~xJB1)c8{OH3+@Hn@91u@9u^w5O-{ zcs#BGOX1GLc=k)GlKsT2yQ+mrJ0{z`{`x93FYrBAfW0WdZtkvFaCpQVS>QHyD7)?m zX;#GN_)LV0%$Ll18oV^aa@~X5rol~;06n9S_#RIL8(&R82fBm>npr8Cx3?h^gM`?m>6C4FG*6rSo-5S5wwsLA1s|B zRxRiKK0-GyI>vK4xV&}ha-~7dUw?G%`lZPa!Kq=eR%5qOwbcB|CeH|$SC($ux31JP zs+B%igQV)G-?e|dy|zEl>?u-b2C0WO?jH`_)4%T+vCbJ%;U3S0E;-pSq)IE?X{z9l zWxSr9Oy5AIDxfw_c6G(;jOp4~J@aM#Ud=kHD(f~lQ+4g=y0O6VeK1uq@~a-TA0|CF zUdo@j*rk&)K3{bzw9u(=|9r+|V9YyHZ%q1i#;exq?+pvOE|d}Rrcm0OVUvuTgi)%JM2nFOHV1OzWH2f&~7=6Mg%zEV4pdR z<;|PtRI2KAG_O=e$a}9Sd%Jozk?O1kPWP@}v3!C?2MjrShuCL_(7ZT*r_#Hj@BpsA8nx3*5Woz%0?CQV-PvqBf2(;ljE8G66K`{Tkx`wj$kDVO|!VbH>L MXu)_2b^ZbVXP|}eCjbBd literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl new file mode 100644 index 0000000..734d21e --- /dev/null +++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -0,0 +1,268 @@ +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const DrawMonoid _23 = { 0u, 0u, 0u, 0u }; + +ByteAddressBuffer _93 : register(t1, space0); +ByteAddressBuffer _103 : register(t2, space0); +ByteAddressBuffer _203 : register(t3, space0); +RWByteAddressBuffer _285 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared DrawMonoid sh_scratch[256]; + +DrawMonoid map_tag(uint tag_word) +{ + uint has_path = uint(tag_word != 0u); + DrawMonoid _76 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; + return _76; +} + +DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +DrawMonoid draw_monoid_identity() +{ + return _23; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _93.Load(100) >> uint(2); + uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0); + uint param = tag_word; + DrawMonoid agg = map_tag(param); + DrawMonoid local[8]; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + tag_word = _103.Load(((drawtag_base + ix) + i) * 4 + 0); + uint param_1 = tag_word; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_4 = other; + DrawMonoid param_5 = agg; + agg = combine_draw_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + DrawMonoid row = draw_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + DrawMonoid _209; + _209.path_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 0); + _209.clip_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 4); + _209.scene_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 8); + _209.info_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 12); + row.path_ix = _209.path_ix; + row.clip_ix = _209.clip_ix; + row.scene_offset = _209.scene_offset; + row.info_offset = _209.info_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + DrawMonoid param_6 = row; + DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_draw_monoid(param_6, param_7); + } + uint drawdata_base = _93.Load(104) >> uint(2); + uint drawinfo_base = _93.Load(68) >> uint(2); + uint out_ix = gl_GlobalInvocationID.x * 8u; + uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u); + uint clip_out_base = _93.Load(48) >> uint(2); + float4 mat; + float2 translate; + float2 p0; + float2 p1; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + DrawMonoid m = row; + if (i_2 > 0u) + { + DrawMonoid param_8 = m; + DrawMonoid param_9 = local[i_2 - 1u]; + m = combine_draw_monoid(param_8, param_9); + } + _285.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix); + _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix); + _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset); + _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset); + uint dd = drawdata_base + (m.scene_offset >> uint(2)); + uint di = drawinfo_base + (m.info_offset >> uint(2)); + tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0); + if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u)) + { + uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix); + float bbox_l = float(_285.Load(bbox_offset * 4 + 8)) - 32768.0f; + float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 8)); + uint fill_mode = uint(linewidth >= 0.0f); + if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u)) + { + uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 8); + uint t = (_93.Load(36) >> uint(2)) + (6u * trans_ix); + mat = asfloat(uint4(_285.Load(t * 4 + 8), _285.Load((t + 1u) * 4 + 8), _285.Load((t + 2u) * 4 + 8), _285.Load((t + 3u) * 4 + 8))); + if ((tag_word == 276u) || (tag_word == 732u)) + { + translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 8), _285.Load((t + 5u) * 4 + 8))); + } + } + if (linewidth >= 0.0f) + { + linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z))); + } + switch (tag_word) + { + case 68u: + case 72u: + { + _285.Store(di * 4 + 8, asuint(linewidth)); + break; + } + case 276u: + { + _285.Store(di * 4 + 8, asuint(linewidth)); + p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0))); + p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0))); + p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate; + p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate; + float2 dxy = p1 - p0; + float scale = 1.0f / ((dxy.x * dxy.x) + (dxy.y * dxy.y)); + float line_x = dxy.x * scale; + float line_y = dxy.y * scale; + float line_c = -((p0.x * line_x) + (p0.y * line_y)); + _285.Store((di + 1u) * 4 + 8, asuint(line_x)); + _285.Store((di + 2u) * 4 + 8, asuint(line_y)); + _285.Store((di + 3u) * 4 + 8, asuint(line_c)); + break; + } + case 732u: + { + p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0))); + p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0))); + float r0 = asfloat(_103.Load((dd + 5u) * 4 + 0)); + float r1 = asfloat(_103.Load((dd + 6u) * 4 + 0)); + float inv_det = 1.0f / ((mat.x * mat.w) - (mat.y * mat.z)); + float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det; + float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y); + inv_tr += p0; + float2 center1 = p1 - p0; + float rr = r1 / (r1 - r0); + float rainv = rr / ((r1 * r1) - dot(center1, center1)); + float2 c1 = center1 * rainv; + float ra = rr * rainv; + float roff = rr - 1.0f; + _285.Store(di * 4 + 8, asuint(linewidth)); + _285.Store((di + 1u) * 4 + 8, asuint(inv_mat.x)); + _285.Store((di + 2u) * 4 + 8, asuint(inv_mat.y)); + _285.Store((di + 3u) * 4 + 8, asuint(inv_mat.z)); + _285.Store((di + 4u) * 4 + 8, asuint(inv_mat.w)); + _285.Store((di + 5u) * 4 + 8, asuint(inv_tr.x)); + _285.Store((di + 6u) * 4 + 8, asuint(inv_tr.y)); + _285.Store((di + 7u) * 4 + 8, asuint(c1.x)); + _285.Store((di + 8u) * 4 + 8, asuint(c1.y)); + _285.Store((di + 9u) * 4 + 8, asuint(ra)); + _285.Store((di + 10u) * 4 + 8, asuint(roff)); + break; + } + case 5u: + { + break; + } + } + } + if ((tag_word == 5u) || (tag_word == 37u)) + { + uint path_ix = ~(out_ix + i_2); + if (tag_word == 5u) + { + path_ix = m.path_ix; + } + _285.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix); + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl new file mode 100644 index 0000000..c11e21b --- /dev/null +++ b/piet-gpu/shader/gen/draw_leaf.msl @@ -0,0 +1,316 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct DrawMonoid_1 +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct ParentBuf +{ + DrawMonoid_1 parent[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +DrawMonoid map_tag(thread const uint& tag_word) +{ + uint has_path = uint(tag_word != 0u); + return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; +} + +static inline __attribute__((always_inline)) +DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +static inline __attribute__((always_inline)) +DrawMonoid draw_monoid_identity() +{ + return DrawMonoid{ 0u, 0u, 0u, 0u }; +} + +kernel void main0(device Memory& _285 [[buffer(0)]], const device ConfigBuf& _93 [[buffer(1)]], const device SceneBuf& _103 [[buffer(2)]], const device ParentBuf& _203 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup DrawMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _93.conf.drawtag_offset >> uint(2); + uint tag_word = _103.scene[drawtag_base + ix]; + uint param = tag_word; + DrawMonoid agg = map_tag(param); + spvUnsafeArray local; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + tag_word = _103.scene[(drawtag_base + ix) + i]; + uint param_1 = tag_word; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_4 = other; + DrawMonoid param_5 = agg; + agg = combine_draw_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + DrawMonoid row = draw_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _206 = gl_WorkGroupID.x - 1u; + row.path_ix = _203.parent[_206].path_ix; + row.clip_ix = _203.parent[_206].clip_ix; + row.scene_offset = _203.parent[_206].scene_offset; + row.info_offset = _203.parent[_206].info_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + DrawMonoid param_6 = row; + DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_draw_monoid(param_6, param_7); + } + uint drawdata_base = _93.conf.drawdata_offset >> uint(2); + uint drawinfo_base = _93.conf.drawinfo_alloc.offset >> uint(2); + uint out_ix = gl_GlobalInvocationID.x * 8u; + uint out_base = (_93.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u); + uint clip_out_base = _93.conf.clip_alloc.offset >> uint(2); + float4 mat; + float2 translate; + float2 p0; + float2 p1; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + DrawMonoid m = row; + if (i_2 > 0u) + { + DrawMonoid param_8 = m; + DrawMonoid param_9 = local[i_2 - 1u]; + m = combine_draw_monoid(param_8, param_9); + } + _285.memory[out_base + (i_2 * 4u)] = m.path_ix; + _285.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix; + _285.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset; + _285.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset; + uint dd = drawdata_base + (m.scene_offset >> uint(2)); + uint di = drawinfo_base + (m.info_offset >> uint(2)); + tag_word = _103.scene[(drawtag_base + ix) + i_2]; + if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u)) + { + uint bbox_offset = (_93.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix); + float bbox_l = float(_285.memory[bbox_offset]) - 32768.0; + float bbox_t = float(_285.memory[bbox_offset + 1u]) - 32768.0; + float bbox_r = float(_285.memory[bbox_offset + 2u]) - 32768.0; + float bbox_b = float(_285.memory[bbox_offset + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + float linewidth = as_type(_285.memory[bbox_offset + 4u]); + uint fill_mode = uint(linewidth >= 0.0); + if (((linewidth >= 0.0) || (tag_word == 276u)) || (tag_word == 732u)) + { + uint trans_ix = _285.memory[bbox_offset + 5u]; + uint t = (_93.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); + mat = as_type(uint4(_285.memory[t], _285.memory[t + 1u], _285.memory[t + 2u], _285.memory[t + 3u])); + if ((tag_word == 276u) || (tag_word == 732u)) + { + translate = as_type(uint2(_285.memory[t + 4u], _285.memory[t + 5u])); + } + } + if (linewidth >= 0.0) + { + linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z))); + } + switch (tag_word) + { + case 68u: + case 72u: + { + _285.memory[di] = as_type(linewidth); + break; + } + case 276u: + { + _285.memory[di] = as_type(linewidth); + p0 = as_type(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u])); + p1 = as_type(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u])); + p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate; + p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate; + float2 dxy = p1 - p0; + float scale = 1.0 / ((dxy.x * dxy.x) + (dxy.y * dxy.y)); + float line_x = dxy.x * scale; + float line_y = dxy.y * scale; + float line_c = -((p0.x * line_x) + (p0.y * line_y)); + _285.memory[di + 1u] = as_type(line_x); + _285.memory[di + 2u] = as_type(line_y); + _285.memory[di + 3u] = as_type(line_c); + break; + } + case 732u: + { + p0 = as_type(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u])); + p1 = as_type(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u])); + float r0 = as_type(_103.scene[dd + 5u]); + float r1 = as_type(_103.scene[dd + 6u]); + float inv_det = 1.0 / ((mat.x * mat.w) - (mat.y * mat.z)); + float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det; + float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y); + inv_tr += p0; + float2 center1 = p1 - p0; + float rr = r1 / (r1 - r0); + float rainv = rr / ((r1 * r1) - dot(center1, center1)); + float2 c1 = center1 * rainv; + float ra = rr * rainv; + float roff = rr - 1.0; + _285.memory[di] = as_type(linewidth); + _285.memory[di + 1u] = as_type(inv_mat.x); + _285.memory[di + 2u] = as_type(inv_mat.y); + _285.memory[di + 3u] = as_type(inv_mat.z); + _285.memory[di + 4u] = as_type(inv_mat.w); + _285.memory[di + 5u] = as_type(inv_tr.x); + _285.memory[di + 6u] = as_type(inv_tr.y); + _285.memory[di + 7u] = as_type(c1.x); + _285.memory[di + 8u] = as_type(c1.y); + _285.memory[di + 9u] = as_type(ra); + _285.memory[di + 10u] = as_type(roff); + break; + } + case 5u: + { + break; + } + } + } + if ((tag_word == 5u) || (tag_word == 37u)) + { + uint path_ix = ~(out_ix + i_2); + if (tag_word == 5u) + { + path_ix = m.path_ix; + } + _285.memory[clip_out_base + m.clip_ix] = path_ix; + } + } +} + diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv new file mode 100644 index 0000000000000000000000000000000000000000..58dde4387fddb65154909ba8a846f351b6115758 GIT binary patch literal 20104 zcmbW833Odm{e>SSY0FT?Ql?T`%1~yCj50&9fJG*mR1B|4+Qz0yNYa)n$Si}3sE8tn z1Bfz+AP6`Bf+#2|ia3KJqJj!4Dzo7Cy?1wd`j)Q$TK`86XYc)+&N=tobKgzVwiTz1 zt<_ejty~*d+p(?I_^er55v5jJr8cI~&zgPI?Cpj}y0_bD=N)udt=4qZXLaH_=-X)v z>%IMSwIgXCq1{begB;q{q?ybZTO0qaNPiEco$9k@cg{NOuvxPY?CcsG>>M6h+Sk+B z*W2II+1)$T(>2n&xJSQngnim2&mD-VgYm6|)=j<6 zg^hxpz1=V`|I%rVoz{E$SNCcG*z3T{L6#&x}3(53{-3 zYU9aWZ`5hy8r5oZ8n~l2fqr3quydq7Z~CGcd*ItqTN@4Dxny97RcXd=K;PBC9ko5^ z=hufjT_a#4|6%kj7kFIb8qe$NoYgllr`~sP|KfqJ`bh6U|G_gGzGG^~(Yp@Tm{T9- z>KWHwJDz^OzPer&9#>M>F@06>siQp4lmQaX8ny(4)@^R zxH9^C-Inxrdx85qM|%5u8Zl!VxxH@ldwS>1A6aJfni}W+(Y>?Y9AtF&I2SE4-cA28 z=z}9&eKEK*%;WxW?HE-|qcP*K)%*JgVn{Cc&d5-`f4DLv*UD9{&N*`iP9C+5Yt&f< zo9kR{w`$bO)@N1hjXga&pXT`7D-C^4Z`ZPMG;_Ll8n)q)de?$jm#uAM4||_1H=btR zHLCIaH=AdGV{P0sqvPpl%wsKVTqtMHIyE)-X!F`yyQ#UC`#T-ovhl2g)?ByjB5lpN zOvL6n(%cKBy*b`>v3s4|r?F<;*Q;z^!&ck+*s{M{Z5vd!?)peQf!=d(MQ`rw{TIyz zJ0~?iUF>&z@8c5$e2novS^hrlsChFT<(bq>)5z1f!O$CLRU@{o=KbKuoD1pe^X7ru zYaY^r^`W|&oLi&D9a-;d@V44w`rgWR4t?o+UJJiqIo?sb7(P6|bGU1$KGMa>;aTCc zfU{z@C0dS(#@H^yo_g1V2S(=i4ADEbTj#=&QDi*V+^Bud+k~Dfkhl}jq|#r_FTo6 zoyB8nuc0;Wa~(%x+iLABLi2jG)m8;(pG*Lk*J5G|-w=Li&%%MBrS9!9HMyUKJqvRi zjOv=ydxnMvyw=_e`n%TXjU2AgMpX{+-r(-;=Gx5^Gf1!RTyWzks%}Krb0`{{TD~;A zvG(n?)59sQ{oL?5O@2|u8BU|dCGgR6q|z=&o7>yh$FsiM`_Hjlht@ocTzR#&+Kp-z zz8$=-O5g;ams(-xSVaV%}5XLsgBl#Ely5we!Kf{fj%h`I=+gB^7Uux4m{%#Yd_&yBW?g zG}1!?^WFv@s%&?N`9A1bqU}C?tGPVTur{MXNCMf}EA zTE^`7tf$`;TKrb3v`q`mXT5&Y3(fhfZBuBDRgKFyns0oi?Nn$!%k|r>(0qQY?OABf zQ*GZu^Lec{v(VP6w1W!G=d^ya3(YlBJG{_*POH7S&?Z*eafRkHS--axn(M1}VxjpA zR_iV_pS^1H8(RD5^EKBe&s%-`%wlhtH$K5uV4yP%`<$#csXKdzUWd5rfNR?bK6d}7~#fh}9F<{6K9x$%4Mw~jy0 z0_!?{>#8NM&xNwCe5#50IIbBux{mIVwOL))Vk*(bi`T)&aZRE5nGEy3RV$k@*4H+L z7;}nkQ@I|lqgdUhkN1t5d8Wd>XXNTzS9RU@ezw7)%{{MHHpl2bm_l2DW~|(E#ZS0v zRNA+KJIB~Lb{q4afVx+uJGOmjiIX2lGk4++0=u8&+7F?5W~>YMzE*QRo%0d&=2G9I z;-1}V+K;NRG0Arf*tMNZ7SDBUlWQ9_&U10uwEurvnd5(9>w5-xPCHi59oJ9IxyU^S z$~ApE+`XlLXJs@0M&!yFVP5ZjxqfpiKQ;5p=T+Jy_(@>*g6pFHP{q}=W*-FCR`5Et z*FIF))%5=`z1K_K{PK${E!V{Jh46;|rIpQ^$$up{^K?E}(Z&&w_`Bi8d%xUAuTR;W z|NUs%oWEy<-0^*v-rQ>DlRr>t+8?a2v5v>R{uIr#FZ>zsngxFz?0uKBP0kv0q%Vj zUWa#7-1*LfuUyy{z#ZpAst$sufF0{eV6U5+x$KwF_{{SDyb!GBTIl})u>R`tzXI$> zZta`kE72Q&KiIm)%O8T9&$Te$kHO|s*ZwHI{#h%1ca_vm-zRf@8E#DYI2O`+;j6*R zxvvl34Er`@_FVBycdvVH*>+*&Ea4daV6MG@;(B`h+$a9b%9`PR&nS6U!QDr`>y&oi zmBYOshYN20#V!1tf}8KW7Jfm&jlZnm`d{C|Z)xGTw(z@J_~R}7iGo}2$rkSWW9Fy* z*%tm>3xBPJ`>vRH$M5@M$-Ng!?mJ_+^?au*x$ku)_uVesJ@0#6$$hUY`ROg(_qy1< z2QDhO`FyX7-SPNNS90IyN`7O(&FA}E?A{Z;)0N!!xsv-%S90I$!X1zAb|v@SuH?Sk zmE3o`lKXB~a^LSt?mJ$%M|>Z!c|Jdb8=?TPIJYqO4Ljy$#j zu$nf{8~N_ka9svz6KLw%J)hK!)jmQ~bN#&k7STMTtgpXZAMs*(&&?M(BbU&7R@y&D zuBG(vr1{bQE_&_u8RscrbKFQRzek=*Q}a5!L=O4MG;T&t;Pz0U$?y|pLyY_K-x?b^y?dk;A4?RhQFdY=n+z18)1jnAjK#^&-o zmy12i_1~R+r{8;NtJCz+rjK#zsr^2%b4u>_gUjo8A>8YypYd{iye3{B*VR7xJ^(J) z=pwkX@0bSie9^YYJVDRj_k3|fYpwnxyR(6pqa~9@nh&$(%Y`0Wp1AZx6bWq zH1*W`9N2pIR`>Mh!QLb4H&pxl8nAKNo@b8N(fe^c+ODOkIUaH9e*xTD{{}Sm)c+#5 ztbZfi`s!y^W4Z}!oHlcN4ZcM48feRPy1CN54%dU_v3(h=rtKDbd2F|Vy}sIRrI%+< z-ws}#W{fs{^i}s7X5RMy>pFY|TkAS}6-_;3_!`)}ncvsJ&QCvU%Js3Pd&sr6H+~HL z9W>W5XT_bBmis`PTDdRXL#<=poqS`_zd>7xX6?Hwp6hxK{BD~2JoUc`R=>X5Gxvhk z^l{I8o8FImM%%Y&YR*HP+}{JY=KemKdiKK)z-sOj&%p=je@J_PHi@R)aotD%Bbv4& zs=f43rO`EN`V9Ut%{dzDeC7Hj=3%gTR-idfV}3$2ChIHLFEKv_8?z!UF+T$vvlY!c za{UtXbFeXOw8Z=ZY|IRrF>?L9j#)qZtluxet$XcPXzIB(kAT&3ZGH{*&-)ix?HHQ#e3o7<<9Pv`@jMSM$MYgwd&ct; zSk3Wd%rAq@nR@>QtGzVFhlAjKLeE9K(um?HNNGSS@Sv8hO;r z>wA>@(QDWNcJJBy+#E~q&tUv$U#YU&XD%y)&5`HkDquC&&V4N(Lo=7T#m0`sJ|3L? z>HC{Jc~=9QH}}r!U^U0;xwaTL~I<A>XN=}>jB5HNW?OJM#_izc7`KP3m1EoiZk)D^ zaRylZ%3_Q=!X0CI&%O?>o-ytO&KS+%7}fO47%tTxZ+9(q5X(b~FcYM#;JJj>>T^DLVOc8ycF7p^^f;vHZ$_eAzmAKblE z#w~=a=M3!!FQBRCIuC%&r_Jj(mtHNkhrw#u10&$vPuk+Q2&`?8c@EMqrm3C9^^>1m zX-nv>GgN7be<$2J8P~hOYRz-A`Lm@{;Hj-Wv8RHyrS`kQYN>r1SS_`eg4Gg#2Dq$! zCS2{x#`-pEp9N2C?TI}btSzq>BW7F-2aJDQfW;Im+3wI%Lq za2fYGxZ3gd)wn+oHddQw$hGu-JVUfyLsRn%5hwTc;BuY609UKivd%Yvv(DNR`$e!e z*W)^RdCsOUfpa$91TN2*o8j7XHr)bNE5A2<8E(#;L$`t#(A0Ck-v%~Tn`5|vr*YV+s`!17t z*~@cZ+M9on{xRAmw46)7t+XrXU9aEKXT7vNPE&Ke#6IIsqJM(sHT2v{?%!8>eEv}R zxR2uVN3i-ihhWJf6^aR^%KDVqWSv+&(U|FKTq#9(Jy&l1kbFv^LmNqSll<(eYw(dPrm}s z9yHdmtLf+19g8)rof@x#%Nno2%Nn@X)KX&wcxo7H4K@9evkkmAww#OYaJBMW?11O{ zzV^h90c#5%3odhwgR7P2;!5!3(w^9r!P;`~tpYZ`dhWed!RpS#ztfc`el_)C9;?IE z-2YjRHQ=eOJ+W(owS}(*E_1C7S1Zq>3Gn37p4fH3+A@!cVDqbI9_xbDorixHEjRu` z#=0KZ`|mK$bkCdh;c7NNNTc`R257EBc@As{SHF;*YWyUyIn+~QBXC(`GF;8ZA4avT z#>Q&2@;sOVS5J+pU~{OSQPtf9Y;A4Ubu81so|*C46t16ouBlqaxf!?|=jL!Vo1bzm zwm@?&%e9yeSI;=N1e-%WHMRnmHMWMU+4!^ET5N-0jdCrvg{!B=c3^X;XP(=Gt*tF< zu>-hViy3hJ)Uz+t;{Q6ZwZnG;dtE$RUE7_(`luW4?=-08%-#*GR<6(PaP|1S{=fL_ zfu^4Gc2BTz>WSM6Z0>Sh_J*s+XP^J#v#&9<$i zJiAT;n_oT8t|73x^K#G06F;n8%wq(uRz8Cl!Bbm%Vi$w8g)ae@xlV?wmCxX%@Z{2- z*mr`pWghPWn_oTiI0dZkJjScj68~=XVjic#)yj8>)8VPDJ+WtiwOPk==1j1fbxx(1 z$M)^CB$r7dIqHrVlKbF6pJt0nLK;M8$_z5~}M*Z8|&ebkM= zk6tZv`yRNQ>-XWtXKp_L>!WV`1N3T%{~_4;%;861wOwiYKS-~pzu#LP0{fg#{Ey+~ zd-TI_wVc5}0q2=+tU1;6OU|Ey%kK_9gR3>4;mx1h{2YD>t-R-c0ay1NcWl1|tEKj@ zz^QFcYpdy(+K+(C+Kz^{w+AQ&1r2l{ZjjJa9R6z zaJA+$zxsI<+}hiwAB6sSS_{x2u^KtT3b!O)P53N)_w}E*8GmptoreA7511@X-6|UC&j?k?AH@N3gdA%O~2HB30&5G1+Mlo zEzg#JgT2SgdA|x*Pwm&hYN<`B(c0#;wtw_X?G@lz9t$pOuLM`~-_PjdxwkUdvnh2}fvcs?s^HWy-a2afrOtS8S!Z>) zn*S~*b=ClTrlroBaJAG~3!FN}TSra5@*TvVW0w4diucg_vq$@kYa+2-taJYUeqFel z`#nDEwfJlRSM%Qx(l&{vW_@vTZw&VL#KNb*+o_#*>I z)bFp((oMkX&fkCIAW!@>a2NS9pH1Ov&c|_W4))`?v~5OHb6jHgy1!?%1$b3rCV`WC zI+}Xw%3bHg+sAiH@N2{--&Sz*O)K(kji#QtZUZ(>J$beTJICyS?ZDXs`a2di{Txfy z*j_%4n$FEWIk(4FuJI0VwP}paeA?8E@2KL((*K)pg1gcDw_#)P^>=vvjH6j&W%^g? z*QGE0{9Q3WtI~EQzrS-fzQC)2jbDS_e@mu)1DZ9mej9;}DX+(5xO#jx2Aiw&nF3eO z8cziqH-(nCO~BSO&iTvri+xkDwOyaA;bvg<_-qcgcImSPTs^g?gN;*9+?HT#XYJ+s z#lAJz+U4K3ZUa}3&$eJ|mp^>cl_?z_>nyPw46ewqOHvlh+$hGHUi5p@)O?=h_XqX`tNWZ$cWma_mzF&Hfz`tI2b=dk^7>ux0GgUP z;xiLm`n&f^epyKeshiV1Opu}GOiJ+8*Gb(oMbOTChi#7;{0(QFj z14$7ys6A2YHB@WuOs9=4a}xBt@TZhgph%q?s-9v+Y8`8-ZBMVYclQ>=v7MXV&D`Bg zZ>HbO_ulvY-uHd)&%WK?nhFfYs|%jX|KJZ7E`N2ZF>y;vUxNSuaEJl`hrJeRF4T2U zOQE80@ERN-2mn@Q*BdrOx<#|LBi=^UBqX7vjv_j2{|*ha$Qcf^+d};T3_Z zfQmxpWd@f_yLWLR^m(1Z`91K$=kq#)&(z_^R$BCyF$@3}h(J3mY%0jfLa(nych5QG zg3TpYt5+VLTi+mY9m=Ij%Td|$O6k5&mIBL9hM@%P>vF>*VKLRvfu5!O_kKu z>N@ww5%ry%(ARYQsrK6;HN^W*ggh0WFgvi(0w+lnp;A!4G>YS&M*V~?XdRK~ca1+g zYAWfb!gjUgFV&k8!TLvcANGVNJYTCRozrIZ5IJX#yh6{BgRXMa2O{K~gYF8}onGRu zDnXzf`j(m8I80W<7Rag@e6tTPr}2$mvTVlKI%AZvI3F;KeRfY+r4FxV@rq$=g2F4O zBUutwn2wx)Lp~hFY#bwDojVn#kj1hGysEgRCQ!MU$&6_}Kn~>Lu{S6k+}gA-qU*ict&WJZpcSBm}Mgl-Yj8+X`!M>Ql5z%owvJXMNK)fuPh+Yw@NZwYMPP?lf?UzzTF?NSEa5f zmlUn|m_j2^ZXXQ>F)N);pn7ZRCI`q+)*-3{VsGz7$*08*pjCz&)_SZg!F8`08bk$L z`UB2Ss1Jiw5Now35FclKG7rO_bO2L?E8(o48ZnuXm8&BCDt|h4N(3TYj!{smARakO zl`890m$KVxUkyA$rIk8Bb_B|vVO*U|DeaL-dst&r6-dWiUE||jH%?v|nmBW^=f;)( zi39??`(SQfO7a)mub000`S!-r9dMBrZ4V3_=^yua-meWbw!S!cG`qazU`q=P?t39k z(-8;}+QM4rb$?e_BZ(_Atf{-XaM{g-HNCH-3^s1sc^7Tpb`WX35 zki5nqg?+8jZ@(%X*A z(@3VjTiT<-x7m>DM0X|Ujyh!|Z;fH&H2LWm_7{r05hO2qWnY+OUo%d7pJ4Vm9Y0Z* zHHJ6%uTvQfz{9W?QJDQg^!EpLUbcR#2t|i{ok{tzCp8zYZ`m(JvB&*l!Wn#a$KQtD zyTLryfC--*W?xK{U1)jsX67T9(D`OTh)X}FS@!&=MODoYXCd|lCI01m++PeH`|dyj zV%P6|1M8wETMqv5tE1Z7*$3^xsI-Fk{_dL-XTQ(eZ4dgMDJBMOe;vAn>*OladhzKg z+mcV^e+Ha8&;9-(OZVl>owLpGvw3aY9!A2*%@;~|0G%YoYts_QOyUW19=$aLjK}R%m17U`SbWmF4X;V4m7~L|C9q02<{!dLMkpQS7JRh z4CopP6cE%78vu+Cv^gzMUV8k|B~Ur$dqwNWF%+>g*w;z%fJ5d_gzdWPzr zo}s$ZZoi#YH=Z@GQ?S4JH+hErmprp7My=he)+`Qaj~wHPZ>}Dn`X~5>KYrnl{|0|p zX#ie=?^XnwlgxhyLZG6>V0o%Xbq);gsl3`|-zI zvq~bjaFK7=SAVIClv%#ig~>)2h#AEuVgg8uqg=Eim4VqvQRFBmU#(c*DtAO;YZ;NX zR)1ffwZQ5XS?Ib@qA-fHB86s*9p$K90!2D4Tn~yv>g?iR_=pcKDOnX-|075Dperpg zODY9uC)iiJWY}30gl?43aNUnRwH0~k;KCwN8?(j)E xg5vnmt~>Z1vcoU#OrekaCC7)w3)^S#L*((GSTLVH5tN)56TgEJ@K<9>{sAn7e|P`@ literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl new file mode 100644 index 0000000..8311155 --- /dev/null +++ b/piet-gpu/shader/gen/draw_reduce.hlsl @@ -0,0 +1,126 @@ +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +ByteAddressBuffer _87 : register(t1, space0); +ByteAddressBuffer _97 : register(t2, space0); +RWByteAddressBuffer _188 : register(u3, space0); +RWByteAddressBuffer _206 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared DrawMonoid sh_scratch[256]; + +DrawMonoid map_tag(uint tag_word) +{ + uint has_path = uint(tag_word != 0u); + DrawMonoid _70 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; + return _70; +} + +DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _87.Load(100) >> uint(2); + uint tag_word = _97.Load((drawtag_base + ix) * 4 + 0); + uint param = tag_word; + DrawMonoid agg = map_tag(param); + for (uint i = 1u; i < 8u; i++) + { + uint tag_word_1 = _97.Load(((drawtag_base + ix) + i) * 4 + 0); + uint param_1 = tag_word_1; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + DrawMonoid param_4 = agg; + DrawMonoid param_5 = other; + agg = combine_draw_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _188.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix); + _188.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix); + _188.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset); + _188.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl new file mode 100644 index 0000000..759267c --- /dev/null +++ b/piet-gpu/shader/gen/draw_reduce.msl @@ -0,0 +1,140 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct DrawMonoid_1 +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct OutBuf +{ + DrawMonoid_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +DrawMonoid map_tag(thread const uint& tag_word) +{ + uint has_path = uint(tag_word != 0u); + return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; +} + +static inline __attribute__((always_inline)) +DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +kernel void main0(const device ConfigBuf& _87 [[buffer(1)]], const device SceneBuf& _97 [[buffer(2)]], device OutBuf& _188 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup DrawMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _87.conf.drawtag_offset >> uint(2); + uint tag_word = _97.scene[drawtag_base + ix]; + uint param = tag_word; + DrawMonoid agg = map_tag(param); + for (uint i = 1u; i < 8u; i++) + { + uint tag_word_1 = _97.scene[(drawtag_base + ix) + i]; + uint param_1 = tag_word_1; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + DrawMonoid param_4 = agg; + DrawMonoid param_5 = other; + agg = combine_draw_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _188.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _188.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix; + _188.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset; + _188.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset; + } +} + diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv new file mode 100644 index 0000000000000000000000000000000000000000..d6c6fb75fa1dc5d7d65bfa5c1ab890d474b0ee68 GIT binary patch literal 7140 zcmbW5iI-ee5r-d{CBqh$5E3AX2|@%JCdus8rRjmRq-O-J3oPELgKJ z%g)Ho%0{w3WyNOXQz7b)^@Ain8>K6z<{%vYHzC2I5dY@SFPEY zDX@iktTlFPbqZ-HJUx%yaIB(+=d3{0t|XmCVDBZeO`@zbGx;#FC z%5EL^yz}~dE(aSE9$>4|Yc%V5%)&gk=Q34qOiuOYjh>lt&WG-mYEfk1&M=1-Fy4j# zF!)Tb)(nFuZywLz-cePU=3+*$Ra>og7$Vo1;Jr?@)h!K?%X7_VaeTaeXwEj0uj4t} z=VFW^<9vVed2?px_EXe%E^+xRk2h-bwj$@EuyuRY+WxT4_x8N7pH@$i*SWHu%73wW z$F=2mVy>Q{d>D_>mCx6+1gz*=Op&%?FBfC;ohW7@wHNg+ z#qJ8uX;`E0%SxMP*l)W8Tg-RA?b6aVQSDVD&^z}U@M5la%#79$b*>|dZyZ`PM5a*@J22VWn7cK z^#-%Ig3bF6X7A|JkDR6tWk=DwQ5`cCF5eZ$54@_SRh;bQ%7xY+Z(;oAQ54Ij*Y0={Rqmv`SY zRQGGo_QpL}`~5@#ArI&O&awA9+nxQRo%ZaEkEHv04xd}=)9ub-XJI(k4V$h{$Jor_ z)-7J|blUdCb&a=n@H~g>{sG-@l>B3G--qSUF8uex{nivL$9${&`Z3oN^F1_HKi3i) zNw9$u^PLGlWAyVa2+a3W%=;UdZ>QL@1oPb#i<({6^~6>|5$jtic142uPKvEfF!wKZ zb%I%|*fj~}dnmRs!F&hBHYb>GpV;*Y=03%?C75rW7)L!jx?BUF^XEIZ8giD@-w5~2osV7o9dB0T4?WE<7Wu2lX?}I{JD27^2KT%#gY3~` zaKFeGnDvMibrgQ)5F3HQ&vnI|7xO<3xBi&xxcB<_*$*+}^nU`L)~jy4VfXj1_1bs+ ze+QrMPjSD2POJYp^#1zA()z7KEY|T&Nb69y4nNoZBiuTiiN$0x*J`4TS6+gTb*+N@ zR-$=N#Zp>@pZ(J&t>z58r^@1zKHjT{UybgaQWtkE=YJJ+26R=aJO6&6FGo)8YtXGV z>{lXNzq&PE2l-uu*Cu%2TaY7`}_26pYCtSG@rjAQ~gMyoBw#CyZ(tj{h>bn zWT|gwm)^5iLEb&@{SEBdzV3m<@54v^_7d|B-bk11iP(0m&hzUaXWqBRS&uxgN7nXq zndcs4u{Dr+?4?-vk0XnPUPE@RBW3&qvOeOmb{)A2xwiI*-G{8rbvz^Wur-jyw0S=2 zTUf(BzX|d;iMV#pQ%u>jb9VmDj5OiSiu^D1tp#sGe%fc?+U2qCn~}}&C~Nt)b|5kP z`VVrbPebN1RyksO$Z2n8(bL`>K-Wh+Vh$p^Ui9w}vVP*xzr)DUKkX6w7G!Pi+Zk04 z+uM-Ev^lTp(Z9DN+dpyr?ddxq`>4P3tgd_nuKyP1NWXVM=Ro=#f%Gv>+@9KB`y!88 z-i@5@@jd8b%OT^niKS;|e`EgNk9-VD=l=uf;!*zxk*&j+`{5sgjzapmhPpn=$KmF3 zy${2!SMH2_6#g;DPy5H=+U2pIPavBkX5^E|VlgA?AA!tetn#19{Qz8g5Q@Eh3b}u8 zp9T|;^*)2_de4-1?6b(uxOmLX=a7xl7Il3dS^Pfeuk8Daa6ji*+ZP})YgCS!zJ%Og z)0e@-qo%JQr!{>Q-I~OsrmrCzr!CIq>&Tvqwm7?QAd7o8C*bN~dl-2V()JKsJ#3F4 zpMbQTgsaD#eiPX_HAb60<`MT?W8d=UJDYD|>+i$2!NjA6?;x8u_V+0AQAj`6RM*Ef zohy4SH+~rYUC2I0&%Re;aW}MyrL*c>#aVqHxxdCAfQg?h->@Gdi#hYo!H?m7&VjZc zL1NaX9Jzmn{3K*t)cbQ}@!0<<_; zzO=-{XBoP=QlCrE#bcH(MK(@6;x0pWz392ReqmpM?ArD>)?SG$9zHKXcJ0(>6}ot= zy&BoLRZzrTj_lfTcIx_t{R(8)_MBqv7b1&?&x?>lgN`kX`#qDAv9jSv-8!A-i_!vmRYM)_y6napDno4YF$+=Paq~=Q^JI21xtAm<#1} zo-RQ5I}dW6oR9M%XXrwB{N~+=Y_6#NT4enmVWu|0H$!6H<%cu>4_%hM3=;R<7PmI@ zh)15S$YP;ik8Iv&$!m@qATe`<&yC2b&o*?io1pO7j;v4o?z|aUYzGv%Z$TE{3Ejr- T+~3O~YjWLfkUsWJ+`j!AnWXVL literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil new file mode 100644 index 0000000000000000000000000000000000000000..4ea23f7699a63ef9a4dcac871e1f234a5ea5b0ef GIT binary patch literal 4468 zcmeHL4^R_V8h@MJY&KyNHV`!k(v1j3;p_qyu)$9L854vyh_$rRYmh&UCujgmLECFW zVnQ)usGtb84aGWi&enK!wDgY`2r5$K(0UzIw4p|7J+xG>)%I+AZ=uXMox7X4xw*N! znZB9td*Ana-~0C4_g?n*CYn5zI_HMw&LPi-iuxgDT+eA9L;wIA#Q^YNp8+KcN?_Fa;Z0CYesq?QI8h=)VrSz{WN zeV87nk{7>>*vVKf5fP1TU1O0f<)~f`QSGIYCFz88xgdE(oz5+x)!Mo@Kt!@Cs1vyx z=k^+uIab%G?Mgj1UyJ9!V%gdvT>$*1{i zW8?1YDq9I363@j*P`@<7!mfDzn8LUAj3lQgcwoYjZBHe|GJWJ@5{;dssA>Qb7!fY1 z@Hsacov9{gjR}Re@PI*7HYC~vH3;3r@G3mKN}5;Z7L`vM)ze0~k;euM;a}VnR;A!= zGV(SK;bRo9YAVtwf*GbF$Ka5M2V=?tBADl=43o%1q4V6bn57y}mBGlPTjqogH1MKt zQVLk)RAYik7BGD_jeUgxVGje_d~P1XQLq+l7TIG zAO(OLxYt#1{%kbMt>uJk9+Vb_RLaqV2|L=BZ>~glmv0uCE1)A|0qKj+k{~^v`i6wIDs*UUI=p^{XiK z<3&QR%$!R3P!Mjf9R{URLVuphigd_c*B#pOMprh)l+}Waa15Mhiu_(Hy)f&7K{FGS zxq5=Zo|{L9FCFh0KH7WpYX4Xqq0q!xgPqGOJKEZ4a1Q_7t%n%`O_HJvZA)fsDGJ*ngY7}1 zE!B*U3j-cOWZP41%V$5aJMv-^-KlbXzUJ|`v|ZnfrN+{&?$NH{EeFpvpUmdhKU7q` zu+mgeR9un0p{Tetm$R3Qcy(+3xslbcjO3jgX$rilaeS`Y#d~~aaO@lHEG?TNE!J}l z=4k3Vee#O(!#CK9`G*(6M1>okUB9uoqB5704)>Ma-)z{Y5SFOk(Wnv$vZPL(Zuh%X z7G6y?ud=0_90Q@(DbZgnq7+gU-P1j1usy1T;ZBCxA_veVed;AHbjd)qYonRBnc&sv zoMtm`lNqh5cA9Bt#DG{d?iP(Qq3fB@%M+prgD8dSopVVRH7uLcQxILkyd?j?CLdBS zK7?j`V9T(h@_`9Sm65mE#7i?dE1X$|JTIci^;GekFi~H1zTPF3p{`4kwdL1fn z@;?9{dbd`77L|Mbu09#>c{5r;IV;U5`z}VOxzG7rCvQ_rcSg_>x}Fx@G>AsMBGv%q zGyrnAc3cX#ej4@tE8DLWd?yuRgP!gMInhVcKE3f=D~aJxc?E>cx1r&O!4o%`7hE{! znIU6yeCVfb8*a<9agOcnJic8SNL%*e=Y=&b4{K3tipiVVyZmErQ&eLX$YrCPb~&mjfLE%?@0}j@|1O_FhIvORB?X0*k$5i+`E03i zlFYQ{{~npqr)TaQOI7;z@i`M&6fpS>LO)*H62DF@)`N}Zy2_5rw>}x$iQm>fKBna* zJ7&h6FfDjX*zguvbhX%2%(Z96$D7pNY|3l$d&)05WVhZE`foq`*zoZ9`TR3|{UcZR zVagfF8MEF`?&>;}c4R(s+D`uQ*h1H-x3kt~8qdo7mAlm;F$p`?ZHh@KYzw1{`zG>( z{bQG|1&{ZS?ZY0Olbo?DIqjjYjLsvA(%$Qu-_71{f8mB-xZz*nh5{OZui&`}$Cf9u zzkBu`i1&Or zG)2B-L5euS>*~qU;E5efcq8&r{L1$Hh%?)yG$3k~i|`p?c^!Pj;6D<-)z>Af@dglCAbO>DUDWctsofC zlJN4~P50X#oZzHNvkA-WYsO_$*k@=OVG9Vg;T(@EZCH$AK*`koK#Tflx9zkYbOwc0 zy#P^ASBxLWtfbG2Bt@ZcjfmrbjHHxF z;2q4b;ChI)2DlOpZv2>nOYQE^R6lL3IKnzbd^-`Su`=%M{`J4xt`z@c{mEuT?StO zfV(d?|G`LzTjZX2sQ>hOA# zhgH7T=YW!%P#%K|;W{zLG`bPBAxx`p(&z(fZsNh8@RN33Ky3_j=v(aipqf8#8b8&m z3#!dw$Mt)9^;gxxxzqS*47!#uJkx_^j@$Gph;7snmXp+jW4)!m|Qhc Q7U2zCNH%rf`@hBC0Mz2sqW}N^ literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl new file mode 100644 index 0000000..b4cb7e4 --- /dev/null +++ b/piet-gpu/shader/gen/draw_root.hlsl @@ -0,0 +1,108 @@ +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const DrawMonoid _18 = { 0u, 0u, 0u, 0u }; + +RWByteAddressBuffer _71 : register(u0, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared DrawMonoid sh_scratch[256]; + +DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +DrawMonoid draw_monoid_identity() +{ + return _18; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + DrawMonoid _75; + _75.path_ix = _71.Load(ix * 16 + 0); + _75.clip_ix = _71.Load(ix * 16 + 4); + _75.scene_offset = _71.Load(ix * 16 + 8); + _75.info_offset = _71.Load(ix * 16 + 12); + DrawMonoid local[8]; + local[0].path_ix = _75.path_ix; + local[0].clip_ix = _75.clip_ix; + local[0].scene_offset = _75.scene_offset; + local[0].info_offset = _75.info_offset; + DrawMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + DrawMonoid param = local[i - 1u]; + DrawMonoid _106; + _106.path_ix = _71.Load((ix + i) * 16 + 0); + _106.clip_ix = _71.Load((ix + i) * 16 + 4); + _106.scene_offset = _71.Load((ix + i) * 16 + 8); + _106.info_offset = _71.Load((ix + i) * 16 + 12); + param_1.path_ix = _106.path_ix; + param_1.clip_ix = _106.clip_ix; + param_1.scene_offset = _106.scene_offset; + param_1.info_offset = _106.info_offset; + local[i] = combine_draw_monoid(param, param_1); + } + DrawMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_2 = other; + DrawMonoid param_3 = agg; + agg = combine_draw_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + DrawMonoid row = draw_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + DrawMonoid param_4 = row; + DrawMonoid param_5 = local[i_2]; + DrawMonoid m = combine_draw_monoid(param_4, param_5); + uint _199 = ix + i_2; + _71.Store(_199 * 16 + 0, m.path_ix); + _71.Store(_199 * 16 + 4, m.clip_ix); + _71.Store(_199 * 16 + 8, m.scene_offset); + _71.Store(_199 * 16 + 12, m.info_offset); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl new file mode 100644 index 0000000..9ee8cfe --- /dev/null +++ b/piet-gpu/shader/gen/draw_root.msl @@ -0,0 +1,140 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct DrawMonoid_1 +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct DataBuf +{ + DrawMonoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +static inline __attribute__((always_inline)) +DrawMonoid draw_monoid_identity() +{ + return DrawMonoid{ 0u, 0u, 0u, 0u }; +} + +kernel void main0(device DataBuf& _71 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup DrawMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].path_ix = _71.data[ix].path_ix; + local[0].clip_ix = _71.data[ix].clip_ix; + local[0].scene_offset = _71.data[ix].scene_offset; + local[0].info_offset = _71.data[ix].info_offset; + DrawMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _100 = ix + i; + DrawMonoid param = local[i - 1u]; + param_1.path_ix = _71.data[_100].path_ix; + param_1.clip_ix = _71.data[_100].clip_ix; + param_1.scene_offset = _71.data[_100].scene_offset; + param_1.info_offset = _71.data[_100].info_offset; + local[i] = combine_draw_monoid(param, param_1); + } + DrawMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_2 = other; + DrawMonoid param_3 = agg; + agg = combine_draw_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + DrawMonoid row = draw_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + DrawMonoid param_4 = row; + DrawMonoid param_5 = local[i_2]; + DrawMonoid m = combine_draw_monoid(param_4, param_5); + uint _199 = ix + i_2; + _71.data[_199].path_ix = m.path_ix; + _71.data[_199].clip_ix = m.clip_ix; + _71.data[_199].scene_offset = m.scene_offset; + _71.data[_199].info_offset = m.info_offset; + } +} + diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv new file mode 100644 index 0000000000000000000000000000000000000000..e6a53e55ba4055a4339c14c7a99f86a510b0d3c2 GIT binary patch literal 5440 zcmai$`I^6HB!5mjl=9LOwP`1yR#tXg;Xjn zwX~?Lj7lxb%r15}!|ZBTyZqc=q1N-%?ceJNjDL31@tIeIQeeGJc(`dJL?#OfdvTf)l&Sq}+XWP-oHrVZ@KA&PQ z_4W2r-^=yVSP7Zk%f9SVd^@V0>bB{z{EoDJM)9dVb6LP6ejB=xVNa9Q@nr9v zWtX!3*wsCI;Pq!WgQxaZrfQSbPK`u+-$pa^!MWvWGsayK2VgJ?Qhj zh&=B>-$21vO1|UKuI~)YLlz_QqW7_WnSSe&XTJAv2I3R;s;-&a_S5~Img4_N_9^IB zFd6P?6|1?PcqwaIe+cnek={9dMUmq@D4SVOF=V&9%etY0}ht($+3u-6eaeiEXu zDDL5v_=b=pD+w zLX274y^bLE*SsrJtt-Ih`IzS!-;9`Rp7piO3(hXE97bV=^A@n2m53Z|^MZ3FSdQ}$ zb*=)-S%b*YHZM3=gXK6k!FelKj^6+|+UAL!73c2XRCx#5SyMkxKi-9YH{xUbJ!s?V zxQF+GUE^{3a1Hu>h(7P(KdhnsPQ2KEkJ&vl%&J&1mL;QOH+v*F*ym<`vHvmO0@#8~wDy5vL6^tuW5{farz zHZM5iVAm%z?If!D)l#6z5<9TTaY@wt2z%09Z~5i8=7Mf}EHGZSywqPS}$v zWD#ON%!~f{1JwR`U-nDeTxY`h@Ltt%N7LZqo$SNb7w0pkuQ+qgN7Oq2w%+wf+}90Y z{np=));8bo!$IU0WC;>`_z?JDvOU|ah&(yBp>IcSM$B_fZF9ta|6I%ceHiWe)V>oR zMc;w=7{3#3TphJP26l~jCq53=7w?4jM-bPNEB+U2--TB0M&fMufP2sO34Hpa-Y3D< zdnUaD_kw+A^go8r_wqhOp0O7=zaQKB z`r}>xJXoHwxPvc%y#r&efQ6E)XQ%WTY$)Y1g#zS`b}`}c^<{5|Isuv-^y|KU2tZ;jqT&i z7<&xS=Q+f&_T%8*wZDT;f6T*o!TOw&=-2na_RCoG^!s4_aRzPAA9dBnoUWB-gF>{_o>yP@s0>XzXrRXn4{l-&CwqhmNN$l&Y!??Rw8n=%?r+-!E)vz z!TAeV&Kg9Hwt2z%D_Bkm3C?q1Im3t?ZS%Z$XTZBtNALayF81p0*!p6h#`MXLxpD^n z%jqsa{JlOOdjaAzfLP-=^#9N&p^JImrO)xmM)v1SoKV0gg5@tp`}^MbQp6gbaT)p) zM2`KBbDWy|k#ichYZY^rW9yH(I2|lcfACg-trvHzZC=EOz}B`uQF|452#K6Cz}7D2 ztj5+KwO<96r$2aSf~_6B*ETQWuLfJ&d5GF;!TKZTHDGHObI!unAGOa0%hMmcbzp19 ztZADU@%3P940|*G1CJ+M(T2xfDwoQm4D%z;jI>a^uL(o`) zqQyBl(ob81ScMkb1frl~1;v(rTE9TCrC!?twJmsTd*2-rzuxbhd!Of=`_K8~?1yCU zwf65F)>`kh7LnwqXE0~aqmAbMerw(*9|eZ?zuHAY5TqsPMZO6u1)~va{{#C-@Vef9)sQt`9*%U|iS2JaJwv2$X?V z-+~g7{Mk8#LTmu2p+7yt@-T#fAYVhDr*IRp6NDroNqE6%l3#X&}SKa`599EDTEuBK9I#Sa>8u6;`tgwpYG94sz8d) zhshx3C(vLgri~^d*E7q9h(`*abPp{UB_2seWH#W4arHHkJW7w5^1j&ZJxKO=2 ze?=M9C+uV?e0@_|XY}>XF&I2cZ8>a{l`e8eJiet{OpG?1EuiGm6*wbhr<_qP$7*V< z7-QM`fNAfvPD~$JlD@3+dhGf@KX}s?L`wgBX4|8edo!}$rYE$eo9vV4#EaT+V3M2X zFH@Jx;Za!>truVbM;C3PiUlSsw|tlRQ2EVc<_hEUea6KFBh~k%R{@you})pCxpW+% zBme^j>rD!&V)jCb``*vwK6T6tN!A^ypBtx6#ve*py)#NavSdqJWO`t^-gt6%v`1*V zRn(7@<((4r8w;xs_xjn6UQl{o%w>&=ynf+(J>t_&T4=xKvc3^{{lxcD$yiT@SbZX| zVgxO*(`0s96#xChVGh$ne0(=rDoSJ(iA5D#izx8SJfF*2*Tvcf--y>XoVL+K+r+1> zGqN_g9=5n1HpplMpbP!}FR;xb+BO-jqznC)P1`(7cMQkyYz-Qb-<@`iCRV*)Fuf>xC%PAp|bU7Q1`Lv)|SrAEYYYYjgDZ|im zB$0ZHDrX42UyqxSp#@ZY@nS3haHe`1mtGr702=d#h& zI??jL{qp1bHRj_{t+j`*msOwI*W7(;F1vhnTfy9b2IgYlS=g?Nm&}b9nAy|cES;V| z-&Y<;pZrY=v-D?@g(^l8kw7$^VUuOxXkd8=qeCruCI8@kY9FJ;y;Zah>yiZ~vTfYr z)#f@nB_6Sem{{5M>t&C2G+fzT*07`UQTIMV+RohC!^`s@PPl(RAG<#x@T;J&sdBSk zf4!`Ecbx;UtDJlP73}K=RJkSpWb-YDiU!HSK%%Eh-?gs~7?xE7r`_(aavy#f^!1_T z++80hBz@fM-fCHA>@_hT*qL@dYe2>tG_ju8SwnoUrR#g=KC<;9yuK|QqtnYq=?bz>b~Z#N7j9A&COT3%esNZt-0*{=6=z5O;IHbb+0enH?E34V0OkTMQ*i+IBi^YgUC)N!zBRY~E6#cL@r&nyiDmKlbHuHbfOutx3Z0H5`p9sQNf8i6=y^!h2+>nX0&bdY!tM1-JLc`)%nleMQ>cJ&$ z9?^V>^QB~#b@Fp1J5R>gf{0vwICeaDe+NavUvqaU^t*ERLwNV! zHFwX2vHd@Dcgz3eZfEW;li>8ZMonJaX{6}CR}OdDC*1y9=5QuqV-stA|L-}R$k>~x zCdktBFjntocYDrHdEom=x|w zQa~}*BVu-`Ga!SNYnXQsW+G!<7h05Cw)@tHw<30*>)gG54m&TTZIiRMIpocr+cC8K zNo((u8w01Q(%`tX_hwwgFT^#UPP!0xX)=ri?wD82@;cvH4f`a0zH-qpTJwfTvrjtqC(k!m?6;WI`sbg#ngS6sk3Vbd zPp_IxE^1sjF${?iUBgIpEuW4msC>G1C`oE+{PytM3hIuSkB5___Fo$Yw>#d9#u_-L z=FbnU!lj7T^iG9!oXo@VG8r>))O`9KJMogQ*N(q@wi`M}|CHm>uJe;>=K;@uTxgnR z;FN}A{a-uC|JV(qL69)i9ns3vACLocPvRc$-~~nm6vC!l6wmTFj(9jO|5zj zNzKRDA6rl}?xR5|@gDJJ@Abu9@yBHtTvzD_yGP_+SUVc>8w$ER*;Tz>gZc5$!N!UkS zb!#g&=MQ!@>r!J%1|RkX`=^u)Azp1LwE?Bz#UjR99BD&K&5i~Y8lJ(e{D}kdx(uwN zEiI;muG?y=C}YWC+YD&-8wA=mNU3HAdArt*mYBV!D+R`iHnku&D59Xq< z*#8UUYdR=u92-+a> z{*e&$jziGUxFAPE0u2zvnxhT(PwCOjuCb+8q$IoVg%Q!BHh2>w@^Fmk#Q2CD4JtG| zgGcxgKZ2l%^;BU@Nr!G<2MF3~@qR`KIx{{Z2b%6^NR-eRU*Eu-r)^P(Z7&%jsTO6% zkhcuKiPNny8azAK=MfrBYNTF^A;o2@8F`F?xSS5rRdr}vDKxI(A$}LBd2H{LaXzoi ztEEn%MhUD}pQ?TH=1ZfJ2Xx~2>Dn#-tR-U;cJ z>_Ch8)taZX50}i|pErBAq_ik^{7tD1_2y0-5yb9ndjEu~YbafDj4TBwF`t1a}8 zul5wrC59}mSRA$M0+z@pZ^qQSSs%*$R|$$m)E{spF!;5;m}>>SYWf2?l9*y4FRQs~ zCUIKYkm3@3*+p!ZoD!<8AoULNJ=chR+E5AGi(>AU`+t!>wVbIJO+;lUC{xX4^B93_ zYZ}86qlwEE9H!_Z{02cif0ug8)nzlN1rGI|zff<2qM9XM3hIf1g(RnUJ}O&k&QSXk z-s$nOMNc9_6ew0z`~So~ceN-T$!mNu;@8b(Y1H)_3=Lfkb^^kuE&Il{=?5kW|0mHTW}4tqeM z<@0>VqvNs68anY5r?>>G{JJX&zoxemaV+S&z5pxOD8q{6{}za2>*-1gqmWx0fz9Pp zWKAJ-FT8eTQH}k%CExP_S@@I&W~aaHq#c?cNXj>0K|~&M5F^(XRB+3hv|7H-ZIiom#M{*Lr-(` z9bjs{!#r1R*loyKLyd?7=bC_B>aL3|wn+)wCp_ux4PnV~Fu8=gjCuyu5xVdRpC}s8 z?*^X`3&6wDGct=-z^obwHws8y5DKGHGQcUj?bX3tSjH9Lc}8i~xel1la0;G11eUx` zyn3G7l;eH_bf-H@BLy1+tp>thI5^qqFGt*W4@zZWguk@!at&h5#;_}#0b&p}cS47{t8?E|%z|nQ;@XRRl|Mtc&F>_uAS_e&lc1_T3GavF()&Z-H+i%sj@E zrvq(yq$A7SHXNTm=)PDs=`A7yt`rwr+7EuEuX2;9I=TG{F~wz!`4RJ|e>jct#G(Po zV3l*Bm!Fzf`ls{7H;bM~km^2qi1O3jtRFW-cgN^_-F-2|e%Xc4$|}D(s+^`l1zF1T zNf^r?vb3Y~g#MBtr}Ze7NLJEap1E(n`!X9D)41H`)3|8VW!>j%=OAHDvsY)oPsQWL z%|5fK)`Jt~PaSEH8>`&KsuSyuBN47SU(p+3-YcgZ57LtO8zB}sYO0us zwpgpvJEMM{CO!&Q`W>5r9MB1*3F@og*$lUnkE8Bma8{h7`~g&LHU>ALZkU z7xDk*U30@s8fIA$V=AVwS%|Rfe{ksmnKJ?(8h#I11Rl4Aj+%Z|ye`|1d^ISW6#H`= z_>nmp(ox?$S!5vq)Qbq$J8H|hNx0rw!o||654=aX*b**y>$bS~sj!OF^Ksynzm^t0 zUkP3*TM6DbH2YRpCwQOj1~~Gd&!Lb5;K;^(q$ux72-Bj!N#>x@QQTmGh?wIKR^_3K z$@{r;>A>?uYkb-!%_pD=PRLs@eBfBK}v- z6-+%pK)sZFa%1=^llE{oheCx3@9n>~sv`n9)Ln|yTM92yKeBo#rMv9hbD~uj8D?v| z;p5i&FhfdfRM?YQLg|BDDuZRh7*-uSav-7ujYi*Y>*wDeludibCs)P@BahXFbhI1A z!qe=LrGvS#r#8iu)K9o8hGOmJ_u-zwkKNw%4K;{M1$rFiT*GT-gVTrbrclT71s&}k z=|ZVwND!*6;&4<3J>fJG8v@JG7#1_B5A4172MZXuIv5O`f59|o)@GCTPPa!P zP0C;sHu&?AdzA-FZ%Td0Ci*>%bC9j=Zx%(Wj}U_ldsAjZNMI+4Bb7%ExfiZV2tnq| z2*$=2=m%9%<|^`iVu4Z(S6X`m4M!q+VVqyx-6lV4SA9qx=??fiZA#X$PptJmh70Ny zc7-OC7d8(S`X`JuXf#!BWLM@x>PFn{g-%0crymook7)`Vb6Ih^-Df%8r595m$xA{{ zrf#|`w0142yz!E#5Ui{)5%z}1(#2$ssj4QII9Ya3Q^ zx7RdG;%+xKe1~l(O2yP{J7ltH+>+nyi*n~I(L}9go-Oq|eCKOfkGu2$E*uibRdz*d zI4>}KtXFisT|ZkGnFA}Ju43jvda%cj?|f$`f8Wi{>v7hP{TDM!MRfnX@iamL({aSn z$&>Otat!9te5gXTBv;vW&L_q#HDNq@(o%0N=*sZdSG!2$XBK%(!-{Ms!nBWXj*j59 zorp2L<@7%^_8oSS!DbBe%9{fDYX!W2HKC9$Xo9?Jv8s}(PvQ|X9y1!9sOLp^@fP~ zw5E|N$8!btA_QH@M*Zr07AY4jqvXilV%~i7BqvWK3Rz9{W8(!~L?Oy~x!Y&TkskKp zp4mrwX2bGYinMG)B%`&6EQl$=BaUqJPpQ2#Z>DE(PURU>8S8YWEwobD%#P=GKM-jlDc8=UHrlcsM!5cQack-E@!OJSo z(f=9NdIyp0)%zeEdNRgpWgYEkhER$@I2_7;}GS)A7 z%V1@<#Z86%@XRA$z=ry0?yi3>vMUp>S$kgVOv=T(&X+qM*PlJV)A^{pd7c$kJO{Eu z4P=GMkQEk=aK~$jQK55~rPqC&fdd&(skWDO8DJh>xSQjdrPMmen789LYpb?3F=UA~8 zV#S!B^P{RmV`G+kpFevf<@{OG8I$(u$4j*dFb5?`BHf}am;>RX*R1KZW7}fob-I_W zUz)qDXZrtxIe5OS#We?ioNaNeEhcT8+T#Wi&8F0=uYO4O({Xet zB<;bXCoQ>C_zgN+mcjP++FmuE@;L2({dhU6bJt*k%mt zj5{X|pf*yA@oWn^eZh1zVnHn0NX^;}(a0_Av@>A;fbDEvY$_n4>*PnT0j0ebD4VQ- z)Rjw;%daj--2BV3)Xjc2|GX~664DN_8=5Dx#8?AiVkN^sixjYDW7x6`Ji zPzqPzH`%m}>^w^0f(#@(__Zmw$R6c_Y@D_-D3^5bRkuRaDf{Yys_G?3A;`wzEya)G zL(>N3DHY3F56azUoSyX*TWnS=lh*cerLW3+IMQHw4d=HAUf58!P&)9Q%Fq=%7E`k2 z{7ZNJQ(7j}uk#FkIHjkljP+B*n&5vX#^Ms?`Gr0d4})A2!t-H_^@5MmGla8fFP&~< zTK}f8#!!NL#Ioo_sOBMt=Z-X3_f&CeRfc@K!bawW`Hx}^mR0*@k%k7rD7M?}X#;;x zm3y_SSU;dBFe`={f;Y>n@|CX2 zw|=pVvLsixWQ&Z%C=jzc#eb&>M`>Z+;7T3sdWo=+8YZVvc&XTEuhr6ClPNr%HX?Xb zUB%6SsNJu)#y+-GTa6dp9#F84U2iuD_f|2tqP)PT3}NCEuBy1#u4sd1;%IH(^>)U~ z!on>h0)^g1y~4C+L=cxheGKE852}jUl(AkuI?^C!48#Y`N6#;Y{mDvX?bRe`G%Qx# zWuoypNmwWTwG3~Iv1YkS@J;`A2~|2v`pjFy_X8}hEkfd#Z>&XVFa+0L>eld~z)siO z{B&dGb(5<*p5|7%(dCRiKnsn1n+riF1+b32TSmb9; zxqX`=ep>^la6Co{md+b3G1FwhLsn&Ju!W#5Z6&Bx{Ix*M_As$~&;0^hG_M!b;>}Z* zthL67qkx(mRsW`)Rvi4?s+lEVvBR)+|P* zo_yZzA2Dr{Ytr!`jYo^Ws*!O<=Aju2m<-0V$mH5lzOibP-9Nv{ZTLV$a+!vefxF#d zWr;897GUp+$~Jg@x)fa`Dw8F*`dynkTFA`{@gq-3S_J!Iv&gk0e)8dWpHIb0Jp1Fb zO&*lsa-+wE;KSr%T0I`r=kb`38`j`qRZ^$cze0=vP8zSRH?w)3#Tr-j@HF_V=2v7t z`P;yB!;2@R&l6tq44uTgUserP^Cy^zh$T!c4N?``-9~#Yorf_q z#^=SCkJbuo?S>HHuQfW=lgYHP-i53Y?z=^_15-Zj({HYA?hk~xia*1Zfr{}Ws|@NAcipa!)m7G6 zuUgve$EVp9Tl%(-G^nQyD7KQ`9%(Sa!?qB3u)xD?7^HqQKhClK&P{q$o5)RfrOS1U z)(PTFv99g+=eQDDqpq{N1jcLl=q^h+U)aVuJwM<^(oK`+1vVqIR=PUsA2AtIJHZ9y z0reqZiWm7T&F|;z+k-!l_sOff$CYh0GyS5gHDbsp^ zm=kz`AifO5brsue?VSYiB_IwR%irFVsmhrN#HPOx4@%e5LGm+9^EqioiiD0Q3Y zfVI3e#t5hBfPJR)d}j14jP;w2dwNfd$~5YsgBKB4_LjGIONXz2CFR?;{DOhE5Z)TW#tgT9o+-i;Q|pOJpgrd zr$ew|jMmEFpp60{#}m$pb5AKOJFR@`dhDXyLXbHQw12{S3vjwK@LLqhU1E+irgN$^ zQ&Bf(wx~!$FJnOvlksxWFhn%U4C;PtxM4tF6#?^|1zsA;3-lkuPS}V!meQXJ3QSc@ zHe_MDLPtHel)MqE#|;}~lnwvx{;u&c&i-_1`9`eLqMss^%7z7@yQ&_^O;_9Xup0Cc zL4mPq^=rgZC`v`VF!UKD#-Ro|140Iu$ovq{Sj3rr?RA7lNm}hXp(~2(Xh-I`+-FUk z9a;}pI1iE!F`fyGd&k}9js8fm!yy(No@KUF9s&>0rI<4Cpy9|3pHv?)uE+AF~*FSezCQEt0135?Rh`iXPpU7u(KYVK$rR;L-x280Q?{TsG~<3 zM0=~Qc(%aib0cU#I_tFK-|(RB=(EQ`#Sz2{lFiwRJ0aQF+GbLvvA=+JUl7_|fH>G8 z5}uA7gPw;(&!xZU`$5l6(6^^84fJi=I0f_-dHu%5I-Y~R_3fbV&`wp1k}_7?VaQR% zbPzpiICCOASn1Wq=^G(PWCY2LlK>bZ05!W*F-=gt+_(rm?Ca2T@&yU_QKiSA?09-g zZM~Tl8EvNUnsk*OKd8}R*>Q53oiekw9`_oRc|}I|fFEg&lT(ZtDJH-^v#_4oSy9m! zDLXxe)qpkCi@c(uJ1KJbCa`sY9m+_l2kZq4D>}MKx6|VXi`TFTu(GG7ZU1-oca7KL zz>m-!|A!wPS^1ywBY*HCQ~zyy66J6+_mKN7BtiEXL&|V;KfaBmQI37jJ<7SoPoQ)$ za_IdEQm>_*YSvUSWyC)V++Z(MFG+~HEn{o%GIHXhZaEKrS-#AN@mQ(zAbCN8`nJh+ z+@ITRpQ+P254shzD5H#bWyuBhn`)@MM(cRwY{O{&Mq9ggxG*sss^RJ?CK%KFeuV@g z0reRs=xRsl!bHhXgIIezM*I>Z4?|xvTZRAbo|*L_mvF9^3=)~mj~8Yh#j3BiuZZsg zR(xPJhQXOc=VF+E$ zI=%fcni9ZhbsADhz>}g}I0$q6YoKSf>4aG949M~`)_2iNJV1%{6H@~x}(^TW|-1|_`L&imJUdDzM4 zjM6vui^HX%c>m5SPQJ>}(XaTOTLi3(RUEC#0H;z1ClG-S3+ZatYlMk!;0=qB88h+bG}(+a_LWrA%v9xpjk3UvM$aAU-734=J6#oT z;}R_+z4iHH)SdohJ+9ZL#v6*6|qCe4P>hECo-Rk6+HkFV9ClT7*o8zr|P-i}9w` zt$Lf=dmG(+*18$*Quf}Y>~EuVe?s|tBjvrxv`Znh<}O-q2ko^X+Kxe5MtBE5yz@M( zD4*@d^-s5cDHWHOEi%c)eoqX3#pYw9y|R~v8?0~lD=4h%?d3sxpaO;?#!(tnz+OtilQuGqJg6-jtYaKS~}Fg zV=AMB{`J+Wn68k~PVY_|e&60fRd{#L?hxnXL-oDvpady2XJ1v%k;(~0x zPa`!>UJziOuPSt3QLtYW3Afg32}LJ>M@MP(>aGmf;~3qi+_hBnL(%0^c$yS+brs;A zbQt|;j$1tGi#jdyGaeJz+8-uU6uNZuHKoM7(3Az7oAkNnz@pPM(#R|D|aj#aXVtyNB63-F#PF8&O-SiPIe^ z+oN1XifnlgX}3glm?ApW5xqluZbj2?~Ut~n9x_y9tSvp02hVpAz7{ zBm!*A1aSF?13Zxcf8hYD2(Z420RQR$3kmR$1N;X8=7$pC*TR8whJgUrRukaq-UMeU z0UpRAI0w4}xGRhRFODE|ZYIE@_XzN)1N=Gx*5?r5Y62{aBf#?_3GfX9oPU4-PxB!- zuOz@74)6g7IGzBncYr@9z`YKzkN}H|39xZ4F&m8pSe;0Kdk8S)J^@x9A~-7vaL5}3 zc!&U(e@1}$DFk@VBm(@{p~evc{2>9ZQxRbEe1h{a4*(lqBfx);DT3L!?zYanGU^i% K@q1&}^}hjWLyOn| literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl new file mode 100644 index 0000000..0a6c022 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -0,0 +1,1303 @@ +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); + +RWByteAddressBuffer _297 : register(u0, space0); +ByteAddressBuffer _1681 : register(t1, space0); +RWTexture2D image_atlas : register(u3, space0); +RWTexture2D gradients : register(u4, space0); +RWTexture2D image : register(u2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +uint spvPackUnorm4x8(float4 value) +{ + uint4 Packed = uint4(round(saturate(value) * 255.0)); + return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24); +} + +float4 spvUnpackUnorm4x8(uint value) +{ + uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24); + return float4(Packed) / 255.0; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _310 = { a.offset + offset }; + return _310; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _297.Load(offset * 4 + 8); + return v; +} + +CmdTag Cmd_tag(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _669; +} + +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = asfloat(raw1); + return s; +} + +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) +{ + CmdStrokeRef _685 = { ref.offset + 4u }; + Alloc param = a; + CmdStrokeRef param_1 = _685; + return CmdStroke_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +TileSeg TileSeg_read(Alloc a, TileSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TileSeg s; + s.origin = float2(asfloat(raw0), asfloat(raw1)); + s._vector = float2(asfloat(raw2), asfloat(raw3)); + s.y_edge = asfloat(raw4); + TileSegRef _826 = { raw5 }; + s.next = _826; + return s; +} + +uint2 chunk_offset(uint i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) +{ + CmdFillRef _675 = { ref.offset + 4u }; + Alloc param = a; + CmdFillRef param_1 = _675; + return CmdFill_read(param, param_1); +} + +CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdAlpha s; + s.alpha = asfloat(raw0); + return s; +} + +CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) +{ + CmdAlphaRef _695 = { ref.offset + 4u }; + Alloc param = a; + CmdAlphaRef param_1 = _695; + return CmdAlpha_read(param, param_1); +} + +CmdColor CmdColor_read(Alloc a, CmdColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +CmdColor Cmd_Color_read(Alloc a, CmdRef ref) +{ + CmdColorRef _705 = { ref.offset + 4u }; + Alloc param = a; + CmdColorRef param_1 = _705; + return CmdColor_read(param, param_1); +} + +float3 fromsRGB(float3 srgb) +{ + return srgb; +} + +float4 unpacksRGB(uint srgba) +{ + float4 color = spvUnpackUnorm4x8(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + CmdLinGrad s; + s.index = raw0; + s.line_x = asfloat(raw1); + s.line_y = asfloat(raw2); + s.line_c = asfloat(raw3); + return s; +} + +CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) +{ + CmdLinGradRef _715 = { ref.offset + 4u }; + Alloc param = a; + CmdLinGradRef param_1 = _715; + return CmdLinGrad_read(param, param_1); +} + +CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4)); + s.xlat = float2(asfloat(raw5), asfloat(raw6)); + s.c1 = float2(asfloat(raw7), asfloat(raw8)); + s.ra = asfloat(raw9); + s.roff = asfloat(raw10); + return s; +} + +CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) +{ + CmdRadGradRef _725 = { ref.offset + 4u }; + Alloc param = a; + CmdRadGradRef param_1 = _725; + return CmdRadGrad_read(param, param_1); +} + +CmdImage CmdImage_read(Alloc a, CmdImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +CmdImage Cmd_Image_read(Alloc a, CmdRef ref) +{ + CmdImageRef _735 = { ref.offset + 4u }; + Alloc param = a; + CmdImageRef param_1 = _735; + return CmdImage_read(param, param_1); +} + +void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) +{ + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas[uv]; + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + spvReturnValue = rgba; +} + +float3 tosRGB(float3 rgb) +{ + return rgb; +} + +uint packsRGB(inout float4 rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return spvPackUnorm4x8(rgba.wzyx); +} + +CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdEndClip s; + s.blend = raw0; + return s; +} + +CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) +{ + CmdEndClipRef _745 = { ref.offset + 4u }; + Alloc param = a; + CmdEndClipRef param_1 = _745; + return CmdEndClip_read(param, param_1); +} + +float3 screen(float3 cb, float3 cs) +{ + return (cb + cs) - (cb * cs); +} + +float3 hard_light(float3 cb, float3 cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0f) - 1.0f.xxx; + float3 _889 = screen(param, param_1); + float3 _893 = (cb * 2.0f) * cs; + bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z); +} + +float color_dodge(float cb, float cs) +{ + if (cb == 0.0f) + { + return 0.0f; + } + else + { + if (cs == 1.0f) + { + return 1.0f; + } + else + { + return min(1.0f, cb / (1.0f - cs)); + } + } +} + +float color_burn(float cb, float cs) +{ + if (cb == 1.0f) + { + return 1.0f; + } + else + { + if (cs == 0.0f) + { + return 0.0f; + } + else + { + return 1.0f - min(1.0f, (1.0f - cb) / cs); + } + } +} + +float3 soft_light(float3 cb, float3 cs) +{ + float3 _904 = sqrt(cb); + float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb; + bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z); + float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z); + float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)); + float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)); + bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z); +} + +float sat(float3 c) +{ + return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z)); +} + +void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0f; + cmax = 0.0f; + } + cmin = 0.0f; +} + +float3 set_sat(inout float3 c, float s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = s; + set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +float lum(float3 c) +{ + float3 f = float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f); + return dot(c, f); +} + +float3 clip_color(inout float3 c) +{ + float3 param = c; + float L = lum(param); + float n = min(c.x, min(c.y, c.z)); + float x = max(c.x, max(c.y, c.z)); + if (n < 0.0f) + { + c = L.xxx + (((c - L.xxx) * L) / (L - n).xxx); + } + if (x > 1.0f) + { + c = L.xxx + (((c - L.xxx) * (1.0f - L)) / (x - L).xxx); + } + return c; +} + +float3 set_lum(float3 c, float l) +{ + float3 param = c; + float3 param_1 = c + (l - lum(param)).xxx; + float3 _1048 = clip_color(param_1); + return _1048; +} + +float3 mix_blend(float3 cb, float3 cs, uint mode) +{ + float3 b = 0.0f.xxx; + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = min(cb, cs); + break; + } + case 5u: + { + b = max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0f) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = set_sat(param_21, param_22); + float3 param_23 = cb; + float3 param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) +{ + float fa = 0.0f; + float fb = 0.0f; + switch (mode) + { + case 1u: + { + fa = 1.0f; + fb = 0.0f; + break; + } + case 2u: + { + fa = 0.0f; + fb = 1.0f; + break; + } + case 3u: + { + fa = 1.0f; + fb = 1.0f - as; + break; + } + case 4u: + { + fa = 1.0f - ab; + fb = 1.0f; + break; + } + case 5u: + { + fa = ab; + fb = 0.0f; + break; + } + case 6u: + { + fa = 0.0f; + fb = as; + break; + } + case 7u: + { + fa = 1.0f - ab; + fb = 0.0f; + break; + } + case 8u: + { + fa = 0.0f; + fb = 1.0f - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0f - as; + break; + } + case 10u: + { + fa = 1.0f - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0f - ab; + fb = 1.0f - as; + break; + } + case 12u: + { + fa = 1.0f; + fb = 1.0f; + break; + } + case 13u: + { + return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +float4 mix_blend_compose(float4 backdrop, float4 src, uint mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0f - src.w)) + src; + } + float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = lerp(cs, blended, backdrop.w.xxx); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = lerp(backdrop.xyz, cs, src.w.xxx); + return float4(co, src.w + (backdrop.w * (1.0f - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdJump s; + s.new_ref = raw0; + return s; +} + +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) +{ + CmdJumpRef _755 = { ref.offset + 4u }; + Alloc param = a; + CmdJumpRef param_1 = _755; + return CmdJump_read(param, param_1); +} + +void comp_main() +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x; + Alloc _1696; + _1696.offset = _1681.Load(24); + Alloc param; + param.offset = _1696.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1705 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1705; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = 0.0f.xxxx; + } + uint clip_depth = 0u; + bool mem_ok = _297.Load(4) == 0u; + float df[8]; + TileSegRef tile_seg_ref; + float area[8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0f; + } + TileSegRef _1810 = { stroke.tile_ref }; + tile_seg_ref = _1810; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11); + float2 line_vec = seg._vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + 0.5f.xx) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); + df[k_1] = min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + TileSegRef _1930 = { fill.tile_ref }; + tile_seg_ref = _1930; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1._vector; + float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); + if (window.x != window.y) + { + float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx; + float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y)); + float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0f); + float c = max(b, 0.0f); + float d = max(xmin, 0.0f); + float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = min(abs(area[k_5]), 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0f; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba = gradients[int2(x, int(lin.index))]; + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + float4 _2417[8]; + fillImage(_2417, param_36, param_37); + float4 img[8] = _2417; + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = 0.0f.xxxx; + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2522); + rgba[k_13] = 0.0f.xxxx; + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2621; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w); + } +} + +[numthreads(8, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl new file mode 100644 index 0000000..f60ea81 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.msl @@ -0,0 +1,1349 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_297.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_297); + return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = as_type(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u }; + return CmdStroke_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + TileSeg s; + s.origin = float2(as_type(raw0), as_type(raw1)); + s.vector = float2(as_type(raw2), as_type(raw3)); + s.y_edge = as_type(raw4); + s.next = TileSegRef{ raw5 }; + return s; +} + +static inline __attribute__((always_inline)) +uint2 chunk_offset(thread const uint& i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +static inline __attribute__((always_inline)) +CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u }; + return CmdFill_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdAlpha s; + s.alpha = as_type(raw0); + return s; +} + +static inline __attribute__((always_inline)) +CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u }; + return CmdAlpha_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u }; + return CmdColor_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 fromsRGB(thread const float3& srgb) +{ + return srgb; +} + +static inline __attribute__((always_inline)) +float4 unpacksRGB(thread const uint& srgba) +{ + float4 color = unpack_unorm4x8_to_float(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +static inline __attribute__((always_inline)) +CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + CmdLinGrad s; + s.index = raw0; + s.line_x = as_type(raw1); + s.line_y = as_type(raw2); + s.line_c = as_type(raw3); + return s; +} + +static inline __attribute__((always_inline)) +CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u }; + return CmdLinGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_297); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_297); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_297); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_297); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_297); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(as_type(raw1), as_type(raw2), as_type(raw3), as_type(raw4)); + s.xlat = float2(as_type(raw5), as_type(raw6)); + s.c1 = float2(as_type(raw7), as_type(raw8)); + s.ra = as_type(raw9); + s.roff = as_type(raw10); + return s; +} + +static inline __attribute__((always_inline)) +CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u }; + return CmdRadGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u }; + return CmdImage_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, texture2d image_atlas) +{ + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas.read(uint2(uv)); + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + return rgba; +} + +static inline __attribute__((always_inline)) +float3 tosRGB(thread const float3& rgb) +{ + return rgb; +} + +static inline __attribute__((always_inline)) +uint packsRGB(thread float4& rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return pack_float_to_unorm4x8(rgba.wzyx); +} + +static inline __attribute__((always_inline)) +CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdEndClip s; + s.blend = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u }; + return CmdEndClip_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 screen(thread const float3& cb, thread const float3& cs) +{ + return (cb + cs) - (cb * cs); +} + +static inline __attribute__((always_inline)) +float3 hard_light(thread const float3& cb, thread const float3& cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0) - float3(1.0); + return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float color_dodge(thread const float& cb, thread const float& cs) +{ + if (cb == 0.0) + { + return 0.0; + } + else + { + if (cs == 1.0) + { + return 1.0; + } + else + { + return fast::min(1.0, cb / (1.0 - cs)); + } + } +} + +static inline __attribute__((always_inline)) +float color_burn(thread const float& cb, thread const float& cs) +{ + if (cb == 1.0) + { + return 1.0; + } + else + { + if (cs == 0.0) + { + return 0.0; + } + else + { + return 1.0 - fast::min(1.0, (1.0 - cb) / cs); + } + } +} + +static inline __attribute__((always_inline)) +float3 soft_light(thread const float3& cb, thread const float3& cs) +{ + float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25)); + return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float sat(thread const float3& c) +{ + return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z)); +} + +static inline __attribute__((always_inline)) +void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0; + cmax = 0.0; + } + cmin = 0.0; +} + +static inline __attribute__((always_inline)) +float3 set_sat(thread float3& c, thread const float& s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = s; + set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +static inline __attribute__((always_inline)) +float lum(thread const float3& c) +{ + float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375); + return dot(c, f); +} + +static inline __attribute__((always_inline)) +float3 clip_color(thread float3& c) +{ + float3 param = c; + float L = lum(param); + float n = fast::min(c.x, fast::min(c.y, c.z)); + float x = fast::max(c.x, fast::max(c.y, c.z)); + if (n < 0.0) + { + c = float3(L) + (((c - float3(L)) * L) / float3(L - n)); + } + if (x > 1.0) + { + c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L)); + } + return c; +} + +static inline __attribute__((always_inline)) +float3 set_lum(thread const float3& c, thread const float& l) +{ + float3 param = c; + float3 param_1 = c + float3(l - lum(param)); + float3 _1048 = clip_color(param_1); + return _1048; +} + +static inline __attribute__((always_inline)) +float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode) +{ + float3 b = float3(0.0); + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = fast::min(cb, cs); + break; + } + case 5u: + { + b = fast::max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = set_sat(param_21, param_22); + float3 param_23 = cb; + float3 param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +static inline __attribute__((always_inline)) +float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode) +{ + float fa = 0.0; + float fb = 0.0; + switch (mode) + { + case 1u: + { + fa = 1.0; + fb = 0.0; + break; + } + case 2u: + { + fa = 0.0; + fb = 1.0; + break; + } + case 3u: + { + fa = 1.0; + fb = 1.0 - as; + break; + } + case 4u: + { + fa = 1.0 - ab; + fb = 1.0; + break; + } + case 5u: + { + fa = ab; + fb = 0.0; + break; + } + case 6u: + { + fa = 0.0; + fb = as; + break; + } + case 7u: + { + fa = 1.0 - ab; + fb = 0.0; + break; + } + case 8u: + { + fa = 0.0; + fb = 1.0 - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0 - as; + break; + } + case 10u: + { + fa = 1.0 - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0 - ab; + fb = 1.0 - as; + break; + } + case 12u: + { + fa = 1.0; + fb = 1.0; + break; + } + case 13u: + { + return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +static inline __attribute__((always_inline)) +float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0 - src.w)) + src; + } + float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = mix(cs, blended, float3(backdrop.w)); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = mix(backdrop.xyz, cs, float3(src.w)); + return float4(co, src.w + (backdrop.w * (1.0 - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +static inline __attribute__((always_inline)) +CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdJump s; + s.new_ref = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u }; + return CmdJump_read(param, param_1, v_297); +} + +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; + Alloc param; + param.offset = _1681.conf.ptcl_alloc.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = float4(0.0); + } + uint clip_depth = 0u; + bool mem_ok = v_297.mem_error == 0u; + spvUnsafeArray df; + TileSegRef tile_seg_ref; + spvUnsafeArray area; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4, v_297).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0; + } + tile_seg_ref = TileSegRef{ stroke.tile_ref }; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11, v_297); + float2 line_vec = seg.vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + float2(0.5)) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + tile_seg_ref = TileSegRef{ fill.tile_ref }; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1.vector; + float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); + if ((isunordered(window.x, window.y) || window.x != window.y)) + { + float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y); + float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y)); + float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; + float xmax = fast::max(xs.x, xs.y); + float b = fast::min(xmax, 1.0); + float c = fast::max(b, 0.0); + float d = fast::max(xmin, 0.0); + float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = fast::min(abs(area[k_5]), 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24, v_297); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); + float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); + float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + spvUnsafeArray img; + img = fillImage(param_36, param_37, image_atlas); + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2522; + rgba[k_13] = float4(0.0); + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); + } +} + diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv new file mode 100644 index 0000000000000000000000000000000000000000..c38894159a01685a4f3a553e68d4dac6c41b07f1 GIT binary patch literal 66224 zcmbWA2Y_8w`Rz}bNeCUJD<$*}(tC%55+L->FsTI6NTDeupmY$B-b9*!(mRR;P(X?T z(iD)UAiYR$@Atd+tjV79-uwG}M>l(|^?m#Mc01+XGc)O!Zn5dBs!ugz)xR1xW7Rt5 zsHQ`ys+p>Ot$NVl9S5&CY4Y$DS6ltNI?P&iwEXm$t(v~-OWjEs+dX=mh7K-)0g#Fkh;5tJFA7Lr}d|^S{OcU-2JLW;hT;fzT@PH;}00I z!-$d4*8Hk_Ol#Y>S_0nV3+~7N)I27SW}zpheOq^b?AJYJWY@u?hfm(mzRgiBgVqy+ z2HnH|_sOkiyT^@gwIj&CXI_V_xgrNO4{%m0=G(`ex6F=JXIai6GpOdCgMwIaOs zYmGtf(Hw)iV;I^!?114D$4}6wFK5Ck)OGv+ID)z}z54Hom;SE|PXAYH$K7=NnDG-G zfc;nVnAZQk)f#Z}xBRrZ4r(3~M-A=n8fJucU)Mp4?|Khw6sH~ zX%P$MVyqhxBV*kdoUv}!j=e+o@ZMup_bA7@1)P4=zU}_1 zc}yE?XQk#?$97L{jZ^M84;h2N;nh~naSmgXgnM0_NGF{e?S18RfT}eQgQ=(G|Bv%9 zZA`^HY(>v)8)XaW@|~eA6+bC)m?=AJjd{zS^EX)V^&$HIJcV zMvNOy?YZGiZkpeJ?CrGNMSs2TroYC`ypC$e-fGI&c4wV0HIHe1?W{(_>4Sc)IhA{O zzi!8xy&4Ozeg0!7>x6X=9cM?yt-Gg>W5DgWoz?hu+}32;Keu|2<@DeU2_-9#h0Tp^n+Q{rs0P zk1Jvx-xG88>eM=BeWCv^qPFMsd;XY5zr3 z*Xryd<~cnvXRj`*W7a41e-X7k2Nx7^FYJjsTXkt2*K_c{32I+2DPsP#CuZO3a_~+( zL+&_YRL^;;=24!fSHf$*)_E!S7(a3JC_k}yuBv&i9>fO@pM#=#e?7Eo#PCsE5Tkj2 z9XH~T);-tj`Ze%6{{K9*X`StL4Z4c)s5gu!zPXxF>dLDMy|T9rFBkR zk3MW@d%kZFbAj-)%q(UPi=lzw7I9JP2cK%+jO@R09$?f zgZ38gTm6xG%#^XM4gg#B2dUfke$}I5?uhMY+qRqg>5;N>-)l;?|+x{%<=ftPm+*=?qI;&?|yxU7~E7!Bs zUgP_+!(I^6pNu>Gc@Diju8!*Y9{>K;tKhMt59w;%yL*;m+UL5LY*+7Mc2OhK(OPVf>^KJ(1eunilUh{1WeVaVuU&^)GODo#zd4o##z(ndd`! z`degMjqSafAHK2sx$Hga-k+g5s`t?*O&o@`v--d|J^k-neGH#Ge$ozuHkr20AEEKt z2LFE5XW)?&$B&)nVV=(FQ}jtXap?CI^^|cF`1IEDY;{P#FKfT{dCK=9eX6R*%dWTD zceXAfex8C7Z~Ceqe#7>gGVXw`_7%ym7}~ausxP+DEx(zo*}x;YkGHPnOCL04m360m zqHSqhF;+Ne^eQ=@XTkpvZrt{5R&(%7F?#GMctbO9Y_70Npwx>BPLGta*21}mj85Zy%od0EK|o2uMHl} zuD2iSgC}+$?8n*`-w0mzbCU+&tnI&Y{I_V>x9Y_^s%_wGL1K4QyMmK8KY1YG9*UaxkY)#1H(M|BiDagGHi&WYeM&MCdx zbyh!W@YCD=J^eqkVL!XU&u#m!l6k(cVZXS+FKPR)8vn~1_A7hwj_PW7#&st+(Pe&&kg=~+rMXAPd4mNH~6z{|J5=N z&o}IUZ}6Ae{;S9T9}W9!4gRmT{~GasyJ3H~!QXHDuNnW38um{b{Ij{}(aPsaAF7xi&t6gU`v={HFM!*wi3^;KnfXg_OdbR7Uru5<+ z)gkc2IRczGM}x~aKkU`6vpTL9@2F0Mx8wAyP6PA#v#WdD@UCf}O*^YId$sAT&TjC_ z8~n-!zplY=Xz)84{H_MSr@`-Q@P`}x(FT92!JldH7aRQL27jZ$-)iuGH~7cBct`a) zJnQo{xLlt;9lg(wjt1`w$G>Nv_HWo{Z19=e{yo>T*&6ma8hp;S|Ef7(=V{pIYw&Nk z{d=xw3pVTvH~1oL|DNmF;tl(fy*SSW@b-S}tPbjB>#Ppx#XG8F;XXWjFTWo=vc`{r z2iMn=#~W>)g!9p^)#e4T58GaM=HL;nyiU#A;A=PddJVo|gKyH{J2v<(4ZeGW@73TV z8+>$wk7@An4SrOEAJgE+Huwn*etv^r)Zmvi_+<@#ZG+#?;5Rq;ZM}F$bq{=Ky;gti zW$Ub-Yw#Bv{N)CJr@`NE@Q)h&(+2;t!M|znzI}V|rTz^*bA!*;;Bz+kJPp1;gD=$J zi#GV;4ZcQ$uifD5Hu(AtzEOj3+~Auv_@D;gvcb1*@NFA>y9VE(!FO)(T^oFl2H(5E zyL$1CYCrhM?)Eiew3vbRmwwc&yb2xn3XYkJISomb$ zHFI-zP5k+UQDeGx9Y67aK@-PMnXu(%?cArJmp?Bc_b9(hJQZ!z@#990Zhhz9uabL= z>l!g;#MlwzCbRB-Mpp0B(kFCJ-fz+f+)<}*>C=H*uebRIsmqSF+O%!ls;d1)_`TXR zW8!i7X^*4+;eqHZbv2Yw`I&;~c5TAsVPj%Q{^`}*y^5))&;7A=j~h2WhQyiyee%Ta zag%C8V$E2`>KZz9{2@KI{;hG&gl#wvHDkw*8$Wt@B2Js1nX$LNpX(V$Bit;-nW(WTi94A zN6$R9wK-dwvuN(NHdjk?MAP~+H(Gn%vWv8}$1)E#-$%FiLTPXJX(` zjh*$s+2+UQIR8)WzFpgfcTet4L(i)pF}Lk~y~&i3_&6ptk74Y0Yp+|U*P3n41^c~I z_MPYAMQG)9<+299yuq)mW8^#UYvAo`VMg3OH*Y~dh}H@vk6hmV-h{&^F=zeQ_(=f%rG z$NVCkxdeAsFM-SPz0%;H!TIKnAB<`D`}2nV%U-;r`UdVhNb}6Vi8*|vxL-9JxOL5P zft{S%XI^x#CiZ21aM_my8hpV9AJE{7^x}Q1#o?`=kKiRp@-2nd?q6rM5_mY@4!3Qc z)f&CpcU0@aeJSEtb}QQL3-0E}cG7MV-0wQWC&SBe9Mp?#Tm< z;Ab}YISqbpFWymI0MA%21CKql%dc2lbA3fG-_Gh9IG3P_8SnLQ-=4cJw}21khZe>k z+@6d3;PHDDd>fV4eOqtEdlIdjyT7;F^o-+EI89o%S;?SdEtfXK$?z&REt3Pa8{TwJCgByv}Nyw!Q9m-)f}XpCj-k zLThe1t5J^m|K@n>z=!k-q{cVZs;v>s|}>^m!uL0rq01am6`Rs4)u z`(2Q>#rzLPG@GcEem)PYaXPee_}+<3t+>8FS`_SI9{D0|+VWk_q#X8b#RB_zB)XqP zg4ZbUngw2~!0Qxv-2$&y;PnfMQwr?oulOHX;A47l zXX`WAOgQ;G>fXp_E%$+kn$PNlX!xO}v<#S4v7u%#_sXpY~u%NLq`Rr5Vb^7?s5ZM8!4bCB9v zh301;weXzeVIQ~Px)7fA3b&1idsnTag*#iHW6E>E&o!P4 z?k%q~x58bY@3eSF>vIFJ702iF;4KR5b8hre1)kka!eJi<6nOOluTkJN3+(e$+DnJjq^EU?d9(GM^1Q3ak_;2#$FxB?$v z;1ddbVu4R8@Kpu&IWB#8qQE}0MfZ6sc)UW|dlq=F0`FbmeG1&wgL#(>_S$3aIq8{wTZFt` zyTo&IhI#oLK=Bv=_i|%Vp9?RyvAR>ME?vAu_U#c@w|>3YdPw~ z^1f*-HRogn>H(BK6yvQ(t*^TAR-#rjo^6fiJvH%sd1*W~ZEMsv+pYy76|PRx_S$jkhax;(5O}o|?AZ zsneG|z;fH_r`_LCNZWnEwpG*CO|8wmL#gGq-It<$6m{C}2ez%6w$ZiCw)=zSw$)Gj zcIL94f$fu;b2u}#nsIDvn-#Es3+%BBD`eaG zYMb#Et8EKn%V+k5h~;N)?G|-xmsi#Po>$3tQ*BTCzrj0;c(1_w z7W_?kIi3&Ujv?)*Bl!#kpAqgo(EiR2*4J}bzF5r%)&7oaIke=LuLMti`C$0$)VANL zZa<{v!)xyKMf<@u@2>d?HTOD@{+@^S4I5SerI{%EcV%t&+}zU1Oo(EIm?n7PE?gUA7)W+c_xA;aIC(bEwwUqds-fp*1{LX-@ zrNr-?w%-QvI~T5&68&N{$C#4#51^fY-gz5@KL}rc#(8_@;g9g^2EDX?;yeVmjegpq z-^=1Urzy)b;{zG_`7CmBrZ$&*?E!4x964>So`D~};Au;0cTc-M^jH3^5lp4W1}@9}V7d>`z-@Y>?~9S7IUFk1HGA;wp?z1(-|YPLVI z##F8Qj^FLH@q(@Ibi`%R{{K_T82>lyeb3MRen({APNQ}%)a<+5&;Dwe-}B)7zh{0g zkT7|+zqt0bzZtvV6*%^+#~pCb*{tx`X zo7nTg{T#hKSBXK?E5mcH#b+$q%-9m+AMm@^Ja+Z)2i!bIeY0uS-p}S9Y5xIyu7dB& z1@60W`+NzQ;k5cJ|0!I}>!@S66im97p7?%_^6;E+&-nSqG26!b!y1ew`%^#n*{3LM z>((3_&!6G2Hm1)azI!gY@14V~eeYay-#dp-!0vnJaBJT?hkHNueRH_iZQmz{Ti;)B z&zCFy+XG&V{l*{%n7Iw$ud*#^epYN5!y>I$nIb46=D~Idvd*yIz-z$f^ul&v` z{0i*8Uk>*^;Ctn8Yu_!G+;_|2j>mV);r3@h!Hw^`<=CA+-z|q*FI8~uzFRKszFQ9W z`OJ6A;nu!i4tIR}7To6u-!aE-|9rU_kDBha^E+X+;`3;_nmXeeeWFZeEROWBd|xA;yvcIV4?(&4U;@1(=6eJ366bKw33x4rMB zW4Ha-g8R(w_o%U3`)<1AQwpxX->=5*bL3eCH=pmQV>h4gsKcFK-%*EKUsG_`&v(_O z-S^btw)Z`CxX;19rw+G&eh(XdE!^)^!?pXJX}Hg!zLyTS|Gt|JxBtGI4!8E*bh!QX z{dBms@2A7HhdUnMRfjts z-&KcuefE3NaIf#auMRh!@2kVDeP11Je!o8rxAvWNxc9uv3a-E3b;fS(JL{7B&N^Iw zzXJ{TIo|iy;f~LD*5QuNch=$7zOyd5@2pGid+TuX`8{j6wePOOwflW*$$fuaa^GKv zTQ65|`|JDb*zK?Hufwfk z&w%?Z=jUHPm&WhMXxe^2EUM~E{$pG9#6Jscd_Nzj-Pv&Uqw03&fNiV30Z%DDlc>f2 ze6U*bTnJY4`O!Z5?D!K(`sj0}ZGD!!q_%Be=lW@_(cM-bor_D+e9m!v=9lXy_IbyA zVxM))xi&M%G*(xD2T&a8^!1Oav$7Y>+!i0-rOs9p{Zx>e+l+;fO`7#E3mP&CDz?w^OgJh*Kqat z{03~k(&x8u_2l~<*nH~AcMsTD+J-Q`4(8!ru>H{H__)H1%uhu{{e`vz;-X0;?JSuhjO>+PQm<`gw|n_7`fqweg*+zk`k8 zJc<8Cv7K?m`gz{J3_h1)&X=g=+B}b61$!Q7dxcuQcGa)-J@-GrQ}D5kwp^3`3D)Le zyBEP~8TV^owT%09u$trk1pha{9`;+?zbI<bh4%eh@v zZ&Q<|dWYK2kMB}Y)Z)$Fx)koCM*~iqr7yE?Tei=9O z`2{%h`8img`TPo;`TP&#DIurYEj^@FSBT$1;p7|Yn=w4D)bTlcfiaWljHtf+3^$IN* zZ2O!`v%=MKE@@MSq!eqWWbtO|B4S^Giwtp-;=rf#=7*tY79X(ej4^l?qFebnZ62J*FO+rj#-1@1?! zp8MI_VD?7pMOlYNM#fXCrXgX5*Tt%_eYt z)Q_*@Yzj7(wwc*|#@-CET4l^BEITZ8pcPn*GD=OgR44Ol<*ndzBvwgo$mO(@!DuBLD4 z+ts?)O#7l=`a1+{e;t?OmFw@`bsxL`tTW#2!Hz%Ir5(U(xh`o_vwhAD_p$l>zGg?T zd-EnbYRt>gc7m&$e>Q5l?Y-uh-`e(6)z17!pX_bdWEZ&l&++kkvMX54bJTmHd``;T z6ysT&Yd7kg$Gg`yYtQ37srRCI*q6O)yLIMbAFwfU9(RG&avsa~pcu=%;{C#urac?jsmM?Z^^$$F_y8#X*(5c zTj#{@gpLN=x9lz7bsYm&A5VYW6F&f}Wxan0cD;?|c;x!nU;A!O>*PEZY|gCBabUHq zw>CA~XFlwE*6nz(eJt1e1h{&hS55@0*&nX~a^syuZGWtt-&3ehrFdvRxwcy;-|67y zb>s{*b>p8#Ef@codOnI{_4;xaIL|9*g5`N$IUDTritTNu-5BRkYcs~>*v|u-U)#CV z@;tAc5B7P*Hrn+05w*Jb0_yCo3&D=v+Oc0u{S%6ZW51-fTW76)3N}Xe)}>&z>@E34 z6k{1%Y}-M^yd3QFM`B$GHWUO%UJ z4If0lU!mn1ZXDa`XMb;@)}FrH4OVj>IWPA2*Kqat{HFFP`}UVx_9(Sla{L{vR*v^YxO#kEs(s3Syo{!vXW3W4_QPjc+iQ>Q)xxHq=SQA- z{t4Iq4~qSJfm$uOUk5k)_b)W{_`Fg3l>K`XO+9^h3vB4i8Y-UDvEl1I>63!yK@qb*T%YWp z8Q{0T%X45xH1+t*RQr^BY-Tj|jAs_G?bI_jvx3d1EwN?;?@K=Wm2vtzGwPYYIlyYR zPrGk{)6Q|q_49cm*K+HOdroYQJJ;X2;6H+UKP|^PH@dcbUY!T*9(3-sCI7tO%V=ND z!F*`y@tGfNj%5juv3(m&J@fP(ufstaa~6`nb0ecP+Sa&F>t@ z^>>Vp*M3^3Uu%QyXV!Ziuv*qzn_4-CjyLPIE;yfm6LUQ@^~78stTvW@rQHVbw6mXb z{mhfk&3^Z-s zux&HHTY{Zmb^9iF{>^PaY;T?RTY+t#x!4-4mig7DR*ubnWS$0tGf#=R4VrplZVOf` z-;ZtwPdoc5*UvneU*D-1)9)pA#FpU}G6)C$QYlj{dyzF5n+f+eVu{ z+f%EHccsqS?FM#!tX;c3sQ09J*sr~6yLHC8H`o|iyM4fFSv&de6k{1%oVHzH+h*3h=TNWp8H(WhFL*=ZmZSB8YANyxt&0(D!!@%aqS_}uPWqq}&**@d9uUX#_;H*<( zjzm*W%u!%9$DDTi!PCxu%JnPP*LP*cbbZHQ%laMwmTPl;$APoHW5IIQ*PqWH4?cm~ zHrmX!Kec-LG7+qnc_nQ%39hdHfz)z8rw_z78SLL8P}e?zT1`BKI{W1yu=8#0emR8t zP>P4+Jgl}`XYRfSHb(Z#_rYq}FY<#a#xl0pw)X1?uzg9aBf-YX-Z%>E-cZ-a{WBG; z?sw_NI2vpmb;ol!wOV2x12$&(4{B}>-+})S?mKXGeT;c5SUvqX4s6^@*$?T*@o@D# zi<|)Vx}=`Iod`CTw!}UOY^ID0!WPe)Tv%rn4hH6Ak?VK=#>ISeea!%hU zrsSNK|AJyHV~f-F7O-vY$IaC8d|!AgxF2(DyaiV`)q5x537mTJMKm$(9B!jFXn}-C+~b1jyCi9yAo>2u`JlWgfCa~v|k>sk9vGo02{}7wx274^;6G%bS1Fs zxNvPx+wY>AFW0`6!TPFaE>;0M7yf>7+O3MFo_4E&ZL99M{GAWA_^$z0%d^0mVEg0u zncAAq0&Ah^Q$7o%;YNY||5G1F&N&p9MBV zQ_r)&Mqsr(3v3KFj(VO2HUXPUoBi;2TGZ@^=k;b_wLA-K4py^xI3~|++ir@cJgmU>VB=oO;M0%Y;Og1eyMuieP*30X02@nNV($qy)>QJFYcH^V>gm(o zVEbg>9g|!??`6|bdq1+ynDzlX#@x5N;A(jm(57bl+$;OQ&F9|k2Irohm_yOj6LT0? zZC^^-4F{*4{gmrhJ`4D>)5i2!U_Wem78nJVYx7xPe{h}!MuX)(3v7?=0Pwlgw$Wy; zk<{w8?_gcVfo~y~wy|KjHrr1C>#uD*wcObL?)rh?*{E%!O`kE;>f(vidCr&wcJ8fx z&X_`d5XHkWA6(n5%}sZzL%_z!bH<@ywLE9YCsT}NY;oFt4{Y1)i|>Q&o4P&&iGMg) zJ#CHvr;YcgBjKJy>iT#N9R*g;KAj3Sj=Ez$j9M)*j|Lkv{Fs`jUq67mU)A-oUq1w^ z>tjET1snI$;v6~-uAXz~c(CV?dir((*jU;U`$TZ7kM%ip5?nv^^yy@H7fcip; zhkdxHwp*tU7lVzFKKul%mOU*$pJFWYiqrOTu>B9e0&cAED{G#0{2APJRM%%9{kjUQ zp3j_DgVj8uT~ljJOON*RTJwGszhBgv&#gmPN#kFOW=vz)Mjz+xI%;j+Z|PEXJ-81E z)O}y2UuD?9Lo8($Jht_F-GuZaoSGR!GvRAdK zxqew+*Fx_8FrIrN@oooSL;n2!-5qeXJZn4FJK?U|(e>Exf~&dj^801Kgsb0yU$kGr z)z#A9yX6#Z&cTV;eht2r+I^XR{RT}vKEJJf@_7RH>UU`B$#)OfeW;#%_kxY3Eq?cb z_0tx=--GR&HrL@cYBlj6sB_NT4|Wc%J!c-I{v*Z1Iew_NTW2f}gN>1M<`J-3&Kdaw z6k{1%oVJgG?MwKdz{X0y{|wegJwA_t^~oN4Tssq{uFpXF`UF@#=fYpWY97&^tTm>p zM|-N)%Jb;yS}V_^XV8pk4BP1ATs}*!E$7i+!JbFz`F+p7f%E&Gj?Z>#`euyJfy*&I z4|j~}`pa`3xjxRTb=toGF6Z&@aJ8IA+SFXrthwt`o=4}v6YoW^=SbG&CAeDRX;X7e zyjHvn&b2==UO`h&j90;G<@?uvz|+og%Js8fpOV9I`a5dI^ttO_*z(-@4(f{F{{P@-P->;=77i2>OHtx?q45( zJ-mNud!M4_n8bNK8357*R0Rr>g6@-bGSLQ<$Cl5 z_*Ukwyk>ogrXHWKYM=6&^);G$u0`K~y$+~n-Ca0iX^USUH2t*2Z#uAj)8_ho2v&11 zWZtZ``TPF4hIN9SQ){nb{ivs>c-XdoZMV*x&j2|o=B&jGhjx&M9(d>P}_Ph0%v1Z#61{5uD7$7x=3 zm@B?>gPq6ldEn;AoX-o^M?Kf8`M|keX;0hv!P=5nUgkA_d=~)QH+}tE3v#cK1DO~9 z&aLmY)wTP#8Pw8tVQ_mM+MjC&z`Yi#>thW6Ca!wgEDBDW)^}O(eYr5H>$5m*{M)&U zQSv;#1lTz0_QSscp_Z6Sf{mG+{*7Jxs;_?^LLQrcbJzCT{96(7TpO1Md#x$IZ&?9N zU4Q?Ugxud%aBM4rKcqH?v9ue@&-g2WoeTSBTe<$OgL~ffx6T;83w8`y@0G!7xu$7T zvwhAZ_k8-k3OLv2#9S3kJuz1Ut9kCE-RkhPv!8PP%#-W7zjJI%B z>-sw2T-W`(8gj4eqp__EzLeTF+Pv2jwptIY&BNbQG0ysMb^BvHdHTBnIQ`8yH`KTA z*$8YOT}NZc^(n{X*yR3ByK7-B&+ona``)(QlyOm9_+ip?Yt<$$H z!N$lnXe+Q8Q1qKW@asle^(CL`aLJA z*6;PN$r}26=HJH)o~6cIl3M@1oc)*`oWC_U2ZOXt`ZXuq_BmJPg6os>K(3Ge=AXN^ ze@$F*`sOpz+!PQ0#EaO+EQn0~;s%dv&-z>RzMm#~NUD z`zTHy{Tbe6C>~2v(#NH0ycqS;6#KO-b=G=qVw7vYe!<;)8x>r?O&WZ&g6qFU!S&y& z;QrmhZEBu*SqE;v%6VBAO+DXHtOr(Gmg2hEXSK}R24JvcT)GJfmW2;hUkL^kf_d@ud1$WQvQ*izEZSbK5*MCI8^&eI6 z(ct}Sp847hZokSswmX`7_Sha^wd^tbtd=#{3#^tswl`R<++$sE=fX8{OmeZ~(LZb6 z4R+1-(WZ}cqMm%iz-7MSaJ6!ejf5wkb0RmE^Pzw8?FTlWKHBtgPSum|0I+??oQ(lH zkJ)2#ed0eBY>d1Q9S2sk{dnpL6x(Ny$@Q_l`{=;h?j93okNLC0>rgz_Y?bysaIG4z zPQ5n8J+>}&_S9rzlzZ%ug1cwFS8)9fFZhw*qiUYa4-B#3odGWM{TQw`u=ep@ zbrv}JoO8LcoNxV;?;NoC^wFk|Yo?xj=Yh+77r@oduYJsS5jgo=Gr6%`Fa4A6Ct&mG zqfH;zTs`@I3NG_q23NbZ_A%cT;N)}7<;J>_TL0wx8Q6UKXw%0%qn>=%fSu#)iJybr zCppLD`o#YiVCN|J;%mWbw)b9q9oY7{PRjL3`|H7uIqh!%tEK&oVB0TCvA=SC(*9qyV{Mmx* z|9rvq|9gYKRB-+OQE>fVYw&*+T>rNVuK&9Y{(iyr|ES>lf70Ne63$Fjy4bF1* zte^g!1=qh{gZD4E{xcU`|5*#}-zA=-<~hIaBES2{^DBG$mtgf=bAJU^%Q@g4Q;Ywv z!D_kY{syd8UUPp3ckg-*xCiB8_n-c`M%)Yb8ljIiecZ$9$@hD3nePv9wSl#d*Q5u) z$>$!H8_WHxfAakiY(9Oo>Ek(|o_r63%Y2W()yiw`pWw;oIUqNd`(OX$dkkzoeYEN0 zIi#L^Pk_sOe}SuATKhP@r@+bQIV3lh=a2r$_YBy4`e@U~b5K3`{t7PhJqK6&TkT`M z7r@EqIVd-l=b!$`_afMQ`e@VVS!(sG z^wFk|*8uh8djnkNdke1iX6<9XcfiT#H9&4G&wu@s?>(^j^wFk|*AVsO`v6?#`!`(e z!`jDuAA^(6Ylz%fUO)6tzE8pC(?^>=UW3$=?{jdO?+duvjJ1#Xz5*xTmteWEzNXec z`Mv?0Pakdid`7LFe0{ivl=(W~YW~|i`k1dTIQhJW$&KapOaJ7X9&A2+wCU5j27;4s z25^~gCb*ja_D}N70!}`!fpTMc{nJ1BW&@i~A8q=04OLIRZ-MV-ytzis3HQ3`HBx`M zKJlLm>@_3L0CU6DVxI@>IUf7GaJAUy1A9)!K0jP7_HToqE$rWctK~c21;FMsu5Gp3 zci-162)6G-Xm8);`X}~6V9%H2UKp+x`v9=}JN8B3YD0)Wi1>?wZKH0^#i$pjm~+`W zr(D0pTmtMKPRu3YYOyZ`cF)DWG+Zq?mjT;GJvo;JoAahRr(D0pTpsMcO3W3&YPR<| zaYeB0{n>NpNv==Ye;4dNN&A(-YH7a;*!KRsw(aHm*xqNkRl(YQ&J*W3Z+mPzQ#|~) z{_>o+V~w|?-ihLK-Y(R6u3Li`-rvL5DY(x8>la+V4I6ypg6qFo!Sx?l@EMrXEo$y~ zwXX@cU!D`0qqWe~^W3mDSS|a&KC8ulU9ei78`cA>mCp?uz?}={)G^7$jz|BT*BgO7 zul3QUk8`4)e4BvFe4E16%IAj7;mPNm$c^QE=%0Lpz~<9On?BB|dh%@vwhx)Jt-#J> zt|M}N;=eW6822&9e$TiKSZy%Hedq7hZ3|Yv7a#d{U^RUbX9(ChX|p|8Ep2uHtJ%hL zY6x+51RG16{q^5I+nHj2bM27pXCC+MF16h~EzX|a9oya%k3A^a(|gutr0sANy+duuYXNG~@Gx})L$1$oW-$<}pt}~;+YL3b4)P7*cnLQ-e$M(+q z=-Td@h_fc$*hWx1hElR7!)m-Q^>B)7GLkxLGKLuCnoKCTYci?e`b{bL!Qew`o;4W@ zH&3}HdCE073Qaw0FcqwpHLwqAnUiC{YFUFHfYr)1I2P{M zoKyQK7u!Gmvj)e5T?2iz>EjsHlkX(3TE=iP*l}eIFjKXY~t*g4Zjn?CkYJ#%(0SnYnU9o|pQ2dkY&iO&UKebT23!D@pk-W!~+ zi@>(k=6uRCpT282jN;+H(VO`^q{hzYp%mxyd(_US^LIHhGM~QJy0Ye(zf0h@E9dX0 zXzJN>mx9&&oa+1Ak3UGcviuAXt;2|gO#v6)jYHlMzZ(=q%K>@$cy+8n1^ z#_9W+6DS_XQZmltYV0_Vr#Q|NsWZ-dh>>ynuI#>=XPm!=+pZkvZ_v~;&fkL7GS1cT z{~cUCo^_5@4@R(>!Zzas%4yh$9yWq<0MMPd2)>%=P4A&`6KF# z^I>9Soa>`MTJwza0l4kTaXyHqo^k#Wtd?=EhyO!x^^EfouzhiC=9G)gr?2C541WS| zLamQB$ElWa`WfrT6pzy=8RzLWcARHW9Os$T8Rt{P$T&Agf2QUc=i_kOmE(K@O+DlM z3s^1V+zkIG;p!RZ(_s7J*vu&xn@?ZI=@^~`52n^fo8wf=IJd`kF2&<4O2&D1jUDGX z6vufUb;kK3F)~g+Q@&jDjPp6T?aFaJkEWh+z5rIsIJd?B?{M{u^Chr-act(4i_NF6 z<8%zKfc@O7k2c4tmT~$VbrHqmd`iZ7L5&^fg%rnmF?GiI1~D?uebL{ldB*uqxb4bu zzJ{iralQ^#%Q(C6{})_6<9rirUmTk`o^_5+u#w@`e<{UY8j``Dwk3`enQDO zFR8KP{3*q8UPhg9{+k#X=YHrP*F58V4{p11obRKlXPh5^)iTae_pxGy^`Ec7zg=+s7c98`3pe;8 z1=oM^g6qFz!Tonsm#KO7!57Sz^IGnMFVWQV%=Z;oEzkPSxte>x=f-cqYS}j|e@iQ$ z^`}E~E!`WgiCpY@=}`Uq5h}Z+f^|`8~}H@Z@t%<;H5QCph_L z0-H}CZTh$;)RS)(aG7sbxZ3%Brkw|~!;{ZFAvc!$LI33Y7TA3HXw%0%rJj6qfy;bz z!__V=^34lRKKGQ|SnenNlW%^o`Sj7Ik9$%*`Mv`#^DO{Z^Ssx`&m9ZFlg~XVH%YxlQ z`OaiHxLWMXgI)93R{*Q!`{WhDwo&)IS&UdKfjyTzZ_FvzFEPIhc3l&5Ww2W8tAJgH z*jELsCFg2j+o&h!>R@x``!>0LiMa;YxlYVA!D_bmyP>tfw$FJi*C*}Q20M3tR=OPj zb--%2_xigo*!DNoW0vbHFf?Z>MgKe6-{2v34#eeL|B|*IoIWs)HBy(!R1`*lbGYc#?;4t$nBSX&ip$s<@|dt{D$IDo~L)i zoqx~SUsH_pTWX)d)D9w!&l=%})ZDQcb0XY!{$87W5?t*-$`J1QYLmr9$+eYBY0zrz{b*+`M4ZxpR^^`72vXuSHjCa{tT{G_VFsX{nM5> zSA)xWy#}tIdVGFf`=sB$fE!0W{k|4#TW#^X4s1WQCD!#|`pws-9A zb+znu?<4n9Jbq8f{{92l{rv#7&l=ueZ=s!fK10~{Ry6ha+*bPx?9=Y2KDVQ($LEgP zC(r8o+=-^1>*!rzLbQGl#`fA{`(zrKn6D{!-qccZDt=hwAQ*~j0YsmJHHwNKf{ z-=V3eANPRm;}R6xYme>T!e)MbT}#J$AJ{SI<9f);_4p$`9_4yG2zNamqR#c^e%g7x z@w^j1M9JQG7;KyHN8tLLS@+>lus-VcL7qN%t$LK=@h~NO;t{ZY_!G5#a8Eo&yYfEs zc)_(l+2AkKJpFnCZd}J;Te-iV={)@f>|Cg8HmXzJuY~q8*Q$Q^Zy#yG2Fp=_`Li&SZzznC-}cX?cuYQ zwtrF7+#}+|c@yl~WzF9LI}fqHUEA~A@eW)qCD*`r(QK2)l+=t^St;N%^cd}`w7@FrcAAUKSk4K8|T+CtNFXQnJ4S? z^E0q(a92IwpX*B*LUF#or1o&WwS7TRbH2ri^A$Mbwa;I}%lZDMwx|DutJG35-+j<+ zt39!&0~RE$Mux+)aU0<+c3-1Rn<4h0NM?L-O4^~gf8q9!Z4(;)s5$w2A zvIaAuYqL$(K+QGC+*#W%=W1qb&Xw0had}OA0^Q@!lw1=ZtFhO*$0@GGU#N49o`V?W z^>N;sC*N#vV`N@thZ`sMZ`F4DZY+7Ok8^^(KB{Lu<^mf-TjpkNuzE`7W*#*2Yd5}q zQ_Jti&j?FD?x>mNw_xeyJtqGGOy+Gq&7)>R26@<4ZfQ!SNH9e)?;(KibOQ z^>>fIK=CN|@pJI3;qzeg{hd1Z`L&6eee61~*WepA_$Cd$V}tL~;JY{YUJX97!ACdv zm2DD+#LV9XzK2D_oqDm>w}y9-2hEpfA_CE{u_by zFZ;eRn!5h(Z+XtuO~IZg+T82%wV7eBi<^PHHrgli*jAsk-5jhfxd(zXHsi_liT@z5 zapJ!PIAhaau21~81h@LjntQHpRr@s#4NYBtuQl@c z?*Y#IWqf<0sq62xNFLkX;HlLjQ?Tf7eq!#2rfz$$yYl$&4^IE%e*l`g z{$7{m@gECLf8sw5OroUXD_#Xr|XZ#NaXKeb*^(p7tIWNyM&o{Z(UDx6e>cl$~oczgo7@B%~ zzE}H{d;R-p>N)QY2is2F^X^D$56?SoM^Mh7m|vXOQ^6Te@*j<+?)Z+PmdF1G;AVe+ zh^DUpG1T(-9|zXI?ECR(>iQo`El=(f!PyHZfaTc>CxP7y_Q^c9)hBIF25U?1Q@|OU zx#ar9|5UJX;{PLX#-_hqpZK2!HfQ`#2WM>h%k^=soqy-rIgkI3!Rb$8o{6R&pR;P8 za=y++Q_otT1Gb&IYkeNIhik3vT*@yf<`*aS1>k1>3(?f=|M}GN_+Jdp_|o5>psDMB z5w$%2KLzVwKCfJgrmp`b)bhl-9Gtbj3@p!DUjcTl?UQ+It54cq3D%a}KLclM=923Z z|Es{piT~B$j7@*JKJmW>Y|i-q9GtP~FW1MlcK)4f=RE$`ffGM5uSZk2{k7Ec_}>Ul z|KooXn!5fsP|M?g3po9W|E*~1`rk|~kL`AF=K40UJac^q*txb(=CQ3lX?rJFTXNq8 z&e+T)*C+nJ1RE#*zXE4$`pfl+|J`77#{bvgj7@*JKIL3H=jD0k`6geP+O@DYzMp-5 z12&$YXT|wEtL;^a$BUFT$#pIDO9g(p#(oZZg<|Y~Q0KG$J;cjrB-{SA!JljJ7aRQL zg8SU|PJ_SS;2$;krw#sPgMZWDefv)9ulf2n_{p&=uK#8YKB(aOZ&`5tw{Gxl3$Fim1=oLv2H&~h`tMqB z{r70_y$h~?SIs>S+-vtz+}HiE-3NA0dJlLK?D0jlNK5|<-0y;aPwnTEKTz|vKy^Pg z|F0h4KZLFxq&C(csolp9QTv(kVd@#EAEEZ=jvl3+iTY2}GgJSWdKT)(sAr{qoO(9u zC#XF~%%AgbD2Y6eG6uQls(seiKF0Scu;*X+({S77J=rsGebf_At&Hbb)BbN@(=VP#W z%kMBgK~p!5*Hd}?KLeY$^#2@9U4O5)^5p#jY+mDNOPsI3<}IJ&zD83wj@M~<{5=`X zTl)7wQ`g_?xIB5M1Dn@4+VZ_`2fA}%zx1)K*A4Y@Z9LDDyC1gZ+Dwn8Zu{0f104Su zz|FOp5lvlx?cHN`gCl1shJ<@G)B7Q>cYi-P5`Edfq0 z@7eO?S`vIOxoo2?xzx*C-a`^^X>8^SUj}Y`zoWPR%fj_h&(G2<2kwK7FfDuXsF!)X zMSbSi=1c4m*u4JvxlEkTWdn)x z7RBSAlzc9Gt-!C>_-5*V75I%B`z-q=#plbnsq;>2Kk{e`AKl>BHu#MN_j&!+nmayY z<(-^y#E#A9>HXp6_u2eVu*Vntc_sC_m+nA-81&o!_ga`UGDzQ-4*|D~V(ieEWS z+mwFxJAT@ngT=}JeRS6&?_CdvYgbQBwKA4tPx~Xmu5H>M1-HF=@~I_OvtQ0%VjT@O zR@%wa&ast!aK7W0HFZ2cKzBT8CokK%KFN11xa|9JaQmm8*lLN@?1O6@zp@V}pqF!c zBHXy@iKSNd%YBl5oeVDLK_1&F;4=29aAT_{wptn6y_fc6bPagB?rS`~>c`Ep0A=tLc~Ll%Il)r7dwT1t+)r^D?+`Z6m)Nu8(^D zErKh+#`1`EWvzKliuSWw^ZJy(Uv(9l?e*7gd_Qk`y}lZ3`z0vngZ1^guI@F*Yft)p z9k}fG^>DAH`Pqvb!1}1W2G>%nrTtCd%%9`D89thl_P2obQMdh#)M{ye8`zj~*X(w1 zxn_63)$~i7JHf_Do4df}n*9>4reD_VS72jlvmduot0l(WV727#yDTu9zQ{CyA#m-LUl!bWo!n&0 z{w>hp3l-e{4ruVj3T~VwYwo%NtNY34yyWkDJP22NfN~Azs`vCig4Hi&(R^lj80_IY z9c>R$)SO$fu?J#*1ngRDOdR|6DBSfhPWIKG(A0Au{WI8Wn!48<<2(j7mbS!x9BeH2 zpFU5(^;37B$nB5)O8%$7&HPWJsb?QQ3s!rElAn|ME7(41^IUPysHHE@fsLDg^W}NC zn#IGt?%vLrUqClTuBm?q+eY2^?m@Nm`z5gRSkBwaXzJ<^$w>IOxNUfHB z{}XK7^!qiqn#H5p@7K|dk$(RRY#Vjsdv2&1-@WxF*u9nC5q=x4_7+8d&mA@Wz1F=4 zHikCO8+mLWfSng@ouW#EsXb%)3S6F#-@w(rrs(hWK`s7H zv|7&9K2AKvz2dprA?NU1)ixdFI~30iv2g}s?+Y&XLO-~g#iQH{j%j*y`=0sf4{x4Z zGoYzwer5!#rDT3)Ld*P^*Vy_xUax=Jvo14()slA>a5L|$XzKBq4cuJ2+0oSFGY7aC z=UZs%xi-xS_Iy&${d_L4v9wL)TI9WPZm@mOmiyj3VEgV7ZQfdQ?9t|{wens#Kbr0J z*KT~Taq6z4>zVmk5S(+!dxbo;Y)P`T{is@VY#x1# zI}mLRuv)J5Yl77*9yz~VAN!=;?_QjrwZP7ide(n!u<%i5sm)8Z`)+5?_ zwN~!=^=qx%^BbVqUVrVzcJ0(N@4%{$fjGXQIPcD}gK^%cqt0{P#yC6gnU76cji=4W zrf{{E-VV{4Q|H6B+A|-Uf!%lNnUBrE<}K%AAewr{J_u}Ek7!%en!|{;Wv!L-u@#!_ z_1A7}`({pU8Q0cewd|9@a5YPFpSZ_#--iE~e@%-1&a+zPxdUTA9XL9vv%mX-o#*MP z&2OwB{73x+_Ktje{-fr*qx`$6JHXXvEZXe|x81RIyPe=_WxJi>>g~TZ)Bbn+c7YpD zoBfuj-!l_6aSXFlyI0&JyAvbl=uRv`cfr+%72N%HK*7EK zOf2~K!H3t}^)j~a4gJi%2iV-!{+o$=QSVLh(7sP?x6b@_fsJtk^W?MnzF;-y(tEso zPl~aOEw-&|I27#I6Kgo!SoX^?$=&1njsUyn>POYuNU$-q`S&A+kbe|d-Sfb?+7IlU z`u8Wyqt9qGb>qnQ2is2F+|I9>{$r@eQ^wU?{|R7yhSGi@{6Mfi>b7_O)oecy`y{Yh zu62{aY8DT=#>}+ax>nkAZchPwty9mreGu4qu3OI0gW>vm*av;o(&iAbZG2aoHiyD( zV}9G{qn0*@fo_>u)VNCbtQDFCHF6 z4#BDTs+Cl$Vr@-Gl&Gj^sc+iywvnQxmbO5x1#8>8Pe^R<_kQ<2&v)-X-ydHdlAOKv zZ>_!dI&0d8#$8!MhZ^lfWd2 z3_%g#JzqEjApsiX{D7bt^K-ebODAd0e;60eUxb(d96@>hlh>c%Mfe*Xm4ydyR3DVi zbdAB6lZMhg(E)g&y8iv;ro%2G4t%3;tZVG1qw>-Bf25;h_ZwwFG49v=1%d@#U_mj^ z>Sv%tj%Y>!st_FjE&zH~w&|--3Isg`G*1=BDaQxQ5tZS!0vmn?OixJLhpD7Q5=g8N z{O%$Jb2>I_pJ$epp5U8GV&;1!6lwGHzMO2K_5{RA$m-9d=^5eHLcv_6?veWOE<9HV z&jrRn?GvbTpk^cTv8`0i5PFH@U_OJ3M$t=*9-0Ag=)Qea;w(-9$+_)n4g@U(w1hA% znX_^y-Y5KcCHVNJvPJLfU7*u>R$2?_t!5IR40${unG6&QLg(TNNm7K8zDq*UNSJkX zW+r86SwQ?d9U)mGi?fzi-<(nw=m%yjL71$&nU=?M?AducNJ*9~gJaUnM7{+9LJAl7 zixnCPxRjN|Xt_)PhZmOP1zdxfrPB@n>G0`3&UOzkF4`iMf3TdN!uU|M`k2%ESCgSr#+BbZ!pEzD}F>P>&_K5FQ z3c;HkM6rVy%{g_D97wDnRc&n~rs#JrGwj@GB5rFXZjC0UaR@wwSh1x_W+1-CR|>uj z!bGEX_T6Zz+9a+@E~(mDf&7Su^#U-|#hcKv5qc4+wA{NU>ufR(YdJm5`IWNIS_F)f_ z!adMTW71y*6J={M=ap>c$R<7nqQd#o6F_izgUf`FMkb38l* zz!m6)Sf8A!OsQvi2F@6}zd3(ehTFQM?f z-$+PH@u@uhWgFcqhmP}^&OihR4sq;z=wGquxu3ApKRHFfG>|*!v*_>|HGg?vgXE}o zZQIf4j>dyGch;V$Z@+(M7G1N(QZy^TNL}PR-HyjlI~3EGF0(KBTyx^-w2pu?(YxBH zm314whC47gsF6c5!D0(vXJ(d?Y+?(d6|nRrp84g?oa0KcRx33-pwt4Z`082IWZxng z%x9j#97EFD+pbQzc*%%doMN1I<%4!|M+rB?mI``ct z!W5v!W$L`Z%jQ6nAjoD8>J|js&A|>q$bdOSB=06JwA%;2SpL9vt9RfeUf`E7ZNf(h z?d=y*E=|JRucX|&9;yM z@3sULBLpW*9oU7-v}I@NaG70Obem+v?IhyXyegTDxJ`yba{{M|)y1F<>d~xIhIoB5 zL>nrkCDN_IUxfzQLxVdcA;m~_##MDjcXrwZ+_bCeX_dH)t|8wFF>!kdF{PwxBluKQ z$@En+F|i_=STRZfV&76!y+Nwlz^vNX2r`l~?#Ch8j}F=|CfXo^E3&l&^|u6dIX`

MT>hqAyh$J-CcE3&k2rl*@42qN^$t zRjm8jv{47#&!K(mfS=Q8BOrI`y?!e6dWNW#HVnI$k1o_92}Uo_8oYr6LGj=_8KS=X z)3hCmB!1=C_4Uozsmnk4c@(qIH)lb*$q{~Oz2|4LvAf?87tEzIGSV220EPSjrFb$h z1vzM~U@8+)7Iot1J2rf&aZrG?KO6;@-srtZ$gw=msChGo_KQ|s(|eJ%03E)@_-D@a zobMnt00fI(QShkn376JmKSzbn$<2Y(sah&Ct>+uCBH-*&?r?^OF;z6`$pTh@LW>W?9fzW?Xik5>9kcmt^Q zHc;y~!HOGyuk}TM|NpnvN0HR$1x$*Ev^)^DazL$bV>EIo_cWD-{zXhC)=p3IspNJ? zkrsa_2|c`%1H&qgSS5z-c!?s4l$|7C9;agGt{1^;LQg7H9NPg7?yN!!)x_$Aem%A(|yNXJva zTi1U7=3~LfC(!PNeB*y=>HpNyziO$313_UP;2eZcVR(W=a}tQtq%O#JUMO5QH_cZV z>02<#H?Q4q)|gk_@)qCe#`>H`z7qa0T=ynlSuY&>lkHn5@LPoG`14P8T@jzUfHQsU zPp`UM7ObBi5)MW3Z!j?=6^8^%@f?zBC`D*E|K#8fDgN!aE5j*5$8+PrcIV9K6dl9R zez$owB7{_icdE?eY#z?ZRHlxh;E*0T&`FL~H9q-b4F5EKrRA(dER#0D0g;CmR5B0U5Y62)E+Ec7@=_i#X=N3q{6g#t{Va{7%ajc=m@ylp;clO z947rE6Kq6$T3}qFMVbx=Kv1|`N19hkSCH;*-X$- z$89FzwzXEBph*h7o&z-fO%!b%a-vDjK@J$S^|Vk11weG2IkU?$uPOl?$wf~K;juz= zAuiFmP&vw4i8u#%Fvt&nz-Rvm777@|@!~e;;da=-LTkFcUZ4xT5cBl!?~kl<2!N= z@L-T1Ji?DecNB644|Zh3HK04%+U@P!(R<@Na-vDjL5>Uz#_#X4f@hX!gY`9C6y7Au zhJ3`xZG>cwQzH3=K9Av8%z6BcI7~vmg0g_JGNHi6zpe_&VnKAH<$!Oy0qrQp*s@Og@vLD^5mdPnMTOFo?rHy%?8a+QL>I&;7UbG8p|m3mWN z(%d7Mz`T{5x4kF6zAtj&yl_TnfbaNBs8N{%KbW}TLFAHokyE|7#{}HMF)e=mc-^>%3jGm@r?LQSkI$Hbak=`T z>jnK;g+9R5+dgaY>*|QpP_>F+SbSO?dyue{5=a+hP)v)e7V|wnU?y|0Z!r~TX>EG{ zVs0rP{}Tc&3oa5&wiVeGB!>jLm~QJ>+HGyaFsv4+baNx&qG-1S7pAYm*q`z}*SGjs z;2gvYhvXZ8)xI@}XAsg`{Eidc>P-+!XiAY&z1Q68{p-5ucy^&b8i`eNOj-;UM11Ks ziT@zoaJ?i8s_XD^-@G@DU(dj=Lyi8~pnJz~KxwF#a#@gcaMF#CqCwV;yWVj)Kitje z?QQnS7x3~y|K(U-RM0T@k#>&X@eO{Jnujd`BITN1s_>AwE)*-LFdT{Fd5m0Uy8aEx ztQDXdM6@#|bmKCk@XuJ>REUmOt5f*lqH2a32MLhWZPFxFLz`k>4Y^KUkk=>8@@D(s z#~L}d&Ps?kp1|64xohytf`zJ-Bo7_8NQR~t;&#T=Ny_ZqaGntjnF-!P-g$+T?KqiPGNF zn$hGM^xJTta;c6Pgl0H1yN}U!eNolH#+qMIUywj0emcoq`VrT@`IXaDmzd)Q3D3D3 zuHAi>##6F?L0zQ->W*bv&Tzo~ojTU#ZNqPJZM>?wEmu5YS@4acul*Yl3EI~Bu z1#*{;g^XWGBSquN@Aa#$5l^|8ZrG_k(+M1ngyUZlQ*i26g0I@-_#JMG>g>5tz_Q>k z`f>_ZjthPr(bNlGX{cOqtLFwha~`4oBopM^4^N@@O=S-YBYBcH0w7H><(` zgypV9bPem~Y-yLP*ED!nuBiu2+bJqw#{*x#OiIEBp$>ZUgbsGwj~P zmj`cZS2H@F!(D|vq%TuJgwetn3;5&GaXB~(HPly z2E^A+5MS{x8Ws0D@w0gT(}s-7{WS!J+j|^-b6F~PWUgAE@;5?45Y*0)>pZ91C2)B+ zCjO+i$>wtVNAqGZI@d&K{K+h^Xmw79{(wSr;=F9q2}7mHKea))*uCW_z+?xXE)L-9au|O!61j{e--1_ z(llWO`R+>B*5zaMoLOBidNGUjHrZu!gjGQv1P&Qp zC5E&?AQw6NuH84E`F;Z=6CrEfRk5{V1?bx zbR|h!!7aQiU-Wc+f9i{j?iQE`p}r6UCw^SNP&qzl`M9gNO@JVOoE;PiTzf zUre`?yG&gmj?UZy<>>PM#@Z$T5Z8myQ%p?wCIN(AcMJ%z&^g6$F<9NS5RKNu-MZUo zw4O$zGV>Jc9W*LMX!Oa85~qSYfj1_AKz`9*oL3A2dF3i-SI$b1ru8wk`;;Ctg}D?u zYU7_-^~!q1DxnWneA?3btl-{q=qI2G1+3SyU!0nq=`Gpns82*vd2Kfc}SF=aY zw`Isv{INI=YgA;gb!zu3YY2PfrDBIv3Arm@R;wI)+0_bUS9VX-2YUV}beDQ9eH1@| zl2p`<7Ww<6@9y?qm^1X@oZl|Y`3v?yo=s1y}@B8 zlE&gTw>A4bZAiCMJ+|ft4@T|$_tItz{}CIBWncwkx(&VnFdeD2BSI8%eN*s-!5*akxu_RSCc2&g1hA z6;K6)iO2oybXQIA7fw#fFnjJnyT&*Wj?M#8Eras;Y)&>>W;#90uC{D-Qg*vfPdD0@ z?aOl2%I1L2=|AC9lhC%T%PsPuFzs1}nTCoK#JWYA8D723Ip);8&s|>y2)a4Y{i>@XRb!x}2oE^UE6kO=FKYEBtBtzN96L36Cf6tv*@b(l>+80j^->a!8&^-U&-wv^-DFto#3%X zfo(N7SKizo0M3=iwk}nJb7jt)(zc#Z-P2Q4w@2kLEd$ZjHH4k=Gr$qA1HU4TF&XB+ z{OTS^WehY-`4$L7Z;kU5=0j2ZfSF+yUk)9Zh%~Mmy=v$q9t3G<(%6QHt^C8TG<324 z%6mMIku_)<(!Y0Jd^n*Hm-#pjJaSw>Ne@s6Sd{`o>}Eeh-9!3(ks$x|;5B$ZdSPCG zfB@S1B-F4y^PgE~PMnILxOpD<1Gg7Bz`-9QtC0hIN`7#`n)eQ#pq;eNzc-IK|LLTX zQ|d`$3zEPo(}rqlLGw(d7|k;bU!2%mgsN+s1k|Og5mGVvhS*R<+R3GV#;1T2?miJA zKX~byGnFT3XLA;T!|tM(Sk+%??P9uV-_l5s4zbz1DCUUK4Dy-=q{Elq%QN$XcT9TU zaDsL%V)6ZXRc{=Kt^SLz;PC%bI%I17(enQ>7L@-4rrPk;z*%5jR+&EhS{NWD^3~!7 z2L+DN%uB`9Q@g~cvNM|&fm8J1+4O|7aT_U|@m_%rOB%Le}R*mEu$G|&^zxoqIi$T(-w zUIR8T02^fG^Mu+4w++&aBktV_xIeouXBb~y&gDM)yA2>CYJ)!12D@IMHbCf8S`8F~ zDna34gG8-LB?Uhj29F!o>0xsmCa}x9O|YkKkD%ID2Zr;)rxq2OwcoiC_B2h=`+lu< zkO~*fY@Yx))1m-p*$^!FeBoY+v1iZgAN$w5PB0SuM(qpvZxkvqeoQ2P2nKDw16~Op zT*P65K+v8Ex@sJj>_h2Y=<+bKwN-C%?5{&a3*>durJ!%l17>bN?!g%Yu77KeH;ZQjyr! ziO=RGGe@```Z@Q_HQ&jyO>4me9zJn1y>S;6bo0*RdTCp~6d%P7_a9{%O`Tc-PZ&4U z6xm3YmQr{^Ddz1sEjR*1umi)!8kvJkd0Z#rc!_yep6) zqgq)LOpzCC?U&vPXU`8AYvfuxb)mds%5YO8ldOJTz=MaIy*Q_JxTD#djTU-A76S&-UoT$1x)*RSnjx*;`a;?gL5WbF6vxfvo&<5fe zfBAXJj&7pz!+;QL=K#mxE{3ly-gwTx%X`$!uL^ypoI~6WJ;bmnxpvGMz?~ia5^$S| zAE4YVkcaZlQns~oLd<*3OrIaj%cLraJ5Bpl3|gtstSP3z1lw`CqD>5b=Vv*dAwgct5( z_1g>Ozq^v*}51 z2W?s6LD+NHX!0L$9c!z;u08Y-;dz+=bne2AiqQUk=|`N6BgWU6X=C=GH;0Uveup&M zX6}SnC~BxzA$H&v46pd5O5Qa}7;7B%9N|i}HLPrT7qd?~C(YJ*c%!~1^D*bhBW;Dg zy)STYNmWzqCtzS`?`tkY_DQao^EBVV;ViJ4J6V2d$#~0kNPGNzcIvfXn_4 zT=sj&`yPCpMq{J+Cn-9iK zplFt;YLUyWo}~n6Pb

25k8Xcckt^4wV8=wJ0H_8LCV|D{GA*WE9DK9@ag90Lumc?&rBC_vQoAwhzWK2+_G+ko2E zE&3|JJOr2@vR^XTw^8QfRq{9vZp{9UB2RwiHL!el$Ot#ga4$}oO3A%I+#daP94>Vf zB!}UUA47qpYaE)gUR!PL6h;8mehy`O_VRfb!(zPI8MQ!MZHmw^Q2(Vt43|{Rf4K}{5#pGEEuz?=eOd8b` z`XfL@v1I`JTDJ;dIi8msnymQCK<-PEXBogg2cM&0>X8Frb<8-lv!1?JISKyr5&L(g z#>|3?nJ)6ez2-^aw*bc_3X~j$2gycJLx$z02zZj#hMsHd)8R8rN|r+-2!u-kTM&?D z#?&Y$!)o_)q-pRarmy?C)CkjFR7iJKi!P~bH~9BRE=9_ugPfeI&lXoR*!g=$MJj8j zCWn_CJz|v9)^KN{HkMZM+2QaDI|PyfiINBTG>0x0Klj;BV{fKmmhI`b!D!Qtc zq`TZF#rw0vKNuB-z)OLserY8Kh#G5ravk*>A?X>n$N&l>$cHx|Ua}tZ_wYGD8RE+J zC%X>$$D-77iX*}mK9(rl4vM3HmMeTN-!!msX=G>k3`dn61>WBe*8f!Xh{5VV

;ai=tAQgR2#hk;XO zXPn7Kf?6hS4+CNcWnRiX`IoJo*yW%BF(0vYs`7Zn@sGXPo?AeTCmaLMaM1AUwaBck z+Fc+BE?YA3C*ubJpaTW8hOs@9#_S)N^1<=76KK%3Yut}zUKYmL4`Obm+~#-j0&*ui+zCNKq=zbzx1`wJd5g^*#-1f#$H18eFpSZ zc3rb}spHbq}k-DLw5IZCjw0n6<*K9N3$|P zY(&QxaB0VJyF9)(cnuo7epDRI$}qIjy`lkjm&Y^3(T-{K=YWV}V*vKG<7$BAcn%tl zsxn%E+?RB(7=RUm&n__a+yJnyfQf9zybk}+5&wPBe!(m_iRB_sZ5F%^;;46}7$yH_ zapYG9Z(~uiO#98V;dH>}jH5@*m^H8BsPj$u1nX5CIavDQS8-Iiy6}KAj<_rkM>t;p ze=&}1X#ALn$(R4_`w?kP92!U6|63f{m;6`aNQ@q5{_zK8N1~~z+1S)g;U|vX7F(U< zm_I3|#G$6ck68C9S&7gcQzeJzub|2cpukZ(lsv@ZiqMB0Yje~i6gV{Xjwwm~fa0!J z_Z^vPfh!cVr{hK`yIhZz_xe%4N{rUJ9{!7x6!#pU^MXI2Xy0s5vR{O@SwWG^mB$JC zq}gHYK;JQYo2`=>$rIwAakC6H4BXqL3eYIw*?|$Da6hx~cS?qEfrJ0{EvxK5paj7+ zYRSV3mk)EXyK5LrEP8`@(q*oMB^VZ4Gsoyj0XL)p?^Cg&Er2sa5il`9sD~RSne}B+9HJ zLkGfFQU8$q$M4jvTle^*#jKp72yFQKqf8~pwsE|NN0cBJW5<}GAbthBlUs*D^{ruG z-_|h)q-v(AQ-fM)B?Q{&T4pqjvROr_-9|XFk>InAp#K!N|2D3{g1i3_?u+xdcP9}q zh7#MmiFO-t#SroB0b+K9jT6yzj^>!<>zM58Snn%Z?{^-pZ%zKK4@BXOS#@Ah<}Dj9#KwsDPX@a)5!@W^3SJnP?Ya6X|>x>MO_Dx4^ypIR)Nv=}iig3>aP_G#j@Y4ndX44E0Srj6pjje5_w6?^0k z!aLnnJCGa|C9C7*Q-y{wS1$?mm9O_*DH0I^{1=$-ip=nD)iu=TKJ383iM(WoUS4Vo z1s!nqh*9|t=z#w;b;5rr=O`Z>fzx~GjM@_13%6%p+HEgAc2 zZqh?;cIv>{0~A-OqUJ1vZFc`(u$KB@YU8I0S6Hi8;xf|!8M3iOCn9&LUM)wJTO%!Tm2})z67aO zCRh&b+^1NLiLyE(ZKlY)gc9;w{-P}EB7myr+~h>sp~&7PspKvG1s&M5eKpkM5p16x zJ}5S#kURK=73U`C`_cDtj@=;Yg?tL7E>=$ z5*TO94az9hRC|Iwrwv{P_c*};D7ZKrfN^Cg*tQ1+e}aM^rJ!JaE&yvroZt`?{Ll%O zqhM_f3V!Yc^HA`R6Z{7X=7gc(6%jykwhje1)}r94-l$|D3hvKCB?prMxH}vLFN#EU zZb!lVcTw=D6Z{4W))t`PS`;i!K*6)4Q1C4jENVi*@jj^JRVdiz1UEUsi72?t3BHSh z?M^Tc1q(`1uznV5jq@m2k&J?SP%!RG6fA2-CCgB7=$j~b2nB1dp#!kAhp|D7bAdD*1>90P9zv;4k82z#2CRWo@gXKN3^X8|QlcFI$?XY5)KL literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl new file mode 100644 index 0000000..ffada37 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4_gray.hlsl @@ -0,0 +1,1302 @@ +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); + +RWByteAddressBuffer _297 : register(u0, space0); +ByteAddressBuffer _1681 : register(t1, space0); +RWTexture2D image_atlas : register(u3, space0); +RWTexture2D gradients : register(u4, space0); +RWTexture2D image : register(u2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +uint spvPackUnorm4x8(float4 value) +{ + uint4 Packed = uint4(round(saturate(value) * 255.0)); + return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24); +} + +float4 spvUnpackUnorm4x8(uint value) +{ + uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24); + return float4(Packed) / 255.0; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _310 = { a.offset + offset }; + return _310; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _297.Load(offset * 4 + 8); + return v; +} + +CmdTag Cmd_tag(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _669; +} + +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = asfloat(raw1); + return s; +} + +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) +{ + CmdStrokeRef _685 = { ref.offset + 4u }; + Alloc param = a; + CmdStrokeRef param_1 = _685; + return CmdStroke_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +TileSeg TileSeg_read(Alloc a, TileSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TileSeg s; + s.origin = float2(asfloat(raw0), asfloat(raw1)); + s._vector = float2(asfloat(raw2), asfloat(raw3)); + s.y_edge = asfloat(raw4); + TileSegRef _826 = { raw5 }; + s.next = _826; + return s; +} + +uint2 chunk_offset(uint i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) +{ + CmdFillRef _675 = { ref.offset + 4u }; + Alloc param = a; + CmdFillRef param_1 = _675; + return CmdFill_read(param, param_1); +} + +CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdAlpha s; + s.alpha = asfloat(raw0); + return s; +} + +CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) +{ + CmdAlphaRef _695 = { ref.offset + 4u }; + Alloc param = a; + CmdAlphaRef param_1 = _695; + return CmdAlpha_read(param, param_1); +} + +CmdColor CmdColor_read(Alloc a, CmdColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +CmdColor Cmd_Color_read(Alloc a, CmdRef ref) +{ + CmdColorRef _705 = { ref.offset + 4u }; + Alloc param = a; + CmdColorRef param_1 = _705; + return CmdColor_read(param, param_1); +} + +float3 fromsRGB(float3 srgb) +{ + return srgb; +} + +float4 unpacksRGB(uint srgba) +{ + float4 color = spvUnpackUnorm4x8(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + CmdLinGrad s; + s.index = raw0; + s.line_x = asfloat(raw1); + s.line_y = asfloat(raw2); + s.line_c = asfloat(raw3); + return s; +} + +CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) +{ + CmdLinGradRef _715 = { ref.offset + 4u }; + Alloc param = a; + CmdLinGradRef param_1 = _715; + return CmdLinGrad_read(param, param_1); +} + +CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4)); + s.xlat = float2(asfloat(raw5), asfloat(raw6)); + s.c1 = float2(asfloat(raw7), asfloat(raw8)); + s.ra = asfloat(raw9); + s.roff = asfloat(raw10); + return s; +} + +CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) +{ + CmdRadGradRef _725 = { ref.offset + 4u }; + Alloc param = a; + CmdRadGradRef param_1 = _725; + return CmdRadGrad_read(param, param_1); +} + +CmdImage CmdImage_read(Alloc a, CmdImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +CmdImage Cmd_Image_read(Alloc a, CmdRef ref) +{ + CmdImageRef _735 = { ref.offset + 4u }; + Alloc param = a; + CmdImageRef param_1 = _735; + return CmdImage_read(param, param_1); +} + +void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) +{ + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas[uv]; + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + spvReturnValue = rgba; +} + +float3 tosRGB(float3 rgb) +{ + return rgb; +} + +uint packsRGB(inout float4 rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return spvPackUnorm4x8(rgba.wzyx); +} + +CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdEndClip s; + s.blend = raw0; + return s; +} + +CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) +{ + CmdEndClipRef _745 = { ref.offset + 4u }; + Alloc param = a; + CmdEndClipRef param_1 = _745; + return CmdEndClip_read(param, param_1); +} + +float3 screen(float3 cb, float3 cs) +{ + return (cb + cs) - (cb * cs); +} + +float3 hard_light(float3 cb, float3 cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0f) - 1.0f.xxx; + float3 _889 = screen(param, param_1); + float3 _893 = (cb * 2.0f) * cs; + bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z); +} + +float color_dodge(float cb, float cs) +{ + if (cb == 0.0f) + { + return 0.0f; + } + else + { + if (cs == 1.0f) + { + return 1.0f; + } + else + { + return min(1.0f, cb / (1.0f - cs)); + } + } +} + +float color_burn(float cb, float cs) +{ + if (cb == 1.0f) + { + return 1.0f; + } + else + { + if (cs == 0.0f) + { + return 0.0f; + } + else + { + return 1.0f - min(1.0f, (1.0f - cb) / cs); + } + } +} + +float3 soft_light(float3 cb, float3 cs) +{ + float3 _904 = sqrt(cb); + float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb; + bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z); + float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z); + float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)); + float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)); + bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z); +} + +float sat(float3 c) +{ + return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z)); +} + +void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0f; + cmax = 0.0f; + } + cmin = 0.0f; +} + +float3 set_sat(inout float3 c, float s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = s; + set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +float lum(float3 c) +{ + float3 f = float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f); + return dot(c, f); +} + +float3 clip_color(inout float3 c) +{ + float3 param = c; + float L = lum(param); + float n = min(c.x, min(c.y, c.z)); + float x = max(c.x, max(c.y, c.z)); + if (n < 0.0f) + { + c = L.xxx + (((c - L.xxx) * L) / (L - n).xxx); + } + if (x > 1.0f) + { + c = L.xxx + (((c - L.xxx) * (1.0f - L)) / (x - L).xxx); + } + return c; +} + +float3 set_lum(float3 c, float l) +{ + float3 param = c; + float3 param_1 = c + (l - lum(param)).xxx; + float3 _1048 = clip_color(param_1); + return _1048; +} + +float3 mix_blend(float3 cb, float3 cs, uint mode) +{ + float3 b = 0.0f.xxx; + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = min(cb, cs); + break; + } + case 5u: + { + b = max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0f) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = set_sat(param_21, param_22); + float3 param_23 = cb; + float3 param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) +{ + float fa = 0.0f; + float fb = 0.0f; + switch (mode) + { + case 1u: + { + fa = 1.0f; + fb = 0.0f; + break; + } + case 2u: + { + fa = 0.0f; + fb = 1.0f; + break; + } + case 3u: + { + fa = 1.0f; + fb = 1.0f - as; + break; + } + case 4u: + { + fa = 1.0f - ab; + fb = 1.0f; + break; + } + case 5u: + { + fa = ab; + fb = 0.0f; + break; + } + case 6u: + { + fa = 0.0f; + fb = as; + break; + } + case 7u: + { + fa = 1.0f - ab; + fb = 0.0f; + break; + } + case 8u: + { + fa = 0.0f; + fb = 1.0f - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0f - as; + break; + } + case 10u: + { + fa = 1.0f - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0f - ab; + fb = 1.0f - as; + break; + } + case 12u: + { + fa = 1.0f; + fb = 1.0f; + break; + } + case 13u: + { + return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +float4 mix_blend_compose(float4 backdrop, float4 src, uint mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0f - src.w)) + src; + } + float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = lerp(cs, blended, backdrop.w.xxx); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = lerp(backdrop.xyz, cs, src.w.xxx); + return float4(co, src.w + (backdrop.w * (1.0f - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdJump s; + s.new_ref = raw0; + return s; +} + +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) +{ + CmdJumpRef _755 = { ref.offset + 4u }; + Alloc param = a; + CmdJumpRef param_1 = _755; + return CmdJump_read(param, param_1); +} + +void comp_main() +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x; + Alloc _1696; + _1696.offset = _1681.Load(24); + Alloc param; + param.offset = _1696.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1705 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1705; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = 0.0f.xxxx; + } + uint clip_depth = 0u; + bool mem_ok = _297.Load(4) == 0u; + float df[8]; + TileSegRef tile_seg_ref; + float area[8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0f; + } + TileSegRef _1810 = { stroke.tile_ref }; + tile_seg_ref = _1810; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11); + float2 line_vec = seg._vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + 0.5f.xx) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); + df[k_1] = min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + TileSegRef _1930 = { fill.tile_ref }; + tile_seg_ref = _1930; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1._vector; + float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); + if (window.x != window.y) + { + float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx; + float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y)); + float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0f); + float c = max(b, 0.0f); + float d = max(xmin, 0.0f); + float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = min(abs(area[k_5]), 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0f; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba = gradients[int2(x, int(lin.index))]; + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + float4 _2417[8]; + fillImage(_2417, param_36, param_37); + float4 img[8] = _2417; + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = 0.0f.xxxx; + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2522); + rgba[k_13] = 0.0f.xxxx; + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2621; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x; + } +} + +[numthreads(8, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl new file mode 100644 index 0000000..e174713 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4_gray.msl @@ -0,0 +1,1348 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_297.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_297); + return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = as_type(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u }; + return CmdStroke_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + TileSeg s; + s.origin = float2(as_type(raw0), as_type(raw1)); + s.vector = float2(as_type(raw2), as_type(raw3)); + s.y_edge = as_type(raw4); + s.next = TileSegRef{ raw5 }; + return s; +} + +static inline __attribute__((always_inline)) +uint2 chunk_offset(thread const uint& i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +static inline __attribute__((always_inline)) +CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u }; + return CmdFill_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdAlpha s; + s.alpha = as_type(raw0); + return s; +} + +static inline __attribute__((always_inline)) +CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u }; + return CmdAlpha_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u }; + return CmdColor_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 fromsRGB(thread const float3& srgb) +{ + return srgb; +} + +static inline __attribute__((always_inline)) +float4 unpacksRGB(thread const uint& srgba) +{ + float4 color = unpack_unorm4x8_to_float(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +static inline __attribute__((always_inline)) +CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + CmdLinGrad s; + s.index = raw0; + s.line_x = as_type(raw1); + s.line_y = as_type(raw2); + s.line_c = as_type(raw3); + return s; +} + +static inline __attribute__((always_inline)) +CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u }; + return CmdLinGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_297); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_297); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_297); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_297); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_297); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(as_type(raw1), as_type(raw2), as_type(raw3), as_type(raw4)); + s.xlat = float2(as_type(raw5), as_type(raw6)); + s.c1 = float2(as_type(raw7), as_type(raw8)); + s.ra = as_type(raw9); + s.roff = as_type(raw10); + return s; +} + +static inline __attribute__((always_inline)) +CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u }; + return CmdRadGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u }; + return CmdImage_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, texture2d image_atlas) +{ + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas.read(uint2(uv)); + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + return rgba; +} + +static inline __attribute__((always_inline)) +float3 tosRGB(thread const float3& rgb) +{ + return rgb; +} + +static inline __attribute__((always_inline)) +uint packsRGB(thread float4& rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return pack_float_to_unorm4x8(rgba.wzyx); +} + +static inline __attribute__((always_inline)) +CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdEndClip s; + s.blend = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u }; + return CmdEndClip_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 screen(thread const float3& cb, thread const float3& cs) +{ + return (cb + cs) - (cb * cs); +} + +static inline __attribute__((always_inline)) +float3 hard_light(thread const float3& cb, thread const float3& cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0) - float3(1.0); + return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float color_dodge(thread const float& cb, thread const float& cs) +{ + if (cb == 0.0) + { + return 0.0; + } + else + { + if (cs == 1.0) + { + return 1.0; + } + else + { + return fast::min(1.0, cb / (1.0 - cs)); + } + } +} + +static inline __attribute__((always_inline)) +float color_burn(thread const float& cb, thread const float& cs) +{ + if (cb == 1.0) + { + return 1.0; + } + else + { + if (cs == 0.0) + { + return 0.0; + } + else + { + return 1.0 - fast::min(1.0, (1.0 - cb) / cs); + } + } +} + +static inline __attribute__((always_inline)) +float3 soft_light(thread const float3& cb, thread const float3& cs) +{ + float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25)); + return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float sat(thread const float3& c) +{ + return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z)); +} + +static inline __attribute__((always_inline)) +void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0; + cmax = 0.0; + } + cmin = 0.0; +} + +static inline __attribute__((always_inline)) +float3 set_sat(thread float3& c, thread const float& s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = s; + set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +static inline __attribute__((always_inline)) +float lum(thread const float3& c) +{ + float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375); + return dot(c, f); +} + +static inline __attribute__((always_inline)) +float3 clip_color(thread float3& c) +{ + float3 param = c; + float L = lum(param); + float n = fast::min(c.x, fast::min(c.y, c.z)); + float x = fast::max(c.x, fast::max(c.y, c.z)); + if (n < 0.0) + { + c = float3(L) + (((c - float3(L)) * L) / float3(L - n)); + } + if (x > 1.0) + { + c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L)); + } + return c; +} + +static inline __attribute__((always_inline)) +float3 set_lum(thread const float3& c, thread const float& l) +{ + float3 param = c; + float3 param_1 = c + float3(l - lum(param)); + float3 _1048 = clip_color(param_1); + return _1048; +} + +static inline __attribute__((always_inline)) +float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode) +{ + float3 b = float3(0.0); + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = fast::min(cb, cs); + break; + } + case 5u: + { + b = fast::max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = set_sat(param_21, param_22); + float3 param_23 = cb; + float3 param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +static inline __attribute__((always_inline)) +float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const float& as, thread const uint& mode) +{ + float fa = 0.0; + float fb = 0.0; + switch (mode) + { + case 1u: + { + fa = 1.0; + fb = 0.0; + break; + } + case 2u: + { + fa = 0.0; + fb = 1.0; + break; + } + case 3u: + { + fa = 1.0; + fb = 1.0 - as; + break; + } + case 4u: + { + fa = 1.0 - ab; + fb = 1.0; + break; + } + case 5u: + { + fa = ab; + fb = 0.0; + break; + } + case 6u: + { + fa = 0.0; + fb = as; + break; + } + case 7u: + { + fa = 1.0 - ab; + fb = 0.0; + break; + } + case 8u: + { + fa = 0.0; + fb = 1.0 - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0 - as; + break; + } + case 10u: + { + fa = 1.0 - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0 - ab; + fb = 1.0 - as; + break; + } + case 12u: + { + fa = 1.0; + fb = 1.0; + break; + } + case 13u: + { + return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +static inline __attribute__((always_inline)) +float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0 - src.w)) + src; + } + float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = mix(cs, blended, float3(backdrop.w)); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = mix(backdrop.xyz, cs, float3(src.w)); + return float4(co, src.w + (backdrop.w * (1.0 - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +static inline __attribute__((always_inline)) +CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdJump s; + s.new_ref = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u }; + return CmdJump_read(param, param_1, v_297); +} + +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; + Alloc param; + param.offset = _1681.conf.ptcl_alloc.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = float4(0.0); + } + uint clip_depth = 0u; + bool mem_ok = v_297.mem_error == 0u; + spvUnsafeArray df; + TileSegRef tile_seg_ref; + spvUnsafeArray area; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4, v_297).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0; + } + tile_seg_ref = TileSegRef{ stroke.tile_ref }; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11, v_297); + float2 line_vec = seg.vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + float2(0.5)) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + tile_seg_ref = TileSegRef{ fill.tile_ref }; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1.vector; + float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); + if ((isunordered(window.x, window.y) || window.x != window.y)) + { + float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y); + float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y)); + float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; + float xmax = fast::max(xs.x, xs.y); + float b = fast::min(xmax, 1.0); + float c = fast::max(b, 0.0); + float d = fast::max(xmin, 0.0); + float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = fast::min(abs(area[k_5]), 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24, v_297); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); + float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); + float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + spvUnsafeArray img; + img = fillImage(param_36, param_37, image_atlas); + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2522; + rgba[k_13] = float4(0.0); + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); + } +} + diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv new file mode 100644 index 0000000000000000000000000000000000000000..17c75317fdc52c79bd1ffd7073f75e80d3b670aa GIT binary patch literal 65980 zcmbWg2Y_Bx^|gP)OhTvu=}HN`gY@1Zp#%uMJ4`BpG*W0v2`C)|q&JZ!p!AMn0Ths; zfHVc9DM&BU+y8mq_pZsA`xU?6|LWnawe~)HpLWap-kC|q42#cLReh?NtNzvAGgqx+ z&T0mfsM(~y-WPZ_b&>T7&g$Jwh6tCnp*HDlEm+({YNJ!ZW63H_?- zR>~Z-SD%y8pCUGX2mhOa`W{FpJs3QsYw-5l4<52b*YJrGyCzRLbnM8kv17)M>>4p< z(#YXc#vC+Kzy5{anDN8MP90G?^u?p>gWO?lGIs2Q;ne-;Wj)Ml!l+S`M^33wW~>Ha z8$a^ku5SI7nYzk){D0Ug>tXAxX2mw$x3iiJK6%Vx^v1TG%mZT{Ij(EM0rHuvdEiqf zOdY;o7wR&Tr%akUe9DTg4y-tJl~&|-ySead<*Vb&O+7u&%+gu`&EmigMc}y9@LQhKjw(kDeuY2sMu7k&nn6jUJo3mOL zttSQzx<~x4lUvVrj~m@;O}cp;-_q3W@pV+ofKA(%|1A%u(V#J7$F@e|K2h_SK90_6 zC3x-E8iU-UIR-PV31a)V6_1_aO{rB8V|5tCv-E6|x z36mUv{a5pt-v7SUnsD;B{It0aY95nD59{t4ZiIGU*F}r(f9-5C*S&SF*QR~?x{mQ% zbKS36za4kTnDK)rb&qIGcdNhZ9%X+wgwu!Gx4pmAJjRS4G4c><$ExOhx2_k6(Y(I2 zv_q$BUbR{p7XSttQ!*}W8DOtv2Nauy<_)?-eXnwD95@boPN~4 z?f$BHOdo4!rRG@2bx&!HQ|>qq8H>Q-)mF`M4rh~udtIDFC!HJZedTn3sx=QosHf-u zuk$c{OvOBGO^nRLHsJO=^sTmQ$KGmO_vqH8!ak^Zlyk5HoIbStwCShjF?|l&nsYF{ z?HtuE@U;D}y-!byzVAf)^nGV=`o3E`?j~a=?$^z#xB8^!F+G3ZY7aR1TYlQ?yP8M0 zjj6o`>_Z*D|Ju)FE&92a@mjGus=cYR*Sgwqw-`TSv$10)+S7I))IG|++MYhtzHL7> zk6~j+jvqnox#3N2y5E28?eyG5f4%RfzsAkHj&8@^dg{1#XPqxKkLi8wtj56UgMO_! zm3w%= zuE{;7AlOFUhcaqx_vp#geR+bIN==(CeEx4YG4i&%X&-HydGthY_i;dVSRKpz#eW@q zPsH}Py?!Ppm&l%&N7gahH~0VdsP^;lBIXfA%pcYj@i2X z{10OuSHwKNC*~a0sddcyLjOOA+Md&si@2v0aeq?BtuLJa|KEo;y|cZpL08c}=i=4i_W9kfx)D5i_@t2|$1iixsMXf9 zw9bj^(T5Lf&-V>tE)af}xn1mIT-S&Rbat{HmwTu&I;A_cUqc zxwB|4Yj?%Ul(o8J^|?gdYtFlx&GuFda@Huv08}zo9=c3 zV5?7m)ZW59kKmv+jet5JyQ5Q0p`PCSK>Hf`aIU- z(@{MR_TJF9nyLC*?N7hD#*807a#B5I)91Rgddjxr+Mk8}ocMH`dkZ8+XZ1{rcY6tL z<$9LdYkXgJ*b8F%lX0g%&!M-+)log)f6In_>t4L0+7`|hBz8x&8#sCQ0+)He z*Q;G;wQqwDYy0=)9oevt?!`N*{o%}NOlxo!VdGtU<`>=!rqC2jxJ;(vL=eq}G-QC$tsxb6gJ zTz7-Zas94WyUx~kh2{RZukGJ6uKOGQ4>tHiZU3HeJ=(DUrNJL>`}d6N$%g&u27k8g zzk253`G)-;4gONwe~tM6vtfU&!T;U%Uo-x1H|*~=`1@`Dwc`I#!~RKwf7bS2JN{ob z>|ZyyUp%m0>%_kkUan8S2JheYU&RyN^MBTcefD0QYZyHHWKD4P$-3ZjpKQ>pU1zmX zgKyII@7X7tH|&FY@s4Usc=GN6PTqaMW!`;zwd<^g_2M1XNO$??9o32OcAS3IX<$Boc6E;*(KY?EX=in2uQr|4 z*$sYqgJ0R;*ERSJ4Sq+1-__vvH28fD{&0go+Tc$$_%jXuVuQcj;BPeeTMho72LHGh z@2Eb9XMMf~m+RA~qxbpI(cpdI`1kD7{tf%g4L)nzzvp^3pkbf0!RKoGua@(5-iCet z2LE>3zvp_kP{Y1RgD=|l@422W(XcPoi}PFnZ|}#>>Y!e>&gzg}yrVi6?!&Y9^83M~ zYWx^@NPRtdywT=KI3Mj=ZC(KTu(qP=zD|R$-{2cH_@)iMQ-kl?;CnRq z-VHvg!N)ZC*an}_;72w1F%5ofgP+jg=QsF84Sq?3U)JE)HuwzfhkAHu!)BpR2*=ZSVyf zeBlOPtihLP@HHEJod#d8!8d5|jT?NE2H(8F2RHau4Zcl-Z`a`4H~5YXzDtAe*5G?K z_&yEZ)r)sj`@u(bx33Xn#B4mT5!ZsdCwFOT@f+pCx-`Q(s$1d1`9j#wsZWB(j6bN0 z&!t^$|BmYK+PS#-dGf{Dp3ky68RMV&wDR?>-T=2gzjw7i(b|W1qKzDZe`obByu5C` z-{2qi;(eYso-|?V#H}`O=RO6!{CNSnNBLdisc4%`7(Z%E>pTB` zmE2=|*T}IW$Bi66g?0BcvU;DEKCyesev?PyjyhvYp8?!@z0EgBU3RS1rfuU^RqZ#@ z@71Oo6OYSJdmQx-4@76Ft6_x7&lE(rYZIpo9~(pRz+RZKm7?vJf|{P+nmB-Tvm zQzmtfpIjRfYvwvu*RWv|4(YM=Z;f*nY$JH688>14gfSx$ar*qsioNyyT+evgb1+-< zVPiO{+CRY2kDvc=PG^s8@)W+KiFNwi4v4*1f7)^9z-E7X_qA=Cv$l;HKdKme-&W3V zVPl~jJ@eGo=4xrqqPg4J+%3%!P4CY%e>fpAKl&yrM=y!`LLV8XZ2V! z@AKC-cGmx5TL7Em{NJ_vc5NHcJ*7JhJ+FSu+_v}irc+1Z6XeYqmKT z?DtODcbd)Zm*9NZN@YWt2F>+%2=S}$j7OnN2 z7cU1L^NVoi65Lt61TM$-N`rp}=bJlzFs9w_&l~nHd-0Cy8@TTv%`+1x=7>?^e$@bQ z>zd^PJ2|z_eCS?H?8^e+vM&oZ_(Banu)!DY#rsxEz*|2b!Ap?jTN8vNW|yra4Rp0QpA9(QP$U$M64`ifq@oz*pPE)c9j?o;#M~;+=Wg zxCeJuJAkME971QcBfOmNoqKV;>(I)+-=k~ zA8m{KAC71?Q7ip?{#9f9w{rNNiA=4yzAst~>|q}HqHWspUEZV|_HCsC`}rffpFM)t zEbv+dUc12S7I?h^uV3H|3cO)~{alk=n-+Mp0&ia6K?NRM;4KThRe`rI@HPb=Qs8Y1 z?B}I+e>z*AmCCsskIlJk&vi%3?sKejsm*6fv7e_r#FGm=rNC1Q?B}icA6ejIdT?j! zGuJFQ`TXhL$Y(3}frpyU_a04+EzoNZ=~wQdv~^sz^>alu_nn&0{?Yurq&9bTG|qY#wpm+mMJvHZ`&0L&AzJnUL<+_{G+ydq4~K-ZS6wy zvyR&Oh2}WbHZC+j+o)|`Xnv+q+p^Hy3u;3O%{fvVT4;WzQQNuD{4Arkd!hOHMQ!gw zbAHwKEi^x`sEsHz_leqmEsgh@_1>_LTXJ0p&w7R1#>2g<*3rV9t@!h2Z)fH6P&tQk=g@P^vC4Bj5xX|(J~MfU zpDOUv1%9T$K2yd2#R9)l;8zR$dV$|4@S6pGtH5s;_?-g#%#^%7F9mO2;4KO~sK7o? z#ecg3?@-_!3%qNAcQ5cB1>Up3dlh)^0{aY>yayK8XRPRl7x<_GPb=_`3Vd9Fk1y~E z1wOIBCl&ar0{h&SK0HxipV6ZG{1m)Ufqiz0zG#7$DDaX6UZ%jy^brqGyf*9w_S&FrE?U}lQGC3%i08q_%hN!L$3VE31B?3Hc)88g9lE-F zVIr7DpGEnvIpk@puWh`3t7%)5`=r-y+ecUPT5Fu?TEEKsmTjUL!<^1XyT6^SYplB3 zyx6>l4Wukg-X*E^Q!~d>)N01_I%}-usT0flnz7WJlNG54Quc(4{TFrR2 zHJwpCjg)!!4*=2%jUvk_dIacpCp&8ZV- z3$St2=3@+lsBPa_^T9QDA1utOY)Nff{j|ky8}n^Nt&jPPZNBZOlW%*l`P9tkeb@PF z$LF&=SYLJH?MSU=Jlh&?H|oUmer!B7ZF^9sFMERJw$)F&zcY}w`+{w&rmdS=n|X&( z%WbCu>qMS4Tq8>z1v!so7`}z*G zSRdoOTVwUu?B5r)-j3T@eFN`T`#4V>ecIgq>E8)gOP>BURx|HRHFtfNhZ}2VwEol! z!e;^7CpG7AR%$il*w!{HV*eJ{V_87Q&X#)C&{K&&1j->h@BL+gKOSh^Al?Bbs+sc5A7Q^ zs{YF`QTFf3+U~izm6OMzzkFzmAGJ~Vfo=Q7;YYUlMk_E&o~uW})lxhUror8Zx~AO; zlIo~U!cT7TjWq99ImYvq^bsuLE4t+hZ<$fRI;lB6* z*nQ!(#q~Q5u9;!9?8ifluWoy}@3_@$e`1ZPTK65lduii^THm>d%cA}NPbp*kpRo5m zKbQL*j(t0g+PP4(?{Ytzt7U%AgY*BM`Mp5G+SmSO?0%Ma>{*XH;GVNt>&M_( z|I}X3XJnFHE3fCq?hkj~)s3ZX7Vt6$jNdM?=ZE__cLlBzgQ-`6=Uj`=IJ8-@CB{GD zcdvEq8sQJPd5-#Kv#hTe zxD6LW$AP)m;d~FE*k*_qOb&hRlibfn9-c?;Gd~kKPv-HSvL@rtp4QL3`6&vgLMw)S z@aHhBjp_4??~hCFyW?@3p=g4%gpz!{Pe-ZaCc9cf;ZCMZbp%zXH4O zh{L^i_-;7d+V{gH_x*6VAPhwqES&FA~# zaOcDK#o^YzFAjJ7zB3NDzM|mTePrsU3-^1{aP58%8t${J@0P>uzwejB?Z5Ar!>xV49BzMo#~g0$ zJLYiv>pSL>`<}VvzH1J5Jic!Zx4(XW8t!=f4mDi6-;sv<`N((9;f}}m&Eby6_s!v6 zpZ%^g-0QpVoWqUhJLhm~-#Ley-|s}jt$ptt?tSvIg6r@1m$6&>-nrzycMjLz?={1H z_V?X$xa0G^bGYO4y>qy=@10BTd*_n-?m66ies>yf?fd6&?S6+^a^FFh+;`C7*1m%d zx4*uF4!6I)gATX$9dyZk?=0VhyS~qyCsBM3_4!}}?)kv#NJXB%^_!wfQw z)fM1@)V9@6yM8`1X>$&Y|8uaK@iV5Y;Oax^r{AGn4OX+g&rI^P{{>jBZ2wER`aSiy zt_7>PK3TWx;LF0*v?urVU~T5|c}$*MH-MYDZbVbhdffz8^SLtZZic6wapn4Hcc0xx z@tMEe2e;RHeC~ud_sU&p>RJ0=f&HwYp8otAY%FbwbvM|2<-Yz6Ts=O&1)Hz*`5jz6 z`F;;JpL+7$12&enp^UGCdAJvBKeRc1pKH~ebM1cst4)LtX1)IiR@2vKUb)!kUFYaC zp35Hq`+RQwG41|D{Sd`N`@^-}I&1p~*cdm^FK#W>qhK{Z2YgNp`GXW=8C#sTe*xRp z`SIHHIN0ZHb^9iNjACy4VSDSee*$d#%+p`NYJOjpv8k0~vme@?q;}lq_L}oF+`Vht z#D4}&{hE4g&w|x#XN;%7YR3N?wf(bp?w+H5p5meXh1zayeCO&PU}HE>;=faDXB@G9 zp7$?<&!w32C2F}g&*N9Yo(I}qp_Z>x^=o|>{!j2!d~BmF*Q9@ewRza?MX*}N{Tf&; z<9;2i=D0t>{|&H*{nqwxikkfvJKl^#J#pUx8`l`0QhR*CHzMkCZjaU5)TF82q4x9Q zyVRZ3?@{}?@qKFkUwy!Tyw-h4?Ogtcnp#?5?w0nU7W4wh#= zzXE4IzXZ#j&n1cVHJGn_TA$IhId`8?tBb#(&begg%xmp_n1Q;(4s$p^oweOM^V1h> zjGRmT;A%OS7Dw%!#J%J?CccV>p(@ zz~8jAd-GZ3h$cyI}7Z)9AN+Rk)hpX=N;{ zfgMZMelUKk!_|+e+pPh%t-51cnOZGz_$u)b@9`%$asezp!+J^fr4 zY|Qj?J+PX7$)^_o4ZzNE_=aHHpGnRo=>JCADC+Uq7+kj5q~>X}DO?}*Z^6iuPHn8C&}HweB_3zUY_! z4h7p^$K`nC`nz}C$L>GtjCTjHb@8U0}7G$MQWX#xk!sZM(s?%{mMNd%e!}$MqbJ zrap~v$wz?I{O&q&M#9TDqu{Qmx<0lW4OVyW$oB&qOFien7;w%3?Kub1zx~nnrPy!h zPHr3L#(8pl*6G&)VEdJQG8U|seWFdx_L+O2t|cYf%GG zv$qZdo7dXC^#kg|DIWIgh}v$Q`8yJ9jO?wWz-rlB^6yiOWo&WUP6ON4Iq^H7qrvtq zd&~D>$H3Jm&>#2255a0#?;n9(Z(}(gxjy#SzMIoJIgbUKGi!4kSS{k^T6iUb}qF%&nxGHeO|GR zHhq3ftuDTRI(zFvuw%D&>=#r2l;YvoFRAU;S*xFcjgh@|DOfFgOMVf>SjHCHb}%t7 z2mAbySXaW0WxuYVmghO)=U|@`)bpHh65dvh@Ht_B-Z-58fqtGP}-m;D0l*t0JN z)8?0O_4MgluBj0UJ}B^L8J#TKf5Cu>CCO?Jsck_&f%7olBp`;p*{u0&MKk=dW<}+>@UKyPn=} z%wY_D65}awvoBAhsmJFTaI-JZqN&H{Z(#dU&eh-H>gmgK;Pk~D#?U8o`8?SE>gT*Y zO0AY0{{X9%<9!jX9-o(LpRyk>qp9av_7$-G@LAUO+GBgQu<7Udk!PNN!L|RBV*g&C zR!i>J!Oi~t8%;evZ`3|z|K3DXPaoa_+rMQfw%2aH*QmASTzdzsmUi!gGcIlU{P8we zTiU%3R!h4Nz{#i0>xB8<18YmW|9~BH+I<90J8fzAAy`}5eF8RL_@`if!au9|k2rt4 zhI|g!M?F4YfbCECmo-nDui*NqXT81#8%taKz5)B}t8H4H&x??K)Ry>t&>U-G&ESQK zq8^_Pu=5|@2`}69h3li9&yM}T#?s~-e@uHdV{l8XW&~eQ#Ox2(C;Mk6_$~1A9GDqR zJwCJ4KII;p6-_`o0{z}tlM8qZ9YG5FNw{0 zx~ZP0%h8rXQ#bzrY;xP5TDP~hJyo?dHhuE^B+%YxNhYw!E=lW21a#qn61 zYq{FTT+4&CS^M0&BK1lX5A7@0cI(W=cfrQEfxb~ytMDJSJh#eMpcu>8;*KX-4X}Fl>zZKqNoQ@#wREjo_nxGWdn<9*h8x%X&VgKi$LM(Nr*-ELvtH|g^Z7S1*GE%N%niV5r*!G!=ZNO@oUu|mT*z8Ba5-F zVCTo$wcC?=FN%l#+Pk(}XRP~xjghtc9#}1FC*Ol&EMtq)whL_Atlhp~`<83V66EcM ztH)=Uob|P>{g>-w|Lm(dtdnCn*c@4l5n#2fuQoN?XZ-dx>pK#hbxO=pXzGbM8m#7+ z({4X_+SyOJe&zc5uFROO?^tYE-vhvMZLaTlaMpJmSnm4zGyD_4Cs5l)o4NLjvq^%~y)%8D+TJGocLD;5%{d)rH+9y)0iKkL$zZ?X1zOCIahfp6%@o=1n)pqO5 z-S@%9$bR_&SS|ZSelW#Y#unSwejNd}FNt*|*jU*cM}gfN>iW2Urh(P{F5MVMgN>u^ zcn+slOUz@y#ti>q&CTIE@E^f_2d=JhSrys|Gje9BkA^kWWuAXO+6Tn`V)YG>U z!N$^-*e8LFHLdRd$zc7|)2CCw_Q}3GCb@pDf$QNsT4zkBf*oV_?~lQ1IVZHK**A?JNqfuuRJGwZ){A@iF2^!oH!dS*XB8K9ysU3 zxnQ~Hgukb9KKN8>+h{Y_S=8!TqYJ@mj@g**m5bo&@wpi6cU$iF`1}-2J=^MqAoYV4OQjBG6 zaoXMjwypiRnOdIj3vUJYV-D5xec^3jb^ULmmS=7506VVC%bj57rCj5?;Og=D71%iC z8vh!uo^`kzY&-SL=WoE~)0SAj1)Hy2_us+QGJ?3&$E95+s2}OJnQSbfrr4_%=-swxp~L3 z_>Y1gsky#?2J7Q^y;l4M?3|SMp2y(oKd9qB4ptjVo1wIM0&E<0|0Re)XnzHpOPl?8 zm|D$#XnzW#u8)0x1+1QX&8uMJsN1i9P^%^8Kf%Tf|5we;;eGHmxc3}&eO!ar!RqP9 zzrn`6l(D5BZ@|@aAAA$+eNa7pdkbtVZHfIh*jUr*{qPQ0KlSwKU9f$!?~X~XpVzJH zsJ-@BXH4&b9b@iU?}OEHAJnF1`(IGo-fNio+}j_5b3aSW|DdTS=0{+)4=8E(F*xn) zr(D1CKIqR)8PogV=h$)|{0uDD=6&!>aPEU&faTr?x5V}p_$So1(Ppkssnx|_Q|G$> z4cPHnd)@ED4{Ocf4{X^Uy#uV>I%Dkw8za~KzHqf%_vQZ3m$7Usw(Vf#8NuayqnY5w zvS0p;yFB-SnZe!%)N>!01+4CVFy5?SW2zg&pPN_9-k1&S*mEy%zs`=PzCEjHi~(RZ zi-+f*?R|$Z2bwX<`^21Zb@#39zXdjidh*T%ZswgEO+9(%0jpWOPH;N(H|yUwn7=ZLodG{mDLm2Tk4cQNAG9w(6NjwXEC1VEdDpi-3)p zXAffxL{rbt-7E^Wow_moodq>xIJb*~jp6U{n#b$a5@_nlyCm4W;Y-y#IhKa&qn`Wd zGGOCqGq1lJp_UxWf$dB9@-sFi>W<6b@lcEZnqaj&3#J=UHG=u(`C^4}XV6&3<@ZZw^+=v%nT$HH(L1^4zxVW@y^;EHDV{vw(WmWH8ux z<+H$+aD6?DqmNqJYz4MW`7E%toML|4=%bc4+kkCTJ_`(i+a~?hM=fLB4s07aOHgeO z_BxuFL*e@}`Re+(XLkUr=YG2**f{F;Yg=ly#M}vN%X9k3k$K9|}y+RQbITHW>?tjl=t zE#%TR4lLJZ`-x!vwN0Rw8{6M+KM*{C+BVws8B47$obf-E5Y>Yf-912#;bB25h#aPA`r|tK_w#~ly0ocB&>obV>hlADA<_K`wcz-$) z?m48ckLS=)VD;?NX<*~1JLbcv)e`e)urb4rsd@VKL%91@T_5}PBe1$Y_TyNvaW5^- zq2u7{Ifsr1dk(3mZzq6_r7f{f1h@KFpF=0X^;1uuP6pd2`|gkX;ZU(_JM0=KF^^agL5t<=4oi^iFrC$t$bH;20ZQTr(D1C9P;-L67x)K z{(edJ(^+sezn}MKR?Y!?`0h&E*_2x;j#q4sbE&hZ&#P_L?&%AtFQj_bWv%(#I+T?({Tao(<@*5>_|E>+iq`;b80_r>mm8^D)Q z9H(QmotnNGdAL6*jU=)cOO_kZSng9*uH6V z9d4sm6aSGq=gj?J=g``7=0WN|Q9PXEhibcZ#_}-O7&&Ji0juSlkv~8&ma)ZY`zYAH zg#Q_Atn~XYV13l%^B7p4?6JqSGhyob45F`3fYoy@{1vR`5$(xZW2$yZ((W&t0#B<=TAidJ}9cF?X%^1AR2T&*0lTKwOw{jXyVcr2~ngRABK^#Rz!`M zoH+jhm)ES1;A-VH>tlG?w@=_|yqp9aw^bOeSfO^*5g)^46`1L{4Ph0$E0NXchuFr>HHTOd1&03qk-=Aw( zC)hc)_8QiYdPa(eZTr`D>&*E~U}NMOHZxc)*D!fsim{9>PTN_)u0`6;3Rlaz&kpu* z-L=g|nVVu9apKGYHct4QaQl?|@3+90F<$+&#cwXKHs`^=Yan-=<~4`8;yVx6c?_Qy zZjQ|Pd|-XlbG@1$oa>eLv|RwKEqUc-Uh~IyL9l((*T1bG_Zm5fdGT-6`d(XIyMK#8 zEo~P8x96e#xppAjYoWS6#_;djs;A9j;IwIdmj&OK3zND&OVGx@f4ev(&*MvijiYWq z{F@JIiMbTmn91qi$+fTg`u8B@vH5p%ZLiJ04I$69aRso~n)3UW717l7_isnY{appe zwi5V5YI7J%yRrO?zcSdluy3}N>+d?a=Usp6jN!Xr$B^}21+129nl?4t=R9)Hr|+wR zbA3+C)zH)vb9Jzq=T6$K0Z%*oDc8?Dxvu*=$Hqji*5*ICuKTw$O0G=^NC za!ig*?(ej_7S{6o-n+l=ZQIRgm+RN&V12EYW<%4TFi;hJN`|&;Mr=-Dc<@w*6ha| z;QXyN|4n(@q+fHvZJ)h2H(a0WE4e=Un}43#{xxyM>6_0S^H4neyVmx{zdbWojm_!b zzOz5`Qrq6YpPKeef9v0rpyF>MPH*quk`j0HQ{-X;%2E2dGGhe&I?N_Ku2$}`QSjt* zPUOaNKJ-t%{lMnaN1Hy*se1Aq0JaaAv$0_3F?&p|PyEM$jgj|@s5oi#X?808wASa8?il!EK`;|4#y;QF6gaQ)A2@N)~U|AhtD|KbL}q~Q8rUU2=d zEcjL6t81S5JPy7Ie&wDx9!))a;smf-_JnhymNhyFtd>1-GFYwL6Q{yACXZ|EoXN$` zm;Tukr-9uQ`e@U~Iag1z{l-2b)hHZTh%p)RXTTuydR}@e8o~B>TA@d@Web_TG!H1KU2= zNx430e?8bSr~M6JwY0wxZ2RRX_E)Y?+TRRz3~7H0SS{^u1>626itXk4*xu{FZD8$Q zBgDBzY>90M#bYZ;H6vKU;A9pD(!n|7h@+3a;HDa z^?$d)-!HiS9~E5xPa6EQg6scf!S(;T!CB6p_0zwz;QIG#@csqYf7XKQKYPLbyS{VQ zJm=S4`ThVd^ZgO7HmLUTn)CoT`P{>DW4V9zPrg5a&8LqxeLM%$lkZ`0 zneP#}T6xX=Gd%e`2js?b|LdQ8kAcmnk2ZZght!kr32>S3uW+?XYahq=6gc@jhvdfc z{Lw%8o&lRrA8q=04yq^L-@s+Q=iq98uYJt-0yz0R2j#}{{L?@AUId#@A8qxcfy_bJ$X`e@U~ zYmj>KeGV@3eF0aSx%M&NSK#FP5-c~?*VOta-#1|M>7z}bXtuMgLdGG7N=&3_L^ zAM^DEC!g0axv{){>7RTvg3YIoHho&xKydQS1TOQ<0$20j+eyCJz{%$|P;M-*fBGli z0I>P=(WZ~rQ1#^d7Wi()n``7;aIc$QBlVZ-6aTrvUNiCxFb`ZU_Ibgcj*_gw7Dz}1p- zS+H%?lXE$+Id7_S%Joak6~OMR#9R@qW_zC#R|4DKpXGL*LBaLgsKGZWxc-|LT>n7@pNTo$vgVFg`&w}Oe8OY(9Oo>EoQLC*M|J`;a-?8tgpgIwIF6{@Z|!aUXN+_l(1nnr%F%h7xBdu(7n+U;n+aT`2Z9*ABUU=5gQd zTHD>z;_T@?u&$4dnq%@hwIA4VW)I2rvAy#?rnb8#;;czGwviN%VU(=N@EY$+J%Zwz zjH1q(j3q|7CKC(pnoKUZep3s6F!+#~XHCYz%~P()cr^8_$po-k*2F%jWsVL6t7T0l zfz`@2nF4og&ar)zi|wEOS(Agnu8BU{^l^;p$#*DNEn_$g?6|T9a((o7p1)t)T?28} z!1wRtC?5M$vIYm#ct7f~6xU!pb=KerVw7uebirMN9~E4`;|hKP_{5s0|3|{jQ?9{L zXzE#mX<)UifqhWRoE!sI%NqO;tX8hUv2e%coZ3gZ*#7CCH8>vZ8t9`**GGTn`IOr38i=z72V$E_@$lbv%^FOu@kHt=6xZM&>a4+O#K;;r=VumN z{p^~jf2YHZQ_k@jXzH2cpMcezWBcuM^I35F;vASmE;hdYnX_}i&Y3>i^s$fXnX_}j zYWH*P@P2YWSnWJYd@ca%lRjMtRvSX`-r#&)1h%a<=Tn~f^j*VY6c7Jh+|1`8HFiD^ zr8uA8r*=M_zsre{`SiWkl{L@&T>`gVIe$MxQ_r5e6s+dwRNvn&kN;(G^~~oLVEf|y zm{Tq`pT5qgWB57PcW?S=bDU}!r|<3j8~q;sTfZ6S;Wc)gM^GH+QPdgdb;QUx7e~LL z<{9TTaNCvR`~{kN#`#OITE@8;{@23QGtTS5_QkQ8Q!X~2zK+u|+z9qPwm#Y%r&`A8 zJ>-WJk7<;Q^XM8o&SNN!^GDPf=N-hzIK98$Rr8GV7P#%oao&oio^jp=R?9ed!~b@; zdd7Jt_-J&;W=^@-eEK?0$M7q#&mj6}bDU}!r|)M@pm-ch$vBUzvEw|R;y6#F&N%NO zM#ky8vioYDasCEwyKe+O2}I9JF2_i*)$^Iouhact(4i_NF6<8%yv0Iy4} zk2c4tmT~$W^Qjb%lPDSI$u)MIr%)W{kEt`xhl!DKZh-!1%`?si;I=Er`5>Bl#`!0( zTE@9P{tv;`GtNiA_QkQ8Q!X~2zK+u|{29C{wLaP$r&`A8XRM!4JWiuzoTt~=ah^eO zoM%#JoKF!W9Orq|8Rv_{$T`Et!O&gbB^E64df zntI0h0$45M+z$VLz|}L(m%#SLv6)jYHlMzZ(=ogP_H(a3+8n1^#_4m^MHG+oDH-Pl zHFlgAQXJ>S)EVa+#K<`JMSrX28Rx&?wkyZ^8k%~>`8rrF*BNjn!IDaPrLpHlIG)^l?w9C*N$~GT-cQwe$N-KM&@B zC!c#lZY=kM{>k?(u=(`SrjL6{J^AJam-*&_t6f^;n-89R?kTyk+)w%^-vVIs>7z{_ z_oRCAeFt3TTM(}1d9RP3I~Il~pLNZF2n*b4{>w zotSHZ)okx~Lu-R=pYvF*Pui~ocJBPFbUFU(g4JyA^>;n6?Qg2bEZ4{OUcc9`?Oxl( zxwiij+l>^D>nL8^ucf}ez&C)sw%u8(?rHU*b$HiO&7 zzkOnx&Efi}`@7|8TY!zF&A!Xi_a(63PVu;fV&9$1TMK*}*uLLEZ9iIE?+Ipmv_4Dfua7y^&1wI>!9(mE zh|9VD6>UArxxN#gdA|#6+h0?sulo>3pUlbk;A)wZeZd~iiMB4vFp6=+<(#M|?r^aA z5_be#Eo(Rm>|tDOBPsh)j4LkJP(5+SfbDPQc7M2iRgcdBU~^}#$HMhf&s>iKmvgO8 zVvYwJQy=>ww_o--^Y6Tr^Y6LvTZ%_{p56_2{yk@ZLov?psC@=gJBT^i6f8z54uzk*T<8Zis>hU?E_W3pAng%|aqIMMJ>beid)Vl5Mk87cpJ#Rnn zrFfKc{(E@l{2s7#ejjz__*h~%=jqFFaNGE|nv8io+`X{~MO*w%to<_fli+G6P%`JI zg4IutL*W;gn)z6@$-I?IB-C1z`)Z=q@?UQ*t53Y7DCG&VbSl#xH z&GmK;)D!0du(`r7ggbW`>qT%i*EwVTDOmktO4?llHg@>Wz{X9ROX2#cXFe_i8%tZ} z<8rWl(w10PfXhB!2`~HjbGTaB$E)D>Pg~+#4KC;P8n}My@%csVlYajaZXEUW`&zJV zwZ-o`u>I7QSl5H?XV&NjxLUc_Z-lGc-m$ya)w0*UkK9l3_yZ;T`;TDv_XE^EYj}UX zg?8%s3}M?_(bVH}TkSKbPrIM`+>WLmpF3)wJge(-Cz^V$qj!M`(fU0Y+iQ>QSB1^| z`X=_T!OcG2jiw%--_$;3AAgIc9-rUUK4l+&kEWh}+yl0cOHypFJ+^xboB8#1EgkE9 zV8@`3>me`K<4^c_lQRcv!<6iaN5J;s&(!w8J@FXr%KOaY1=s#$gTGMo^y>+@aUFwg<^CR~^YmA+ zbD^%?TxwaPr@*!?_v6!O>hXD|_9@r;Sv2+d{H^vW*ZS{h>RHd{z*%eCYme>u!e)Mb z?XP3kmOTFeH~ae{ntFU*s(s4-zKo_GpI2(1vcIpQsi#l>1n2y*z4qAtRoKk0ul;px zw7E9U|7&2!a0lz*^YZIpwXGTm#=lv#s{DeGhEx+)v&I>!%){55URqxITnCuAGnm0qdilK70gLPw|=0^WtMP zb7+t6Ct$~zGOhOg6iu6LoL|SR=I`QWo~+Z)&%mz1UG;o_t}kUM#rgh{+Qa$Q_60@F z`4%V6SKy4-K7S1_=lh%5p8gZAQcKBv_d&O<_Qak6Z0yWk2i*RtXAL^Rw$+w)eZh_` zydS)bGb3Cd_4KPhSUn|cFcX?Nw8wX5u;Wh28q9*O%{EyBHP;|>XKlZnt68x*S6&mv zrOq{aPGXeT$N6fWd;{Rd$h^z}H%{!|s_pjOSn^yS z=K_0uRL^?M4K{|h%*{Mt^_0xbylCdvZhZTuW_;&jez5leb>~B#`IrIyZxoLwDb9y; z@Kk}HF7Pu2eirOJ{hd1F{tmJA311LyteguA!Szx1Tv&wK!*fB~!j$DG#uX>dK(PG? zUleZK%-3RYebnQ#IM_b=JhlY&5^(+0% z1)Eo!vE}Yl$LhEoU)p&Mj-R;n(_fqY(N_Mhzw7z}#iQKE&%v{X&x6hP59-|K*CA&1 zvFp5kgKyN}n>P4P4ZdrG@6q6UH~6RqAJgDt8+<~8AJyQ;H2AR%enNwv-{2QD_$3W~ zS%Y8O;5Rh*%?*BA&2!IR9_|{I&pIohsmEu<+NXSWSP4zt&j{YbR|eZo-ScP_Y7fsN zZQrFFLUH}YiM=Y={AKLb(A49zdhJs_m#l%Np8RWqZKrPjwW&SKuWc>LHWc%V6MJ27 zbNuU}sk_(RpYr%`0B-hoLo{{$-M{kqZw%JI?E5BY>iWCC6Dp6LUW_b=!O0mB)X7aQYwr1JKm<_qr^P|2S~^6aVpO>iUnRmd7>`oVlI= zmS?UH1UuLE$vn2zCv7KzwI%mtaK>gXxjylq0ya+kr-Cy!{pI?^{~)kA<9{$XW7A)* zPdV4ld3m0BzRA7rx)z5}C*GmppwtK&v|z^*mmlkcSlltc;0C{ zf^r7M{Nlu(2F`er|7bLI$9EL9JpMlfH~ae|GcQoK^dj^K~|wde-_Ju+`5RTx)ITQhrG> zzc{fk05|hrh^B7;&!?8h|6*{)m;U|~Op! zV0qU13b1Q!pUh)hebV+yu(ss>IXGi8mt3FtUj;T!{I3RQZ2HUfiT^cVbH@J{;EYXw zxjwG7^Y2_c=kdP|ocM`(J({}hucem9|3+~7AOD-s)b+oCS|0yf!0Au?Z$(qr|7L1= zY`23m*SCS?nd>{i&b56qk8Sly+dIM9lKU=j#%3A;8vOkR|ER$~ZSXG}{F?^v+jn|@ z&DX!dXKnBS4L(xc=K0T>l*#e3ydjzgxle-?PE@DY*V!HTOJluiZ;=U-!dyAJ{$V zJ>W^O#~0P2E&Z=>zYG2YwVzM^NX^#*)&11`zj}cG5W0Gh+E{<0b{{`P?Ptb^sb`{o zgxa4wdX#z=>OWJ@O8pn=*{C0*o}Kz}>H*YGPd~hjPFxm z&%f}e;kM0tvS;A>s3)FU8PBn%{oldH)7NuT9^3Qa^vm;9o_E$SfPH6e8*Ry@UgmN> z6YnK(;(4CSV|ztj_rr5uo?Nejecx{zZONrx=5oyv?_c0@o?e5S+wc3muDuTLL%k$L zU+syjt&Hm)NZdETW!yL6W!$&m#$AS@ulB^%R>pNt<(mF3*tq4h!Fy=xw)eUxkN*c? z^OpV}qN(feby4mdE`i^F!2W!YakM4Q$6)i8-(h@$rfwXsr}FrJ1~zZ$|2dkv{$6k8 z$@>M^yvEU%IA4LyTRz8qjizoKuha7Qdor50^zVbFuD{oDdGgKxHm`BC<$K=_bmzi; z>0?{38|vlSc%CPBKWxplnGsFh_N{vcIQ}z%n`<*On!5hpQ{?IEEMR}0$T->(XEtzi zZDvPPH;(rrdHm-9H`iuPGwDrYjxD(s1IuGu5}aJ#v*pRP6!>0p*+yG(sh7FDha}!I*vu8a zEZq2hM{oa^gX^Q7pQTwI+y@(BTK42oFY|bhO57E(m3dZzn`c^`XJxoP>S^;`ux;{g zYZbWKf!sjjvnt#;xi75-*H1lTSRHJ9ZRx98*;ntQiMbXw^Xlt!kUZ~w)&}2?O+EhW zg0m(*C&_K=Gm`#UpY_3h|F4g>wBHb%d_G6XlW!xi@0nUUF*k-A$M`-|X;1u3!B60; zkG70ey&S903dy|%w)A~-uspd3fgP)Q{I>+BuRgQM&9fD?{^`rsV8^PDwzMAtPWx@Z z^5okV>{!)}xgE7}#M@JAPyC@^$EuIEj8(lHtItEpy%Vu+4)<1pO z7i?ej(U!ibmwoY>FR@2r^ZMuKGI2ha4I<846pw#V^11A_0>574o2ma@;5TaQv+SD` zpD*90&O5FB$fGTMOoLzB;5QcB=k;4_?)Z$AcXGxNJ2s!E_lKL`XY)hB9$)a6kJS&M z^no8hJp=VvYM;%=Q8OLYcxwJ%P2fMqnn-P*52W_ld=j%vhT;i?VoyLt0h*m53X_i%08TcUe4)>aO0{cmRi{__euJ7GPs-vd2FYE%h;#F zjjf*8YGrKqUfQ1qF2{U2-1h1ji(3580GIPAFJrr(<9jB!jC~f|*y@R`7XP!sr?)I)m3P= z*I&Ew{k-Y*`f9N4m!zBz*4OL0y4M`9J?Zy#;IiM>!@ZW~XD@C5>!a=(TuZH%_BVku ze~$BJ_!vss-vZW0-S#(9tEK&IU}MT%v)jStn%x0c(=ToA1RE!9?gE!<_A9uWep$0$ zgN>!le%wl}mKb+~)so{k;PlIJ{T6P{tmp5*`lx4}eh)T|N3?rttz6H0Ypq<*`_OE! zzjouhX6og=_``Pi>RoE@#qUtRSK#++>^fB?WB%Zc>XZrVod;fm1;ND~Z zS#aCEQE=_=H2Av>{(ix2|6zlFS#aZZa+59lw_t-WTyXn4u)!BExN(-Mx$6q7?kAt~ zlE3fqAYAPM$~By;-qZgCR=<=*^O@mcu!rw-v^_*ob8f}P9)$f7uxqgiaqQcpaM!~) z*;ju?Q_p?$FJP}}>Rxk<^BCAz+7kP5u(8~K`aA*GPu+bYw?FnP`JVze^FNKIo_+i* zSnU}~eopFdVEd%abHzQQmcBd(Hg5jS<>%pQ77zEjdpl!(0o@q6rv3wL8+GHm2i4N= zm%z?rId3ncsi)tsfYr)=zY4eC+Kl@mwOac9FR*db@7LgJ7LR7XUq?4a`u%UPZPbnL zxuIr!_tu+W_f~#K_-(k_TNM30chvOvTK68<7}`8<ZUtBk-%###CH}`?lKGhhE%ReuW9#d9z5Z#> zy37hzOWxVQ&AhXtsmEskxVd(7psB}aPH;2Mx6sscZJG=0`J|rv`P^V*X`9Bi$a~{F zVEdpg_q}<+_T3}ee6{A-qs?Dy<-KqLG~4U1-S}SP)Llo{GxM_$IOmY}3VCdcfL$}^ z)%%7#<6jiqoTJ6i)b$@oEsy^a;O2ZSiKec<_Z_*psj8* zgPkMwtp7S-<6TNabGYx{=M36o%!to8{-D%$!GI@!D`N>_jvhU6k{1%Y+KiG7}&8V)(E(!yI!EFN@?nQ6Cmt+eOdo(lF_r=D~B zAh7XVx16H~!}ayB5BjL3%^_gh_^vi>4u#vs{I=0YEo}}1+r~YgHs6QaCjHe%Eo*oL z*fq@9j|3aTnC{J^!0tc&%qiE${pDVBk60((G_dhpE2`>f{-c(^C!$TQ+`sNM+k3tp N12&)MqS$ls{{yE|O<@24 literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil new file mode 100644 index 0000000000000000000000000000000000000000..9fd593ca43dc26f919b022903612dc81fad0890b GIT binary patch literal 7064 zcmeHLeOMD$x}T8AWI`r!AfO2#nFvNlYRFCTAuS`bRi9bAXhvDF#*j3tr)ab zpjCr51eyy7&Vja~pom1lIQ$f?SR@UPDf-kujMXbvl68Pjh9~?3{SiEjzvB_0PuW7X z2+;VT!Jtu3n9m*`j@dfV9V20WSf669C{>gmLm)^4F`(Z7fc&zeVhRWcGCSZaRo+`7 z41(N%$2JpVYx01MX-e9o2*5=Mo<6sUX2GU2P(dR6N`;ZXFhbeHR(9g)F*zuINo0D3 ztj+Hf@29$T{=-i zTB(i@Vx54BfF2#_@yr?}QH%oXAgGampe#Vsr`cGB&>6it)?pRu#G10|#JKNG( zjN5&voCQS=qFy)VC-D-SmXD-YLYIsfI(?@cfm_$U3bhU%?jN2pBuZWYw`fJ`V9Xl+ zVJ+Wkav!i>KH_O`KIUp|1WAMv;R7dnX2v>M({HfxC@8*1ULp%;~7n8WY3?Bt6j;BZw+?+F9a7AHb@(#TwYn zzw|mDc>-cyIu5fVWvv$arDL`eaWtY$KJy^W{obqI`)=a zdfU(T9$2zt3w{5lcABD=P&nHe9gEVpSjs7BROjCT-84`MTP8Hy99IqU~=SW1?} z(BdQ3)BB4dQh^&eUaQG-lH>cGmY-_Rf3~dUQ&@p{SRKJ)+`7)+!|@~L8#ynkO4dV< zq>+2~vM+yMu_Ke(VrbSKwx+)Ik(+&e)ik%w2^Pm&N?%9UYxe}#hUfH{LX=n zn;-OC8Wf|de82sxS@w;<*g{F)&B1$s=g*&ZWjDo9=ij>&djH0)jsQ@+U^$I_v*FdR zm!7sST-_4?*-cpM$x>?xL}zr;evVY?kt!$hf(NPYK{kdEG@(-wo0awZtY2lGNzZ;S zMWNn_UA=I+cw^dES85i^Ge!O_ZEW3HaI(=cF9kxY275XJmj+sH1uotAHC!l2&zsVI zrv06H?QLh;Q+%b3*wvkBlTJ$lSH9YvW~U%FE;fD)FjBT0-Fvxy`;sk34;+X#tOh_I z-1_q7y-NdwzlJwK)P0tK`^-R%37!23l`*vWHdT(rZ&wbdHJ0WgJ&CS3&r`{4J+z?Ie&?Z)rjk)LRb9+>C&(ZVF*XLR2x!s|dts3Tz zDrR<7!)EX{Ht3xVdJS{yOy<@R27rCp*zi2su!-NW+10SkWs-ay;(i_EKJs#hNP0z} zHDRDNp?mCoQ=4!}l+bCKdP$31cdF(FRC8Tp?|1z<&8i$5J-69E8!Y+xfq zBBnu4-#)WptFb}QtKyCXkpY5xH;8yhfuvxkv}w$ZvmXCg%oFYK1@c5dN8&Y`G}el-+4`h z^%mMbDx6GE_&?MF1sjFH$)oh2nlrXsJmiB4shm$eg^Cr0G2>lmMnT!T_0JQ3+O{fg z^}OL9-lOW)O;DJqI`%hE;Qqdjbpn34j(rVe|EzT^2kh-Xtz!rOx7IO{2#Pw|+{H z`gy+Q-_WuUcvGRt~IP z{fX85rm^U^9UoNwX|Sna)OpLB93we zhc72Q!!O^z>ZAB_!KckoipqL-+RwB8wsJ=H%B<|^&M(2-=Jwu;Unu@?yuJ6zQdTC| z`v2$r|3f)nOF)nbB!3=UgR?>2P6PL!*$u^1$bB(|Df^a6pH}wAwXRyjX#h=Fcs~o}MGW0)7SH=>?FPu3# zW5cQIx3dfIrX8>Ratx5<+oOyG+7TqjRvBj|^X(9ngs>p9M$@Pi6#q+*xs9xFLIjFj zpXpLOV!GYTBm@Cs`H2o@WJZ*ZoOzVwNs;>W@Sws-X@8P(Pa$RQAX{)~t;f`idu_A` zBf($u+aR+j<*~0y31QhYX@c%_SgFoWlrs$OBm@PpPG85&JX(rf49gkY+|!*AD+%_z zn`CzczdIVG$I_!*3{qVZWqJx&Ii}y=k-F-PH}WWpMqVa8J~ci|&N$(YJs4r<6A)@v zTxWZ{j5f-Sh3+s9;hqgK&QZLVy2HFI?%M?7n?)_0@1#LBmkS`hzX&|nFjNzG&N{R@@Z8R!rGe)x zL(;%u^GQLWp<8}iXHE@tS)O(}SX@J~KBzNC3wg~WE+WtcD`b<=P#ZC2AfrM<-4$!u zIhDqg7FgHmh2%IEeMsM+8Sge7!W)BTZ2`uGV4pX{GN9EJok2>;ex#>CK0Qr-X}idM zRquE5G~{Iu?vp5_YSXd8*Qa#K8Jn4nGjV!sU~2N~a;y|ny*|UjSU@weooApuj{PcJ z$c`8-1NKDB0Y<(+2$QY>!%mtE#bxwSejj(Z&Bz(hMZ^cX_*qWJoNAx|kMKp_F1S!8 z&L5U)O%4{@Q0yMiDW?m0$zW@~t}{(W2SZB;zi%L9li}euxzoWoN(gxc3R!{OVvrDw zQI|Y&SlXF9pi`uP^I7h6NRJc34U2&S8j>oF4wlnUY#Y#ZLFnAimu0~Z+H`vyQ(rL@qd^@_A$&G&^hO^r29ixC9=6@xj3zQbi{85~0{BueD9#~8~5J*V~hwIlY%unWzcP$Ht>2a@y z)T$9{()-tl_j%SFEy^?F*doIzk-;CO3C4B4+=0))v`XCEk0e3$m* z%!h4V*#kN=A!L83l1*-A8VWEBxD@eVPb_PqJG$pbRFX5GTO>bHZeHW2w{mH{2P$^g zZyd5MIJv2ZlQ`Nihx=Y{5+SXYW1SEdk4FeFuZI>@N7ukb-gWPal9xwdm4fDJuIXh+ zD*llGy7T`c}Yu)ZeF+nhMBFNq4w zmlR3Ml~C+G+e&BBM1X&Cz}C0})}>Q<XUmMvKd|OIt{HR!Hzzbd|B0u|XtUvgaosKLs4~8I`QyoXsH7b4FdkxM3+=0Ni2& zaH~H7AAd_E-0&Pw_%D5YZB>MjU8s_Y#R`K29nhJ;m34!mtHEF=d81|eMBwsy1G>T% z5V#LP;NBAnbv-KCbvIDJgE|W&tcqaqNjmZycy`iM6ycsLHQlejK>|!4GG^Kd;UAkZvWwtSA;!AEY7K*rx)YE zm{VZUa#Q$9XC0kwVpt8`B-rTbK$rR1CzABP0?96eguJag6Ww!HLzA+r6*52PjF+p? zG)cVcaw<^UTUO0F!NE+ll64rI`b!_mHzBFkzPa>w^hm06K@Pp0qVsd;zt>yik?oX2 zqmd_jc8-*Jdb`fel8Hfnao0N-CWFK|Wwb1q7Mh5fgkYzEcf*)CMc*L`>R)7 zKCS7nx2rkB=15tOABmz@4r(g;-lTj(~hw@9^zhW${M0qb7zKm7I75 zcipylo5b*Ri=&hX*nHHNiu~c;T>Tx{*DGXS@WxSm`3v6UWe;Y66cBroTS~~D*-7(r zOayWk>x;4TB_OfT&MgRT$v&O5V8}1~He(6N$u{o(SBrw-|1kXoEw(Jj^J@6aSgnt! zGh5dGQuugStoEnC@%W?eIU_r=KU1SVefH+Uj#>(6 z4@Dmq2s~p{As{}MUkJsoh1mO8Ghy?%yO0%Czl`|v%wb4)c&+#JFnQauOi!!uN1eez zuBTr}@)vMHdIjb3I@D!rbRMgoO@x0r?2!hn1j*w06afCNr zbW{+T#;v<<6II8B?O)6uVokWO zR+qIE5S~mvbHvW)Q(t(>MY%CQdC!5N6H=#1I<7QpWz`VU=sKAs>6bz zUOwXtd6R^z$jd$Z6l!JMMTsd}Ej{n&+f?I}JjAH2HM8-vKCWDC;;wEjj8iZoN_z_C zRd^_~@RZ|GXj_ST(M!$U>2}{6Nn70u4HBI7T^SJ#K4WGDGZK2E*$=Ly8%=!JA5}x+ zb*z*2c6L>mZC7jQ)3A>?4f5jbz{1kv>>fr1wg^v)F`UJ~|Cv6$WyyJGh_mQCj75FU zQXM2MECre-?=~kkU6q*lSS_LCYcy{0=W?cbCk;Vf80ORxWq{?K9cB@Kvr+4#d(_& zrPVACoP=z_<3MWhtjJB z?}aBF=gm4^5?o@QJ=!$;LGLX8N%KDGi(>?njl3MwULly2^2hL`-|^CZS3)c^ z=QQrkF)g3<1gw2UT6+n(Y$LhsPUf=K(wFgNnwNvi2>wQN1Ogub0WbT7soz48G3uhl zsOB{{hNL35f^fR0&d44q%X=3_ho%m(U_Xbp)aaAc3(CM*2wZhR9%|3~0R~kbEW4jHYpB13$G{D($I>dZg5Bu%Q@W?RW9dTaf28=a%>X#8I!pI?~eBPB(!+& ILl8Xw25S(5yZ`_I literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl new file mode 100644 index 0000000..93ee8f0 --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.hlsl @@ -0,0 +1,673 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u); + +static const PathSegTag _721 = { 0u, 0u }; + +RWByteAddressBuffer _136 : register(u0, space0); +ByteAddressBuffer _710 : register(t1, space0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _136.Load(offset * 4 + 8); + return v; +} + +PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _367; +} + +PathCubic PathCubic_read(Alloc a, PathCubicRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23); + PathCubic s; + s.p0 = float2(asfloat(raw0), asfloat(raw1)); + s.p1 = float2(asfloat(raw2), asfloat(raw3)); + s.p2 = float2(asfloat(raw4), asfloat(raw5)); + s.p3 = float2(asfloat(raw6), asfloat(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(asfloat(raw10), asfloat(raw11)); + return s; +} + +PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) +{ + PathCubicRef _373 = { ref.offset + 4u }; + Alloc param = a; + PathCubicRef param_1 = _373; + return PathCubic_read(param, param_1); +} + +float2 eval_cubic(float2 p0, float2 p1, float2 p2, float2 p3, float t) +{ + float mt = 1.0f - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0f)) + (((p2 * (mt * 3.0f)) + (p3 * t)) * t)) * t); +} + +float approx_parabola_integral(float x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375f + (0.201511204242706298828125f + ((0.25f * x) * x)))); +} + +SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0f; + if (scale < 1000000000.0f) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + SubdivResult _695 = { val, a0, a2 }; + return _695; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _427 = { raw2 }; + s.tiles = _427; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +float approx_parabola_inv_integral(float x) +{ + return x * sqrt(0.61000001430511474609375f + (0.1520999968051910400390625f + ((0.25f * x) * x))); +} + +float2 eval_quad(float2 p0, float2 p1, float2 p2, float t) +{ + float mt = 1.0f - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t); +} + +MallocResult malloc(uint size) +{ + uint _142; + _136.InterlockedAdd(0, size, _142); + uint offset = _142; + uint _149; + _136.GetDimensions(_149); + _149 = (_149 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_149) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171; + _136.InterlockedMax(4, 1u, _171); + return r; + } + return r; +} + +TileRef Tile_index(TileRef ref, uint index) +{ + TileRef _385 = { ref.offset + (index * 8u) }; + return _385; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _136.Store(offset * 4 + 8, val); +} + +void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.origin.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.origin.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s._vector.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s._vector.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.y_edge); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) }; + PathSegRef ref = _718; + PathSegTag tag = _721; + if (element_ix < _710.Load(4)) + { + Alloc _731; + _731.offset = _710.Load(28); + Alloc param; + param.offset = _731.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1); + } + bool mem_ok = _136.Load(4) == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc _748; + _748.offset = _710.Load(28); + Alloc param_2; + param_2.offset = _748.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0f; + float2 qp0 = cubic.p0; + float _step = 1.0f / float(n_quads); + SubdivResult keep_params[16]; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5f * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125f; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + PathRef _904 = { _710.Load(16) + (path_ix * 12u) }; + Alloc _907; + _907.offset = _710.Load(16); + Alloc param_19; + param_19.offset = _907.offset; + PathRef param_20 = _904; + Path path = Path_read(param_19, param_20); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0f; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5f * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0f / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = lerp(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 param_39 = qp2_1; + float param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = min(p0.x, p1.x) - cubic.stroke.x; + float xmax = max(p0.x, p1.x) + cubic.stroke.x; + float ymin = min(p0.y, p1.y) - cubic.stroke.y; + float ymax = max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10f) + { + _1147 = 1000000000.0f; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f; + int x0 = int(floor(xmin * 0.0625f)); + int x1 = int(floor(xmax * 0.0625f) + 1.0f); + int y0 = int(floor(ymin * 0.0625f)); + int y1 = int(floor(ymax * 0.0625f) + 1.0f); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625f)); + int last_xray = int(floor(p1.x * 0.0625f)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374; + _136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = lerp(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625f)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef _1454 = { path.tiles.offset }; + TileRef param_46 = _1454; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477; + _136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477); + old = _1477; + } + tile_seg.origin = p0; + tile_seg._vector = p1 - p0; + float y_edge = 0.0f; + if (!is_stroke) + { + y_edge = lerp(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg._vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg._vector = p1 - p; + } + if (tile_seg._vector.x == 0.0f) + { + tile_seg._vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10f; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0f; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + TileSegRef _1559 = { tile_offset }; + Alloc param_50 = tile_alloc.alloc; + TileSegRef param_51 = _1559; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + +[numthreads(32, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl new file mode 100644 index 0000000..26aa33a --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.msl @@ -0,0 +1,717 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_136.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize); + return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize); + PathCubic s; + s.p0 = float2(as_type(raw0), as_type(raw1)); + s.p1 = float2(as_type(raw2), as_type(raw3)); + s.p2 = float2(as_type(raw4), as_type(raw5)); + s.p3 = float2(as_type(raw6), as_type(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(as_type(raw10), as_type(raw11)); + return s; +} + +static inline __attribute__((always_inline)) +PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u }; + return PathCubic_read(param, param_1, v_136, v_136BufferSize); +} + +static inline __attribute__((always_inline)) +float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0)) + (((p2 * (mt * 3.0)) + (p3 * t)) * t)) * t); +} + +static inline __attribute__((always_inline)) +float approx_parabola_integral(thread const float& x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375 + (0.201511204242706298828125 + ((0.25 * x) * x)))); +} + +static inline __attribute__((always_inline)) +SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0; + if (scale < 1000000000.0) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + return SubdivResult{ val, a0, a2 }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +float approx_parabola_inv_integral(thread const float& x) +{ + return x * sqrt(0.61000001430511474609375 + (0.1520999968051910400390625 + ((0.25 * x) * x))); +} + +static inline __attribute__((always_inline)) +float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0)) + (p2 * t)) * t); +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed); + uint offset = _142; + MallocResult r; + r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +TileRef Tile_index(thread const TileRef& ref, thread const uint& index) +{ + return TileRef{ ref.offset + (index * 8u) }; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_136.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.origin.x); + write_mem(param, param_1, param_2, v_136, v_136BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.origin.y); + write_mem(param_3, param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.vector.x); + write_mem(param_6, param_7, param_8, v_136, v_136BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.vector.y); + write_mem(param_9, param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.y_edge); + write_mem(param_12, param_13, param_14, v_136, v_136BufferSize); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17, v_136, v_136BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + constant uint& v_136BufferSize = spvBufferSizeConstants[0]; + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) }; + PathSegTag tag = PathSegTag{ 0u, 0u }; + if (element_ix < _710.conf.n_pathseg) + { + Alloc param; + param.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize); + } + bool mem_ok = v_136.mem_error == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc param_2; + param_2.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0; + float2 qp0 = cubic.p0; + float _step = 1.0 / float(n_quads); + spvUnsafeArray keep_params; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5 * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + Alloc param_19; + param_19.offset = _710.conf.tile_alloc.offset; + PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_19, param_20, v_136, v_136BufferSize); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5 * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0 / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = mix(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 param_39 = qp2_1; + float param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x; + float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x; + float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y; + float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10) + { + _1147 = 1000000000.0; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625; + int x0 = int(floor(xmin * 0.0625)); + int x1 = int(floor(xmax * 0.0625) + 1.0); + int y0 = int(floor(ymin * 0.0625)); + int y1 = int(floor(ymax * 0.0625) + 1.0); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625)); + int last_xray = int(floor(p1.x * 0.0625)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = fast::min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef param_46 = TileRef{ path.tiles.offset }; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed); + old = _1477; + } + tile_seg.origin = p0; + tile_seg.vector = p1 - p0; + float y_edge = 0.0; + if (!is_stroke) + { + y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (fast::min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg.vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg.vector = p1 - p; + } + if (tile_seg.vector.x == 0.0) + { + tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + Alloc param_50 = tile_alloc.alloc; + TileSegRef param_51 = TileSegRef{ tile_offset }; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv new file mode 100644 index 0000000000000000000000000000000000000000..5e6beda2d5384eb658b12761ac78a7207338d8af GIT binary patch literal 39788 zcmbWA2b^9-`L+-2CX~>7C!vNCAataK8ahakBCu?dO<36MhGY|h0urT46%`BJLRUcn z6+{sf6h*Ot2qFlGfC?xbf$zHB_uR?s`Sbt2-|u^MnESq;XP$Xx&YU^(p4}uJ^RKx; zRn1o|TrF5_KfG#vmaOJSsj3mx@K!y3pM&<Q2hk z?%sY4cgEfcl5s5uvtyLf+ zn%vhtWrmjsQ!B=3eH2@Kc(nob0o{Yg?lN;i@5BRpCPQ0ut?p-D{9)C`@VUNVSE`zy zX{_*YuhF!z^SJAF+NSfkcfqvHDy70yw#(hPqRdx*hUBSOli%GIjH$5 z=VdE+?RRkZl=e8){0t)4PTpF(bLW2X%8OFB>%#BR=#pS1Jiq__ylkaYUVqO zO?+VbJk5LPq^_PxQ&{)$^FHeDnLS8F%T=l)sO$J$t$ixfKX0$AZ-sf;wuU!U9W!h4 zrmZr0?3kgJdOF-qoR354FY|F2xIG^o)#2dwTrX7l9O+@B>pEd3wW+;!32nxloQkAZt;4E9d#<`Ffc6>!Mp zn6~4bgW_OGftR3*fluN2W+)FnYw+wJFC;d<=Q?IzHiUef$21K z510FCJ?rbULv+WW-ZOo=OT4w7^>-}jtr+HcHr$W+B5Hlz8?L1GqD-y%)eZjD2EVnz zZ*TA)H~76nct`aB+&MFkKT!KbGLJ{W)4NaPYP9*|)MZ{zH29Nn{C(WC+CNQQ`aj#? z&%yEM>bCt~Y}o(V;4i`P_mS9&|DT5awFZ9!j{oLVE&q2K_IDfneK`KDN1)^H;6j%9 z4{PuR;P`JzvsU~O4f~=EzIfZ;M`g=@>4tsT24BAIzjgdqZrE3C@YUP?+r)p(hJEb@ zU$^Z))(h--H)z<$H2B7~zZY2l%^LPChVYJRYj}H~9A0&UIRkWc_fKjc+UgiRL)vs! zQyP3$gU=qqJF3&+S+8e$yXGJFD{>{KB^X-1T~K!~VGjzohLycfEe8VZU?; z@2IYTx5wX6-3(5i-yh=FS>4&-KWOlK8~mpYeqVz>*xbn-L!@G7Pr1HbyoX>Cr%%j zk$LZi&t^p!Ye3A^bM2l2o-whzkK;zmb{5>{j<&PG^X~hO>RfGgyYm{h3*qhWIUUs{ z;7Q%a7*<^hZarV>PKFb%=(?8hyRxph3w|-}y($_V< zZ(u@q-=6)m1}1h7_73##xoe&GkFmGUC7snh;MP$Hn`bDwpYl0&FWN2x{gZoJ=lJ23 z+)sa3-7^#Cb4;DffIf|r^Z8}~-uUx3STZQ9^lH2BsHzU>gscW3ws z9H@=G>kwOKHNL_3YVds<{D1~OBAh+Q_(z7f#?@IJ-QdSG_=F*RSTz}bd{57`);FA~ z?YTVx&F?g>nc3jp8N6ItHi4gR|!yrX&;o;CB>5F7VdgFn^a&ouaR z4gPX?Yt3|2ufk{6bMsnkt!vd$y``;wj=w#`)>*x$t$0><@UR-}o<61hPPn5Q1<&`$ zwZSv%m>Yt51J2m(&4Zb_PJV)%`;@y6| zZS{Q^{G{TX(cR*)^Bw>`sr4PNqnak>JFWM_EO2lCtQma+(>xZTbX1>0n^@zEz!PeG zF*xJC44gC8)nZQ6w!Id7QtMiDR5yuh{Q-Rl>OB1h%njH(i5}&Tz$bLisQVvYJq2%{ zBj!CT&GiNJ@+@K7f1%Br^I_GS@u?p^`n-cSdwMq`w!dN3d>lUdx@Qb_)uMSXhBi2L z8rF_#DX`AAT}9pV3xl%lNcijtJjN$YAK>z!HFL?? zT|aX%o|?I+d7h1C{%T7U8q?R(mMJvnL_g1SY3uo1EjinV{i%(l#LqLi+FFI?d0cID zp*eoF4GYa&)!1I<#`D~*wt1m>-d5YX&^%+SZC7ZHQ*B(Kd8Ss|wa`3EtL;%}o~70H zDKzIu?SMk_EUk8Mp?QW@JFL(=H>(|4XwI+NF@@$iS#46Gc|KMJpuO zAJ?N=M+oT5^4E#|8I#*!R+W6U2<&pWe2=7cPsBYhonD&q%oI z!J@tlEnExgUVnA@YDDnb>obb~+J`)C^<9lNUK4fOjDmYT)bul^^C{M66zki$)Hk}Z zU6J>&YRuJO&D9cr zCRjbO)86~y6!ejl`6wII{8X@?RhgGOV~l+!nsa1PcPwX7J7#U$*8J>R(|%5kZDV}v z^C;<0em=#I{ufd^SKdF#=QH5=o9`DWUfcD^&->^WN__7En@{?BzQJFCyD!B47T8$f z^U;ydl<n6WZt-ed_%cprHEu2+ z4Zrub>vo8JJ@~7K?KhXN-||0gF5e3N`sh{W#@`yg(%mc1wO`copO??Y@X^Qj@8^Ad zC)lw%ALjo9xaq08*5sZ=)U^Mw#EfUG8&C&AIg2-VZ6))-T|jquc)1wavYBLso?KZ_vDVEc!eO=l`uS<@zkj*PDUaGwRf#|zir_juv%6~4y{cTey=UbyXjj~8xx-{Xb5m-`+s{3y8Z@xrZrmlv+x zcX{Eq_kCWt_LU2+?mNBG?t8s(+xu>>GZE)Z5 zrM>>X<14xE_`;3vJHBw&xbOJFZSOn2k{{jR#}wRnzT=C(_ki#C!mWM37w+};9bdS$ z@Apc6YQf#Z=M>!a;QPJ!dw%i#UbuGO?}c0YZm;B572JHTF1Yuk@Al&FeE4oJ-0}K; zFWh_fwt{Q-{a)rqC_`d54H=ghMO8!`b zKhfY%HTW|HH@@%t63_MH`@V4PzV9pfYYqNZgZtjE^nb6xeeV~$;~7~i1# zmE3oLCHFmG$$bwPZhYSbmfUxNCEuyw_UF66*yX+pEV=IjOYXbCl6N<_?*dD^?*mKj zJHe8Fyup1xSlWF@SaRPJhP!@!S6K3^8{Bt>rQP?2;f~jLhvCNa{b9*{e;984RKY#( z`rfd#`_8cBzB3FrzV8c5?)$=$`>wF$zAp?nU*8#q8*gNT`@S&tUGVo^Vaa_@7;f!* z!jk)*Fx>sf_k`irz9$T~f8P^^o1gCpOYS?uaQ%HpSn|^v+;@Ye-S>kf_Z?xl{rR3S z-0}LJu;jiY47c_jVaa_*SaRPHhC6=W6NX#+o-o|+h`uKbHy_^-hI>EyjxgMKz9$U# zd#3LR!;R;ALitXdT09f~h~hcfbK>@#Suj_3*ZK|>?=->kooICqwx5ALoBBP_GpTjj z{T!}6`~k4-Pp2>A{t~R_+4Zzqdk}0pZ3oc*7Wn-NtZx6FpX6fCPM&KM|F>Y{??iNS z`yDvB`MofHzem@0GkyAf>KcjipbG{slTt8zt=D&a)t2WPo@@?qXv;31_e>d6o+AojbofzXBddZF_Bw z(=)WX@x6!s16I2WY)s$9yb4#3&ug_$=2M^7(bPXh9Qhky+o@krYj1+pm!TxqTVVUk zyxR6{xcZ#B-8*30s;Aw5!M5`~K=OPSuAUs;1KU>Jv3mYjGlu&IQcKJIJ_cz%H1+t* z{{cQ7XzFJW$2Of{+o`ABFtGjkUCI3P8;+(Pp9Ma^XF)Xe#9jz&JM}qre+z@{Q=8-R zdyHE8Sp=+>7>mKx{EoW_cc8f}0d{@*&RM(PcNbxuXj`(jdFIt`D*3x-Ed|zQ4EfTv z-+Hypv6_$HqqLcie#?N>(r!7heXmp7{9c@P+QfeMx~RTZuFDnS&2_R8n!5geH_3Ol1gL+MhAMI<^cI))L zHrN>9>wwKAd+xemebn7&<)bLZwl8tot_L<&o)PPV)tu{$Z3DQvZM=RPf*nt;{}`}7 z>W+knidHlBo>tF6ETcN4z@Aq7J z{I>z?U*6jvLQ~h@@4@nncRR3S(&l$!`8K>=bT8Q+d_1*dFfaSEy?*-pom`*9+7WE5 z@SVWrT<#1XM^Sg5alUr}+fG}?up78Jr@N!6+wZQ_^7!uoZqDhRXzKcpr#v{wJ5cN6oH^$Qs#CJ> z9|TtOooRdyhI`M39|HDT%wbNA@e!~->WOnGSbZENd;DQw<7-PUYKeOk*nYGfNiC19 zOTD%oO)ZbD8*DzdJBC^=oM~IlPUVyUapUJpI66$_0O}?wRb#R-9F@f;N>aKqkJk@O+RC5Q_DE}!5OFX zBzIq1liIz=I{6QP%|Cn5G_accfU&iymHXxz@Z>ffZ2#GpW`NbkQL--&g4LblJdb9A zeO~xH(w^thEU>mbS5B<`d>(1b^XLSyHs{gv=f}XF<+6WyZ9a~s?l?}OmT!Zh^IewLIS?J_*j5&^Eco_Wvnt+7st=urE z*tY6vdkHvwyi&CN0=l;NeX;QK`X-kzp=(PYUk2M&J=fw=uzhQbb{SaRpMj;1E8ynh z8Z58%m1x?Wqsyt~;;X2QV=jJg`wH0aDB7+D%eP@B{TbRqs~5cJJ_?O`gg&uBloQD zf!*`0J?q^;eJ90__8-)C>#T{pz{UvwA$SBOXT2YR^-=e%C;vXh*!Cq(+aH6Cm3#Oe zu$pTyW4;%zo;msncx6iV)SrU&QFnZIQ>*0~{0!W@20uqr*Z+QMdHjC?)<1it*Z7xc z>iR!GEsy`N!1|YGy;E9NJpR7{>tCMrev78A|3lRBjQ988yKCF;sO34Q`~mDa z#W9$dec4_=W8FusPhvd+Hdgo_!R1^&3Li&N&spzJVB2ZS7#;^V=k(8L>h}8>wLJcR z0XOILNi=o+pP-h<|F7WYoIZ`FuK!ci^7uasZqDi7(A4#RhFUIuj@mJ04*w2z4$av- zjIFmC!>RvGnM_gF$1~S|YTfoPgXQ{rzIq+3mbLC4^afmA|JSJH zo(r_S37$@^uKiVNwZwY|Y`okHw)rnyUH`YK<%#z$*m&yN-=bDCciX-XRtraN^_jIf zA6y^x?1}S({hq7MnD1d%GsY>pf^S0`sQVZOS4$tm!QaB3{bd2TKI+D6{mcdI{hamQ_9^ejP0-Yn|5&iG7vV~I%{K-6-NSv# zc>3#O4jWQyPunfPYTEqSp*;8YmSFE~b?uu`tEKJMV72gV!0soRw-3SfQIF5IVDIPL zo7=(lQ@0O)Ua6+P`R@RBAJ*p2F6FU}13z5b{5hsPww=N5*S6aUEKd%*g6&7!F4S_r z8{6M*;EB}sWlY=Zqp#~od%2EWPq|j(!8g$7`L-v2uf-l{>hakVY@E_(FSvT%bM6iH z`mJ5J*X|kI{eB;?Yu(=!D`V`7t}S!FAJ{nR=Ds_%TJksm+|2dEXzKAf5Zui5AT;&l zdN9~rM^kLCJ-HqN{-9hxg05{&y(b+CHjcWv?oX|jJPrq&YrfYwuScM%$LFJ9b1i+2 zgsUglqrjP0+iOp*M}w1V8KVnbTjuo`uyNGQ^)PC+Ur%28%Nz-yQ$TZ$MN9i_3cAbkIz(abDi{~sb@|Gz_wGj zpW~?265|ALbG*~h)Z;S)+?=;TH1+f|6Kp$m`+|B zb`r(s%qi4iKkY^ zGj7^{5^SIP&Z3s*`kxNY^)L5;Gtjjq#+hK-WA2r>NDE>u12`s_i0bdDg~f!Ob=JIW+b7e7^Q6 z^S=a5J?rNSU^T~(e!mE|zvTHPuzu=k_hqpCX^Y>bVD*gWVz64`UJl-!68|f}YWnAT zTnYAic)rr!&-(Lxdlgulb8#89JaNAQ-k*}VUj?g`alZynTh^ULwcNg(%Ws37L)XczJVU+%zNUCa>vJobdVFrHeezCEpWD&YJ!8qg3pPKW zMfPC~eR9pTCx`EWorA>xKG+y{7i_@MV;=e+I0NdVHP*yN1I526oJ8^Bh!M0KN%UZbdaE)l=W9U)fpIq%Oa-U6G4?+5aZ!12C~ zrp?^nq?VieQP}^H~^8{f_#& zi~!qC-RrU_wI8pGwnZpvUJG&7?c!j^Z*Klhr#!YLz~-AdlE=0rSpSSk?in(DSjTs1 zu=j?({+_5@-({%H$J%#Z%TX^+@uPi(+HRd#D}s#?z7p8`@RoWWR)*`N?%3taQjBe1 z;a$tAW-0^G(@zSBI;cyT8*aPy10|=gyxwxIe7{SM&bz+WDMc6W-jP)(b8oH-SMz$WNA1TktBxwJEmOZl12qjls@UzUOQL)?eN0n!K#@UT-YeyuCj9o3rEE zjM|Um(q~hOn&T4ZT5Jw>z8r(|z6H4XtlJVzJ!@qvu_iC;O1OzkER}<9l*_NvLl*$=5QR?cIwx&kGgJl2CKK$8?|f3->X*7 zyzC10c^AGL+3;pZ~p2M8xxv?|aUTEs}xd*jeyf?M&{rRtZD}_LT*mn%yo~cH zxN&^m>8ss1r%`LmxX%Fl9VYxt?eNU~S#W*SbKW}}oO#fmc{m5GEjfM~Y~C5$x!{aV zU+v~}I<>asa6Y)q=>oVp&0*j1x?Tv^M?E=R1U9y3aP7(IGhl7W>0+>RkbC2^U^VZJ z<@1nhU3@GroxhUdHM7vaWr47Sx@pUi>wwEYs;{=&Zu zE@NK`FJoT@Pi))juTS#Uo>*6a&0E{$)Nskw^9(JYbKgMM z=I@#Ly!<9uE&YBATt2^VhMPy$$Sq)f)brfR`h4Oz9-qNKfae)}Cs?k{=i3j#wzb_| z)bc!&e+2fKY#VJpbAC+i$7`qkZi<@MPMo;+fQ_5y@4etWf3?N$Ct$yKK3hL;e+t(( z&s*&olQ!2~`m;8U`}zG~pBv7hKNtQP{630vXnVOn@&7s4XH)nCaNE1j8SfWxebf{4 zmtbSMHyBf{pZGy)e@DxBJ`a8cFTXeZ8m{IX{)XC*bEy6hMa?-B+uu>xe+zCt3x9{E z9-rTX9Y@Yw?j?UfQ#Tj+!(i`?w0Q(<+^i9O{)ncYeg09f?bH+JPvGO|E6>=+;QFcO z8T&ZceaAU8S98%XbEG|a{2AQL;|VnN`1}Rz_{vP&1)e}oaeyC3I99TYkokzUY-Z*qwZM$My-}{yZ~;F^+h!G z`1}Lh9P2;P)bq~aUtrs*oBzwy$zR(`l-DTcFE(GV#VcTQ3ja6Qe6rsE1J_60e7uKV z1()kxuAldy{o03pCFbj3`$+pYz_!mlDAy!X7i(|$EO3_ zy!M@F>bc*Bfo-RrGt6+X>zb<4p0nrzU~MOHUu2vM!ZR<9(>183uXB^!t>szc)?SZO zXtNNu%L~3R{ABod>Ny$#*GD~V76G4Dv{@8xj6ClbgX^Q7_nC`>jit@GUXt36bFFO& zikfpR&fbxq!Nz9oSh9Coo0t39QrNsda}Ab;J3o0|%JtEos#=Es+|lxHzb{=DuD`$M znf0rtU)n4O_Sxk*!oNFedAOQ=`saC91m~TrZG0B5gr@G8{PVtYv47Ur_WB)+->P8y z*XE!5mHYeBT*+!R@DbFux1ILnv^vd zb=H8Z<(a2V&GuK;?XRxeJMU{^yNEuFCtnNh*?kWA$k&Fe>F3zAsipsQ!2bLpd|kL_ zvE(xvuBM;uwW-B_J#ZKKX!HA!++3HhbG44|`q-9&C-w$#wTxApn(ddV+b>tN-w>O> z1Y1h)bw``9}9M`(dK?4H@6XJUPo*B zf;C^5I_rN^e4D?swi%jw^4T2h@4h9UE#PYU*{3$O_!^rg+vi=Zx!Iuzlyf z)&*ewa?aN7bu<@kX?HPLE&Q`!TWNSZn^- zp?rt>5?K8v;`mJdGR64nj?J-NN^yR&ZshvKepz8RXX9K>QO`U4D{3ucx(e4axOS&ezm-$CR9}1*`jfyq?;R z&qwv^C~B@Fv2ncaH&D)|80RHc6t`1#BSk&e+1fr_pEps=%e5;uo^fOUW?>g+Z9Aua zFZ8oK#kK7?{rkDH&iy+(jkhwj@o%9{d)LKCil0>|X}@ZXUGu9Icy+L`Mp5U!y%nrq z?xEXjZte?%Z=)E`Yo)JTn{)DAit)ACPM+_)-vc`~+i1(0`94^i*W*rVKVA>@J1A=A zD9+Dc)zaq=!S)&cBe3K0{N}a0o8q;*i=wZ&-A%1ux#yYx+7v%)P%?LG*4Q~&i()S8 zP-nh=3@+#E-Uff5=9#~H;Kno$+sc#sPr&w}%{cPR-%r8LpKY{d{_X>7bN+rt?Z^33 zzn`LJ&SKZg`88KhzrO(6Z|3Zm;LMq}_&o^LW`2%K9^0?LYW5+{nsi>*qxe~u;=DRv zqbcUKK6PUJrq=K0zO&tLDQf3Yj4Myvjlr8y{A@zW^%`5_4XHP!*vID7SqyFuK%+Q{#?QJf3e{D|FgkgD!BgtDY*WxHTW9^*Z-Y@>;G

;@S??nXuD)o4FJ5rt`|o3=y|w>7R>_wyxc>h8Sf$;6AFJg4 z`&i-nuUT;QwHw@jAFK4=py29b8hqn|8{dB)EA6fQ_p!qLy9fREvE)80tRJSh?oOp- zT|ZLmdB^xixSD&C_mXQ;E&hK3tK}^47+B4FSD%Hc|4ebM+uk)R7rS2dci-?H`U}N< zO&@LgxaQT<-&5eSzrVuOyzkQAGvM^+nwJ~P^{#*V`y1H)^wFlzlho?z@9*HUzZc+Y z&(}VV?;n)(=RG4gmiLSP>F-})`_o68KF?9Br@xoMWq<#MtG!bD*x##^^yfV%H`Z&^ z`lr9w!S<(*HhsKj)zjab;IhBB;c9QyKKA!tO8WDjl^e_ZRsZz&9@zf$(Wa00yn6cM z*gmAc`Qd8*-MsqPUne;IdC$v@<^8UI`Wp_mKYg_6I+y zO&|9x_4Ky_xa@BwxSD@%JN>N!PJixMa$~t)>7V{q1KXcI+VpYHQ%`>*!DWAIz}5V_ zk<;H=;PmI7CpVV+o&M=>9kBiBqfH<8O!f3P8eI0bK3vVedpZ4W2u^?QnQ~*fU+SO! zHUit9KHBtg&s9%?gy?d@)?0&0%`r8t0fBI1ex~kIu3uvA26it<%-!K?v5yCP@5a6dTrGX>3AT-T z#<>^RKJTjgl4*=UnJ$-%{Y@d5moLjkmiFqK{ zdnV7=gTVTKq0UjRPy7!7yXNEn5wQLTQ;aXyC;o?lU9-vgaIjkJM}S>pv40e-mN6X( zwvBqmbQIV=52e_rTt8!ac0C%b-E(aVSM7H%TVdOl;%940&b8arcuVRJQ9RdfN1bzQ zH?bC|`Et}f1y`R^^UTQvxN%(18RtYa^_){Cfz@0K`uH4~3^y0YWFK;|@%7JrdMw!c zR3B~nn4^07>jkUjynj4c?Knz&`oQ`mr+%>7REqQOd<}qYtIhe8m){e{;p1m}O6GHi z8atmmQk>77sB_MpMU2d+-z8?(Jo7glZo6{+W}vC(+%^bS^SST4>uk-18ISK6d9evEjajIpUJ7XJ9@v|$%aqdFBTa6vd?i9zl2X)4AGBNZi z$8idpdal8#VB7kRX*l&K;A$&UGS<_;w$qlM|Ig7*F;{VN-G>+lQ2gvm$u-%p#(Pok zPchdIQ)i8zPK;a=>$3~)nm@PT+RtzB3k$CQ#Rb>@a}9n;!S(-A!S%ni;Fp81sCnl0 z47lSc=k`oA^{nT!z-rlV9iv+2{v5Dc=K0fLweyLaHFO@_xpNIU2Xe9F*FX2<1z_(< zeYEN09IB_ki@;@npMk5LSM>K;c=~e=<;HUU^iO}E2iu=M+Vt@nsHeX#fXn{A2v;k= z3w#-#{=5cqV>$o&r@za<_NR|FeY}S1>F-LgW6qv)6EH>cc7Cs~@G# zy?Qh8yhpO1e!t-AcQ*JB3hupjZ^8BdX@lQaaQz=FxcrlpT>lpu{KcAQueb&7oR)jVx6#zIe!c@%%UIT)o%IdHn^tw&eOtuv&8c6*#%NhUMnx`n4bN zL)5lQuD=0~q#ynD)1F*^3odj09bD}d{FCeN;mNg}*FT_ZORf)t)spKU!O7M8M{a)J zJN6^?zOh|$eGF`_`s=4XxjqgqbNw@1?X6a>?cddU0&cFZdE0sY|AMBTJf8%sCC|Tt zlc)Es+}ylp?L+MSYP;n5EO;cb_18~(^86dP%=0<8+I+Q-_s8?#^y$4Xw;%6&{hfRF zfq#G<&)(GKz5Y+QHru#2$dlW@z$0m6T;pjke|E7Ewuux!M^oId{m$A|;A0BhU1Rry z2^9D4N!0GuYA+GrTr=OV!1c@T)chO%GDY2d^;gs1T>b;L@3emvu21-DHP7$eybjk# zJ%8`h8(`yTOWZfX#}skjf*Ut+-T|wpf^DZQaoz=Y7jfQ$JHE7eAFhx39G<0) zi7Bd#r7h!_53OuBKU_ccwCeyn7HvLf<$2B;_jqIT>7nGhO)l`10v}u8-Wq$oj-z-D z`>1mbhtp4a4Htmxmut8n+-s=rHPl}%bF>iHJTlgW;rfJ+sCnjo5x73;@maLtvlv`I z_009+VEfXRzLx+y?!;LVUiQ6I&C~bNaDCLHEelrn?-);C%YlupEqyHyc3#uh3UK>M zoE5=pWt^4Zw$qk4D}%iziL(ma`A*-f!u3(l+E@*2ENyAGI=E~%60VTR|hD5rczvsuC0EGYik;H`dXKE>fsyIJae`oe0_>~ ze8x0m`(QM6@z zJAyNA?df+M*xa-`K6%DBNWLGV_?bz`9y6=JCl+{ijlK3KQJjO1Q@gMDOxcCl?knNr z8+`AA8)N?l?{4sc20x|2&uZ|q8~oe`Kd-?rZ177O{L%)$zQJ#kYn`@P`$sN3H8Rg3>VU}J{w3)Uy?_XF#rZhNnX z+D_!>^*8`*|L4~^c;9^(O+7vbf*n)oa}b((o*xH;ZKs}ghk)&`JX3xIuAa4eDA=~@ zX?GadoXU2Gqp8Q|2(USoJ|9I>PfkaIZKs}gM}f_$9Lv#g_2kqAwyk<%9RoI3ZO)_X zSS@iUfV~c7+=+1Y_)G#f`|d$gPv4Wlwo^~LDPUulV?7qG9-m%tbFPm=Q%~&U!M0PM zQ}3~TVEfeO8uI#21v|d{`GbD2e(H|hdsZ!dPXoK=%CVmSSC7whaC7W4(A1OPAlP>5 zX*UyW>~ie0;Og-?5!@X6Y&7-6J_&3)^*Qy}KL)l>Z5jK=!R6RbhU=&9*axW9;(sbw z|J;9{0Gmtp=+nUN(K}PLrQIB`vBEzIc3!jIJ_XiCJ!3c>Y&>nbHfMm%L0kGf6KtQI z6z%5iKB}Jn&H>w>wzH|_`8x?e4fffuuKg@(wX{87du=<9TAsESXrryVZO^4v6JJPu z5hZax12(VxnUT+e)h?#If&b^J{rLM!+CE27^BE*goG*awC;W?GbIx3T39OHL=IG1d z%#n8UyM$UjaV`U!U)o#_&hu8AZ7!u&Pn#>i&THCS3AT;zP_)JGDzI}HzpD#BZTa&O zUjZ9KyL0k2YCq12wy#pIqc|tx#Q!?j@q}LkHrK4BYr*=c=h~?Gd%*T#JmaSQ4Pc-D z;WxsKmvwj(SRZxcT~DnR|8IcH_TPk?U-J1DSReJYxdm(-$@z^0K-DV9!8BAyK>Ga?F>*bMavB$foh>thn8_aqXfI zaU>5<;fG{vlXYSuK2duTatVnWOcv_wz1tP?Im+q*_2Vjba3VSfbed_Kq}t334_C%& z6ljK-B4UK@qS+%%5!Qv{?M%UrW~V{W4Db%qS4R~UN2KyJHZTlrCm<*k=t8`-Hbia( z+tK?(Ir#aq{FqMckXl`BTi%{3(wW>~+(!?QS`iis*=R}3OVq}b*d{)M6B9L_+MiuK-8TLpBBAz{86;~ zPY!oyrp6XOVCEm1SqnQtx^yv8&q#zz0R{3W=izx7F%F^;?+;2Qic!{AGNBm~l z`=uIUzT2EUNq zW|CX1=avxMH9FxsYoXLySVwW03f<_Re^YIvjJuiQuJ1;lh`1X^Yf?ghFk15oFzLCV z&Kk22;EW+yG|_gsI;r>c1n628LVjt0>(}){U6BRQa<_0OR4Acmj29T=k0Kyu#xBiN z8<8be85!H;Fd96JfIst*{SwXO;6mCGZj6QM_!NSf*kdlE?kku@Sc(utOYJyp(?o#9wX!-2~!r z7bv+Rsk%g#W5N_X$F?K}ITyI+P|B7P6>9%%Lr{0oeikg8@GsyrkR|oKj+& zlhP+*JF=;R1k{6O5#c_@#raX=mUYxu^@J67+dz} z=5oIk@ge?p!I%Knja=TYJU~Fbn({ebajvecUNW*fOfEvE5Z^mFI)=&(x2d+cG-o+> zYr_(w26`YR=j-SlC_@s4zFR$>J-VBQfI&M`_uiB5>h91t-;PRBTyFB)%&n@=!qW> z!VSqKhZ-1XSu(F%HHE;ch3zkgFiy$yHY^fN9qJtZ_{q@2!A?-!#n6~iVyT{IkjEfl z$;gxpBs?F9C`E#4#78(t35WE;r)J?WVfTWv8`0an*7GHYmK;iPU3ft9c^%Ohr}-*- zfnUV_AD@K#5(jkymwGGC_3gbe%cd$iw{%iTepYT?@vODEc?I)WCoq?zI~Mo#FFMkn z(br!`yqBQ)DsDe_f%D*#9}~wVG7u5I!(Ri8x=VL%JeL3Q|Jz3 zvgAO2f1S||f|wYZ17awYN4vETxGfjMZ)oEq%iF5sXzr#`Zb|(*>ou8kGZ0b>5h+TLZ)6(Pc27cx`xFYDejhh zZbW`%G5Dn`i*%Jm6nCRPcjE{Lz`jLSF5^|M!z+vHD>v0EgT5UWerp!~U=%(jk?f9R zZiC0%x-7r_S#F(Dw|1p_XBK)-7a!FTA60MpZE1_Fi;vVIF?DT|fz#Kjm>wj?(1x{3 zxn)JjX8+2KbfwFkc;Sc{9VCQbnb99b!e>U|fX?nah20BMt?Yn3`(=!y)+~3Xz~ses z1OzPu&pa8b{KD;TM|M5V`Uw}XZ|hB~o`19T-5WBhn^V%aB)ItU63!BHMAEjeayHe+ zCL+qve8c^+KYXz=M1-%u%BO8#KN*_*Mj6Ul(M|1jjg9!?+qVv4Y;=i%Z)W`|>ZcUD zca%n#u#A(MPi|d&b;J)*$``!PZZWRi`Sabj6<~`Hq@Nxrv3m4<51ur9sNqj9hFy4v z1`eM^-I)LvL;7$$KJ_z5ws-^6!?~yb9TJ4d;T1LZL$!IFHke?U(V3rhz&Rp8KmSM0(ezKmIVyZ`Z7PfIlC!CLmR?EA%d?ni{72*=_JD_I0S{>h zJR}?NkYFfuCk}WB#_*7_*aMG$NFE{p4f`K?2z||-W=U7GX`>6|u!12TdJ3!uCRb#F zcmKb9!;dHXNRDItoDvDzP5{1Pwg>6|9vf~6_bERw>gpC5HMt7sY}}Ov^bEmpW{nbz zHvz#=uT@bSB+l8HyzG4cDpolAGc@izM5$;upZl$M`~7%1##th|Kn3MX&_X<1RNlNn z!Vf|w?W%Bjun2d5=VJK9u$G9+AraBK;3hIEao)-`MFn|8{N(t+Kq4(pE|;)^>xpVp za~4k>OsEA2UI)7r;9W+ngN{Ea`CJ1FIsrMsR_f4PMMcf64{rI^yx&z*I8&4+Z7Ufg zUxyq-vpR+sKW*)O+BtZNo#-Aib$?V7c|PRQsfhC-%@g3c8tjwl9~69hc6Q2#3;t<>r6o=Kqog{L|*AD9bmA*UC<&CFi9jz+7>G40cLa?qyB)hJ0mhILuQKQ_)Z;hfVza z_rK!R_B6q3{Q@^-zW32jHJ~YwH);c^sGGBn*StCIaS8<9$sd?!hD^VyhF}WYR_-;rezXBoko{#i9zm^D-m< zXW2xGQrwgi+60BC}=VX*&L8mDK0{0G3Xcm&fQ$e2uQO zGAzR+av0X5fq0yk$33c-_CE#V;e>WIq!DPP5;_ZsYea#XR}I(u^3VWi3iOD=8=)ac ztIpE1bO<5TDEYL2j>LhHZ&4Q60p)04GUB}4jK`iyTLv^yybfw#mPmCRk*1SZ+GE*) z_)?W;xf3?|j$IpHvwxVI&1p+{9WUpAk6)-0Xf~b(HSl?7bb3Djj6q*zv)qPpB{CDh zzX%qpfM9PC2#X+Ngwavrj2|BG&()j0Svpi*ia{iL?08P$Gl znf((GaF*HrCShhzoZf+%ZEmzt3$thf&yO`Zw1}G8!9>>`7m}lspP6x`eG;1tJ}81d zt7DHrCOU@<*qTxSq%lYttblnHr$vYpJy*{MLUP3!Mo2d>f>u5d&|ajD)N}Bz5F$zO zrNY6WGEMIB$}pM7t(Lw|4cj*3>Ey`tMWR6bb?Zca0R5Xmogb#Sip)Zt$+eTr$sQ23U`4p){kS~JO8A2C}P^|G{Q-zt^GS0JnrXihxHb{7ERY# z^{3a3^OBD%W%M6)RMd&A`m^W8`OO(uW~qOc(Fe&x6tEtCmULgnF-o_c5TrrQ7J)`P zJXa!4>VpKM#0pfeYr_@6EjEntDm30=gdPwL%p2mn%l z-|u+5W#2N2^t4EQ(Gvt_cY=K7*H+A0ZQ1a2(sLB^@gKR?VD8y*wnM^L z3(h8Qt=8Z}3|^YvbA@SZu32TO8rpS*GT0+*un~3VvJ=t}L0oQ_6bj(gUXUe(6q%^$ zy5yx~Z%~`#CAmqdwYDu<<#pcvL znWByf`_|af){2TGQwz*k4>r`TN7TX{uCi6bjml6(s5ahPSiLxif|MrjaR9(*E0|0c zpDWe%$IMy5Gf;`ks?>Oz>sx>jmENq}Spo+{cVvbhLM-J>xi-p z6Z|ZT^%D&TE9qS}s~}Iu9?06}A={QPYnxv=OtShB{HZ*}+U8FuKe{Wrz}`1MTjcp( zI}z(=-d>j4;rJ4F;SC3eh|D=fH2QsTq)a_hZyx_Fc;L;!oK=MJdXJ3$BuJUJ(F}|+ zMZhQvcouvtZ7`<@7|k+z*B#ii{rV7X_b5korsI0~{MubFm5}Fi1dhGHz9+j5**PZJ zfA;HBZ|f*gq)q7#9%^kz#q$O0u@mG=R)rm?BIECwtQb-JA5}>`SI?4P1R)za8*MKV zo)~QPyE!>#XOA~cW*3{%Oc@%a5tiUT0r4FetbqLmC5|ODNqkqt}&{U*X_>;Vm zNl+{6!X-JU^n#N|F5)y&y%P@uAqHQMSgD>2Kfnsw!V8{Hp5n0qWVD*2G0>v8`$nxC z0l`_M|1|uygRusRC_*BmmvH>axF19*5ef#tJRDFc%;ZV)PrmjBN@VE;bb9Oy?lR5+iSQT9CK zE#Ry!!s_fVPis2aTfIkWz8IO+mA2m6i2lC}-|A#p)&Vu6#KM8>eQys2E;LEA1!Ib z>n1w2Qu=H~a~e;rtdjT5m11B4y--6AsiF&$ z10y%KNPDGZ3#46$e&?w1CTERUerEE1WD(14K34K&!*zu%Qjs@Ei#&wnC1OIr^z{oH zaF%*iyP0A3uHn0-WaF4&rMJtkhKPf}u-pbTJx6Ag31idDXf(>k2F>xD!vC!Lpo6aY zKo!!b~^{I|C~pnz{8!^4B|A8SV7U&0-Ku z$}njDw2eL$NpMhh#>rSda74R;o#VI?rS?(GJVc1pr3x@FCP6#l(F5kG2np75r{Ilv zzYKdM7k~pr*k~2L(Pz{&O7BnZy|1k~Q>T&?j=V#I;OL<-6-x?^6}@*SaD1zGrv-qP z!TZxD=qWt!@b0=mAS(ASX2g^4@9tql_;4;GKEAR$Er_umE?`74+`w9LcR(ZCZ=@o# z|5EiKOJ3<6`Qb=_X#GB^pp$6)V1Lt2s#2mVVySKE`aDt{D$;lpsbGE&&Kwz=-`gOl zL|w88>H!UO#7Kb`yTQ36k~zKk9WA`ENNulXj(PcmHv5~Wj2V_6EwcH=Ff?KRnJM+Y z$!g_JSdJ`y*XOS}rp(P^6H9HpoLrzX1*8xSFIG>DKk*J?SLO)=Z- zju{8$Mb(U+jnKK@)+M#Pn9Xn=`LwJ9x2MgpfxAMQV9C(z?`tQ6pgQ1sT3^0xcfe`L zIk>xS21krFRM+mF%ZTQ>nT%MsXLlwe4%ekH;^Do!S1}^JHx-C#FRII-$t1gKJO^i@ zt4T$0MuC2x$rVe(L=r=u0R@{&D_ZFCzDdT>8D;f%FbGsr}RA#Q^Tn7h+LJm3(3cd+(*AZA!?xA4;^RrjeM?MW5B|11)XffKxB{6TSSbeB}~Ofx)TVR9M{o?x)!Z zU$YK2aJY>doO+zbGTv92TTaeZE)#5hykM)-ya#Ttw}14AW>MnW1!K8taly;^DV$@z z<>~nq&a>v<%>;)jqch-py>fT<=xKV`d~fj*=Uy=&GnT!sZ({fE(`FX6%4Q-SoDbFp zBD^?r$0=5igTnSB#p-v*IM1IaUb!FCI{{D~3rR(mp4>3c#4(E5nO4_pPp8l3ob_!= zU;UADMvdhZZ}MJx>W76;`moFBsylTh{V(sTZ0XZqe?+zGnd3*~_nRla!_Q68P3+r! z_Pp!puJYBD^wn69f1uS_!&1Zabl|M6+3Ku+)17-6oQuBw_tt#AFY~hi-MB7?UXzSV z^uV*b;Mv`vXud3WH?+DL3uRoeN;Wh5s7Jq?bi0GDCCb|D*9@br3*x|euT;l1jecF4 z`uuTbF(qGYS~0fszq)_6nLi0SCI0JW{$|@m%o*ecy7j|8)q5lSjvZIVzO0nKc&~Zb z$hxAka`C_UE?w`m7)o{jBgY%H1&m4dazSC=XK!=9KDl)4L)KIxap~KXa+%fDLHm>s zKW1DV{844E=73E}pZxUVNasb?h-z}y=qDx*$9a?I80aQ*=x9vNs`}%<<#WbC^Y&~u zX-&*04n;uv(&N*YD&OwgPp0U&Kh)w~4$LYe24VkR5r~A-|HDe=ACty%%?D}^tRPKiW literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl new file mode 100644 index 0000000..578417f --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -0,0 +1,661 @@ +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u }; +static const Monoid _567 = { 0.0f.xxxx, 0u }; + +RWByteAddressBuffer _111 : register(u0, space0); +ByteAddressBuffer _574 : register(t2, space0); +ByteAddressBuffer _639 : register(t1, space0); +ByteAddressBuffer _710 : register(t3, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_tag[256]; +groupshared Monoid sh_scratch[256]; + +TagMonoid reduce_tag(uint tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); + c.path_ix = uint(int(countbits(tag_word & 269488144u))); + c.trans_ix = uint(int(countbits(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _135; +} + +float2 read_f32_point(uint ix) +{ + float x = asfloat(_574.Load(ix * 4 + 0)); + float y = asfloat(_574.Load((ix + 1u) * 4 + 0)); + return float2(x, y); +} + +float2 read_i16_point(uint ix) +{ + uint raw = _574.Load(ix * 4 + 0); + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _111.Load(offset * 4 + 8); + return v; +} + +TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TransformSeg s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _111.Store(offset * 4 + 8, val); +} + +void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.p0.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.p0.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.p1.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.p1.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.p2.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.p2.y); + write_mem(param_15, param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = asuint(s.p3.x); + write_mem(param_18, param_19, param_20); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = asuint(s.p3.y); + write_mem(param_21, param_22, param_23); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = asuint(s.stroke.x); + write_mem(param_30, param_31, param_32); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = asuint(s.stroke.y); + write_mem(param_33, param_34, param_35); +} + +void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2); + PathCubicRef _458 = { ref.offset + 4u }; + Alloc param_3 = a; + PathCubicRef param_4 = _458; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5); +} + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +Monoid monoid_identity() +{ + return _567; +} + +uint round_down(float x) +{ + return uint(max(0.0f, floor(x) + 32768.0f)); +} + +uint round_up(float x) +{ + return uint(min(65535.0f, ceil(x) + 32768.0f)); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = _574.Load(((_639.Load(92) >> uint(2)) + (ix >> uint(2))) * 4 + 0); + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + TagMonoid _716; + _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0); + _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4); + _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8); + _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12); + _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16); + tm.trans_ix = _716.trans_ix; + tm.linewidth_ix = _716.linewidth_ix; + tm.pathseg_ix = _716.pathseg_ix; + tm.path_ix = _716.path_ix; + tm.pathseg_offset = _716.pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.Load(96) >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.Load(88) >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + uint trans_ix = tm.trans_ix; + TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) }; + TransformSegRef trans_ref = _771; + PathSegRef _781 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; + PathSegRef ps_ref = _781; + float linewidth[4]; + uint save_trans_ix[4]; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + Monoid local[4]; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + linewidth[i_1] = asfloat(_574.Load(lw_ix * 4 + 0)); + save_trans_ix[i_1] = trans_ix; + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12); + } + } + } + Alloc _877; + _877.offset = _639.Load(36); + param_13.offset = _877.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(min(p0, p1), max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _947 = bbox; + float2 _950 = min(_947.xy, p2); + bbox.x = _950.x; + bbox.y = _950.y; + float4 _955 = bbox; + float2 _958 = max(_955.zw, p2); + bbox.z = _958.x; + bbox.w = _958.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _983 = bbox; + float2 _986 = min(_983.xy, p3); + bbox.x = _986.x; + bbox.y = _986.y; + float4 _991 = bbox; + float2 _994 = max(_991.zw, p3); + bbox.z = _994.x; + bbox.w = _994.y; + } + else + { + p3 = p2; + p2 = lerp(p1, p2, 0.3333333432674407958984375f.xx); + p1 = lerp(p1, p0, 0.3333333432674407958984375f.xx); + } + } + else + { + p3 = p1; + p2 = lerp(p3, p0, 0.3333333432674407958984375f.xx); + p1 = lerp(p0, p3, 0.3333333432674407958984375f.xx); + } + float2 stroke = 0.0f.xx; + if (linewidth[i_1] >= 0.0f) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth[i_1]); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth[i_1] >= 0.0f); + Alloc _1089; + _1089.offset = _639.Load(28); + param_15.offset = _1089.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = 0.0f.xxxx; + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ix += ((tag_byte >> uint(5)) & 1u); + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 6u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1264 = i_4 == 3u; + bool _1270; + if (_1264) + { + _1270 = gl_LocalInvocationID.x == 255u; + } + else + { + _1270 = _1264; + } + if (_1270) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + _111.Store((bbox_out_ix + 4u) * 4 + 8, asuint(linewidth[i_4])); + _111.Store((bbox_out_ix + 5u) * 4 + 8, save_trans_ix[i_4]); + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + _111.Store(bbox_out_ix * 4 + 8, round_down(param_25)); + float param_26 = m.bbox.y; + _111.Store((bbox_out_ix + 1u) * 4 + 8, round_down(param_26)); + float param_27 = m.bbox.z; + _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27)); + float param_28 = m.bbox.w; + _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28)); + bbox_out_ix += 6u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1335 = m.bbox.z > m.bbox.x; + bool _1344; + if (!_1335) + { + _1344 = m.bbox.w > m.bbox.y; + } + else + { + _1344 = _1335; + } + if (_1344) + { + float param_29 = m.bbox.x; + uint _1353; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353); + float param_30 = m.bbox.y; + uint _1361; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361); + float param_31 = m.bbox.z; + uint _1369; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369); + float param_32 = m.bbox.w; + uint _1377; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377); + } + bbox_out_ix += 6u; + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl new file mode 100644 index 0000000..9f6328e --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.msl @@ -0,0 +1,717 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct ParentBuf +{ + TagMonoid_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +static inline __attribute__((always_inline)) +float2 read_f32_point(thread const uint& ix, const device SceneBuf& v_574) +{ + float x = as_type(v_574.scene[ix]); + float y = as_type(v_574.scene[ix + 1u]); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +float2 read_i16_point(thread const uint& ix, const device SceneBuf& v_574) +{ + uint raw = v_574.scene[ix]; + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_111.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +TransformSeg TransformSeg_read(thread const Alloc& a, thread const TransformSegRef& ref, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_111); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_111); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_111); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_111); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_111); + TransformSeg s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_111.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void PathCubic_write(thread const Alloc& a, thread const PathCubicRef& ref, thread const PathCubic& s, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.p0.x); + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.p0.y); + write_mem(param_3, param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.p1.x); + write_mem(param_6, param_7, param_8, v_111); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.p1.y); + write_mem(param_9, param_10, param_11, v_111); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.p2.x); + write_mem(param_12, param_13, param_14, v_111); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.p2.y); + write_mem(param_15, param_16, param_17, v_111); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = as_type(s.p3.x); + write_mem(param_18, param_19, param_20, v_111); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = as_type(s.p3.y); + write_mem(param_21, param_22, param_23, v_111); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26, v_111); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29, v_111); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = as_type(s.stroke.x); + write_mem(param_30, param_31, param_32, v_111); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = as_type(s.stroke.y); + write_mem(param_33, param_34, param_35, v_111); +} + +static inline __attribute__((always_inline)) +void PathSeg_Cubic_write(thread const Alloc& a, thread const PathSegRef& ref, thread const uint& flags, thread const PathCubic& s, device Memory& v_111) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + PathCubicRef param_4 = PathCubicRef{ ref.offset + 4u }; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5, v_111); +} + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = fast::min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = fast::max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +static inline __attribute__((always_inline)) +Monoid monoid_identity() +{ + return Monoid{ float4(0.0), 0u }; +} + +static inline __attribute__((always_inline)) +uint round_down(thread const float& x) +{ + return uint(fast::max(0.0, floor(x) + 32768.0)); +} + +static inline __attribute__((always_inline)) +uint round_up(thread const float& x) +{ + return uint(fast::min(65535.0, ceil(x) + 32768.0)); +} + +kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_tag[256]; + threadgroup Monoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = v_574.scene[(_639.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))]; + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _713 = gl_WorkGroupID.x - 1u; + tm.trans_ix = _710.parent[_713].trans_ix; + tm.linewidth_ix = _710.parent[_713].linewidth_ix; + tm.pathseg_ix = _710.parent[_713].pathseg_ix; + tm.path_ix = _710.parent[_713].path_ix; + tm.pathseg_offset = _710.parent[_713].pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.conf.pathseg_offset >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.conf.linewidth_offset >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + uint trans_ix = tm.trans_ix; + TransformSegRef trans_ref = TransformSegRef{ _639.conf.trans_alloc.offset + (trans_ix * 24u) }; + PathSegRef ps_ref = PathSegRef{ _639.conf.pathseg_alloc.offset + (tm.pathseg_ix * 52u) }; + spvUnsafeArray linewidth; + spvUnsafeArray save_trans_ix; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + spvUnsafeArray local; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + linewidth[i_1] = as_type(v_574.scene[lw_ix]); + save_trans_ix[i_1] = trans_ix; + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5, v_574); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6, v_574); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7, v_574); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8, v_574); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9, v_574); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10, v_574); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11, v_574); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12, v_574); + } + } + } + param_13.offset = _639.conf.trans_alloc.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14, v_111); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(fast::min(p0, p1), fast::max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _947 = bbox; + float2 _950 = fast::min(_947.xy, p2); + bbox.x = _950.x; + bbox.y = _950.y; + float4 _955 = bbox; + float2 _958 = fast::max(_955.zw, p2); + bbox.z = _958.x; + bbox.w = _958.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _983 = bbox; + float2 _986 = fast::min(_983.xy, p3); + bbox.x = _986.x; + bbox.y = _986.y; + float4 _991 = bbox; + float2 _994 = fast::max(_991.zw, p3); + bbox.z = _994.x; + bbox.w = _994.y; + } + else + { + p3 = p2; + p2 = mix(p1, p2, float2(0.3333333432674407958984375)); + p1 = mix(p1, p0, float2(0.3333333432674407958984375)); + } + } + else + { + p3 = p1; + p2 = mix(p3, p0, float2(0.3333333432674407958984375)); + p1 = mix(p0, p3, float2(0.3333333432674407958984375)); + } + float2 stroke = float2(0.0); + if (linewidth[i_1] >= 0.0) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth[i_1]); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth[i_1] >= 0.0); + param_15.offset = _639.conf.pathseg_alloc.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18, v_111); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = float4(0.0); + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ix += ((tag_byte >> uint(5)) & 1u); + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.conf.path_bbox_alloc.offset >> uint(2)) + (path_ix * 6u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1264 = i_4 == 3u; + bool _1270; + if (_1264) + { + _1270 = gl_LocalInvocationID.x == 255u; + } + else + { + _1270 = _1264; + } + if (_1270) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + v_111.memory[bbox_out_ix + 4u] = as_type(linewidth[i_4]); + v_111.memory[bbox_out_ix + 5u] = save_trans_ix[i_4]; + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + v_111.memory[bbox_out_ix] = round_down(param_25); + float param_26 = m.bbox.y; + v_111.memory[bbox_out_ix + 1u] = round_down(param_26); + float param_27 = m.bbox.z; + v_111.memory[bbox_out_ix + 2u] = round_up(param_27); + float param_28 = m.bbox.w; + v_111.memory[bbox_out_ix + 3u] = round_up(param_28); + bbox_out_ix += 6u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1335 = m.bbox.z > m.bbox.x; + bool _1344; + if (!_1335) + { + _1344 = m.bbox.w > m.bbox.y; + } + else + { + _1344 = _1335; + } + if (_1344) + { + float param_29 = m.bbox.x; + uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + float param_30 = m.bbox.y; + uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + float param_31 = m.bbox.z; + uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + float param_32 = m.bbox.w; + uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + } + bbox_out_ix += 6u; + } + } +} + diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv new file mode 100644 index 0000000000000000000000000000000000000000..4e2e9d54995c7e17f3f02f9876384c618e18d991 GIT binary patch literal 35212 zcmbWA2bf+})wW-lnI!byq$SkQd*~$u2qlovJHuo$2?LXvm`nmF(y<{ZAlPXl(nLfB zMLE4(w|`&$HIvYwdFOd)_mXq+{{*N4Hvw zwU%s+Y0c_tRo4ow#Zg+VrCOt^dh(RRr))WIc*d67@36HF%eFcyKYf;?oy|HavwQjn zHOybU)w+ywFJ*c9(6)jG8ra6t!GD-r4?sGJK6y&_=%3ryJKVpZProsRU;kk5!2B7dLl+)p9&!fPnm8~p)Jr{zOyyofGiT208*ZU2 z$#`0=;i37xr*zNmn>~Ks@Z9;m!&_91wwS-oHoMN>W>@_>TTA2DQ}-obW|4lDYOM@E zeQy77-&^)mbyfA{e3X5yxJX|et(D*ldIqZe)&S4#>zVPE-C{SN)u_upSBEbgXIE=Y z_z^v`4jvjD>Yo7~#eZsE!*hEE=XLj=As)pnsk=_D!Rh@ohEEx>wcEIMb9#nPnb$X~ zvN*T4ac8J2rXSZNz6?)z|{uzCP!~Mf&jz^}q)m!UR_YTdT?lxHH zmjxOb35!(zzs;`o3cj^N)qh889Cc5P*Q1_Z;Zd!Pspt01nBQB49bd11S8F45^EiEI zuG2@3qgtC#AK~219GW}(@V;4x_02@>s&su8j(1dR%Z9IZ?+$gZ**(MXwx4rft;s;o zFk7DEmQ)D_8?$-=|(X6ViY(E_>Z{m!58}M9mb}ZY2r?GaE=1=eMbs$`h zMcr%RSh`vh;3K|Z$D!smhXCP@V-7*Wt9`>j!mItVoxI&=_DXyIjH>$dd|7mVwtYvT zGuWZin4qrKTCLsSBRO?fcj8;l&WIo5tfr=(wGn-b1v9r@q_lS#s>cEGI$AqZXC8L} zx972=rF~xI)79F8+CJ;8QH@j0YvFvZ-P#vk`*m0I|NoJHyU!|n_UQj^0OmiDn8|-{ za6A9b)+F%Efu33IF?6){fwysIYce%Op$6u_w<**SB&ZM)|T^$!l`e(q=;hRqYt zK8}iC|27kven*b@jcOgQ-_ZQQ8Qn96P9GdUv;Eo7(K-&@3GhBTX~bv#oVG`MznnPY z)7k0;&*_=lGrMZj+43GQx%$CAf+kCHLYJsyq88x^u1GH+SyPT(Ik+zxNt? z)radn1o!f@VgYp-&)gUCj@G$weQfuBYV+;lzdNX_Y3%N07KSVPmkRs$!GrAybjwus zpHXMs{snH&d1vbd@Z6r$t7i&xdKtbjhtAe34gOl&-y@5DIVl);>A!e`cf#r4W3%!f z)37hu;7hmtw~PPs4f~1>zH-~YIv&ku^@jaz4Zc>}e~0uxwqajy5#G_-5Z=D;I8(rx zw|y4z>ul}U;FBBtkOrUD;D9@CzIK;s(E@!7pv_D;oUD2EVGouWs;b8~nNkzpcS`aJzPfz~s2h#qyPD%% z3fl}8t^RzjvtBqqey8dk=%2GNR(lSXiO!?wLR-5}zpqto<_-7so)+uExm_;yMe=F) zyF51YSv1$SZH3y_KRB}(dsj7%6|u1vK2tMKZEdAWa~6%=)>f`Gvs;+YDroI_%P!K^ z9?PoO7JkPn?d_cWPGK=AJg0XSO-s_~!fo z`Vo!q!D?QMe$NM&~WPtQv3FfveyuD_8a|e&iSLHvdH4d$cS9EUQ8=|{Mr=MAU z&8>3SRA2V=@R@V`NI>goO+fQ|rDNGqJo4W5cD^}RqnXQwXW)7hHS;8(!&J<5DvgD?Dk+u2%|4OHE0 zb$=Ved;P9n@vhd+@R|Js1H4d~(f%y$Z0(7*aIQLA`!x7|4Zi;(ysI@8zUcFOXX~g+ z^DW4{YF_)`{qw4w1n+2F1n-$OtHSTqR_A*exXk5>MR-^1YB=Y_yxzGz!@ZoT_H{$i z{)^!9Ecn_Y?K@ldHTc67Z{LINwMXF9N5as2K3UJG`u_pi+@ba}#S`Fmt{tsEg0pT< zgUh`CvIy^LJqw?~C&uv5?0PLaTPyQ{T(()W!Pjf>O&WaP20ysLk8JQ03qF~>Gr*kt z*-vV%{TXR>ID^-xct!JUQ!D+r-7C%KfXS?ZZP|+8&gwhldSI{cacx@hU1_{9ZfEt( zr`gXcWA0|>T*<@!)zZJue(SYes{Z8mPgQZ>E&EKd7RCPZe3t&yeD3gSYR$3gr>lOB zHEqpB&1aKn&X1bU57C(NO7k;6nq${5<28oQT58Ez&3I~KDQUYzt!+?fK3nNGzR(=M z+UA93u4){SBmFH?YkomZ44<9!+o{leUQ+Y@S^ONQ+Fpg`^O4##xr^dT@-! zi`Le=TJ973kG>8)x&~^^Vy@nM+Ks<1wejV~9b02{%Q$>CtTp2~o=vDTo=w5FS6da^ zX4GRTtI^JTQto*n-=gM@VI8>d`t?ya_Eywtj>CS9<^7sizAQ49+B(e21ZvxsCF50d zd^=LxMqlU9dp&b#f7*@Xd+5a36Kor`aiG1Z(?*-~&~Dd$2kaQqc7L#KCsQ2L0X0^) zIRA&#nsa+7*j(+?e7)Cg@4RV0mfF76H=(V2;$+I&-~noF>kwyljpge0HArpV+IFq^ zP_4O7=YYpj7Ncxd^S6U-WBcJ6+oyKxvngJivr6un_fXWDCH-BD%`5zJa2f9!xc5%% zH-gi1Qi$W#JjG_Eq8Kc-MjZS?c_60Cue26Y{NU?tYU` zsJZ(m{hx(qzu}j`lfU{W;O;f;j_*@&BywL3Ncx3!x4?4xF0#+@PIHVZ#h{^mK8J!j^D z&DnXjzhO9OSGxCs+-D~>?F(vb8*}kFs*TsKo}a{J(f)raWgl;ajqhi!&qd~THZ}iO znz_sU+*QjOoeS}~KyJU6Xk>6}!7l}SpQ_p42dUj}>h>r1orGHE#?Nt@SN>PlHv35b z*P!|HgZTS7ufLx=ekT7ZN~gGc=b9b z_wjRZKL=xf4W4~&K7?GvzYAWTBTK^lTyf4;1-r+*H;unq?W?YRU26M`eQa%4*Z(wX z!dH95eC+EPv|DevWe@KK@7LGixkuvD=_0gRdm3YHu=h-2tOGY+b?u+5?OE4P!TG<^ zokO|bA=S+3`WoA3=IBv0?{)2e2iw1VQzpy$Ike3gsrQr5J;w5UvrJ$djuDsYVmSQF zYVqnUF}`!}`)B(M_Z@Tiy6Cr+&wg|;QISMD0ca#1^4Xt zJy2=)T~N4o-vx!+-uFS_+I=4suHAP+;o5yC6t3O(LM8XTP|1BSRC3=7mE8A2;eNLH zUMSqps8b8>eELo(c0c=kClqdez7q=f{_vepxX)+46ACwf-wA~q&-X#$u9xqF!mIfx zxc&J)sI>b&sN}v2D!K21!uO}W?}EbZ-}gb`+I=Tfa^DGs>+k!YaQ%G;RC3<|mHfVf z+unCTvCDl26mEaM{|VRbJD_mwz6UC~?}AG1yP$B-6W;}e`~2#AppyF@sN}u}D!K20 zO7450lKUR0?=f2M$yK_c)hM!UE7vdv76Rx%@`)d!*LC-L? ziCBFeJ`e2Y@5SJAsngH-U~Lc7{agT6^LaWx7sBl?J{N)2^f|X~_ingtwI$|zz&`&N zQ+xV*FSxRir)R4?wu`}Pj=^(PzB4iW9RC1#V`_Enp2KRHr^~@U1C;Z01zg?!?c>8> z+u6ovgsUh%N9bpta(%>}5oR+>C!SntI~j0xsj<3OBy``E^dWfo-QPYw$U+YoIOb zbURqxb@2039@`yYHEn*D%453|?D}f+b5*`Gv*Fr*5xhRNZM5m5uexiPaa+IDI(!LR za~{5ork)(`0^4uKcQ@Ga>1Ry2KE`wpIoH;Z9s&wL%MrjL8(n_w^ZjJ9u3)EtL6eSaI= z?E4Wk_3Vd7!D{Xk&yB~x-=REA@!ZhvygWetU5d6t>%H`Nt(l84J^R0h<``}3c;)(~ z&G*6f!|gTF<_B=wWPauPrOgk)w&DMgHa~*c#xu}3a{bcg$6(v=Q9sh=Cve+bQ@4@p zmo`5I+lG&ykv2br+vdi)ja)z1K67rJIsZAhxj%n_rk?fuC0H%%`D?J3>#6Nm6gAgV zoW7p`H~an#ntImrw_r8bGxx)jaL3?0{f=60pI1`yZHa1*7bi1EYCgsN3i#> z{by z4WdT3BjZ^dY&&h!*pr)R2CEzYRc!La9|cZ)zaN+JN5i#eY-7NV%{fj^OQ5MIr=`GZ zez(pXFAcUYZRWHG?Uw3?uYEfHQ{@}og;1WTMMl1Ec)LJ+uC3?=gnu} zb-;e_vwz1Z*C*}Q1DA2vhr7<1_YL6VDEir6u20%;1bzqp;Tyy4FKaR$tmd;*?w3vA zw$o;xe(zLEADe;IzFmJ_Yz|iQU8eqiFI9{G7GSko@%MRcOR!qb$*sX&u7S3#DAPOJ zYamWP+knfs+rrh%J$-KnS5MzNfYZ0O?J0*+Twihe-U;maV*Y+_miv6{yP2KAhf}L- z_j|Tl+U^RrZT9qTa5bMb_4j+WTKd}qY=7$7{a&t?{`LerUwO9N3#?{;J}2xA_Hw?o zO{AzfCUN@N2VCa0FI+9(W%dKBC9la~^HMjjNz`iTZ-20P-BPdP0bn)P-MkJ0dzqKE z11W0eB~CvFgUh@Qfvfqh+PG7}YRPLF*u2!WPoY-xK2QJF^2~#^-1RrUYo7RrgR|DY zvydm&5#XbUrEVj9T*Jsqx=wVna?GB0ht z6gBe_r=LD>nb%CXTF#1DV726R3fR2V&C9h`b1l=qwLJ4+U9Poj?zwy#^#CQGVSa|1 z*E^|a*S0((4#L&Ea<&e^Po;RKn~PjO`|z{#?O;DMwao#`eV+2?GjqY)QrljeKK<0{ z_V4?vd0@4CuKM$mVYvEKo&h|2=7ZHNUNo!P``NJo&Hd?FXHI^;oDNsNv_8Ae0NaOp z;+_d^#(f8xdg8tltY%?|WWO-(SrFs8=M(o`aP`DJ8*Cry=heBK12!LR=HffMbHV14 z=P<{79$Y=`&Ij98-FY;&TE=q$xH+B+(bO}Zi@<6YuW}yWjkXJ=oX7XT)ia*=g6%^+ zao-1S#(h7Udg5LTRdvFF)m%g8{lj4AHvCGkK1+RWMa0oQ;xrQA*) zK9^hzR?FFYJ=n`LPTO^q=P90X;_QP@gPZ%{1~m2TgU^7~EMDeod+*~L(I!#K{r6e8 zdiKFhVEa%{+?&D8xVNCGC+@9aHA{2<-G*k|a{qk}uAaEJgY82-`{?sv^U-E5?!P<0 z=2GszFTmB)?oP06)m;x`tGOPo+ZVyEQF8whSk2<)oMpd#8O`?Dn|Fb=yEkp8-Q1kt zyTSJB{AS;L1#TNJbJ0gFZN3V&jdPPW_rMd!Hu|U~|F3~tOqJUnZAtz-{AYF8ZjY%{Rfe@pCq99)Kr~ZS+wyC+~-EfxRE{eD@$& zEuZ-hgT1^Tv^_-mGsXKsY-~T*z71~f%SX`EGdGWd)hu3)J-I)IHic5&58r{SXJ397 zY#-{0`#886_j_pSiTi!9nk8$JxIaKMZh1fa5U!rMKLXo_`gx4kbMwbw^U-Es-VZ+k zn@f2={1mR9c0U8#R^9b5wp#Yn&%v%?a{mQb&En;pWk3BA&Gxw;eg)R<{a`!o8RxIT z_FLW$Prz;CWiI-trOj`^wkhw2-@+5eHu|U~|KEYj{GWuYSu*#@|0%RG|KG#S-}}LK z+LOy4!1i0-4}XN)#>-svQA?XYfo)UX4^P7r$2R(?`7G&PcVD}|ljC2&lPLO{t30;9 zf_EwS-@y7#tM|y?!OzrXX?_F8ee*2bG5Wk{d)w$||F+eh{+|Pv{r>|#fs+3J32yfP zFSPV;d)w$||MsQ59IIoNUr+7WpQldze}iq4vCCuo57_w#e*s+P_9DEQ+e>K4&Gxp@ zFJsr9{$2*#U-*B)#>?1W0XO@96)pYS-ZuJW?Apt*I%aLJQNKN`W^9xl={eITk=ZMANYCEvW{a)N5=knatwm3!2ITj~Y7r2Zy3T`a*#2O7&zld1I z9s};AsGFnb?-JnVIk+U6dd`xiz-ksR@5iMXv3)L$=G^T~(J%cj18(-aESh@yT@I{f zu@BN8>38{BvtRwPek*{DsXgcHiePoemvvEd9=vx}0=rh{)O=;QTI{RT_I&oO3fIRg z+G@3Cp3zoEv#nROHEK=2JY&8M%|7kJHu@Ob=a@CYo;ByO9`d!|YPNAK+SJ^C*$39P zch8LlyARUeIJj;6PV4%v2lsay^t%{;`&82}eXI{|@1;@I&u=z>+eY2bB-^OvGuAoX z2=0C6dixo+F?>TxV#xI|hW_Kho(b{a1g?Mjm+KS%O~IaP@!t%te}2{@*C+m)gT2q= zzXe?X{G3Owk9L1XwI$eh(B=F1t z^I})H=Y_gH-pjjz)pIuO4p#GQx|#jt`1XLS``PZejpgsgsOLS(Uf{e(=^}>x`yOQ? zx;F0>``X(!6t9eX65QXd$v%?nXCHguw=cMpK8?E%Se`LX23NUY+Yc<)Hi32rfSn)P z?N2S=na@_zY8?pPm)f}YX7n62G)N1fF)rmFt&2rhzxa zKkvp5g{%485ud|qpS*8B9IoblkakDF)m@Y1b0pmUv>AIUwOaZ;8mtz6OoJZ_H@Cz+ z4zA|?lDNmi)s36|ega(G--V9fiE#VZmi$fvn}asxA=<4wC^xIbq~?D=pteT}0{E&dC@Wt`LDYGs@=;OgbxIuou= zmm_0(2i*6n<-Pe%H1*_j7Ff;pnTvOU)6N{_`X$G+!DWu;z|}loGZ*K=)yug!5AM(U z68n6(n!d)-rdIA#_p5xxdf!?nj|=cEKPxVTt7WffQ?vbYb^8^l9jCu{_ik+NuguS6 zwD-W(&F3O&x%j=*j>W#-2lgJa_GhLSQ-6TsrTvoHZk=2&1>46R#P-?tgJ3m3`+T02 zzn@}X_AO4^4}onfH^g7nTgI)P z_#Xk6@jnVTeqw$MY)t)ZFW1Mpa9$jbb^7}_*#5E)J^@zCx@l7@=h}HOp3fH7fSbSP zaV?s9#(W((IbIL`H05fFeGM|=bI+023QJ_9yiYuEO()HhMQwBKCYtuvlm!1j@~ zy%nsM-=mY?NU<;b7C%Yfe($zi!aJnNZT(j3@C|Iu zdH5!pdUALGY`+=bx4@21KV!=EF{XRSxwf{w>-HendCYzA5Lhkufi|^rU%H2yIj6sG zWAk@qjQucJp5H-v44miAN5MJs%;P(7?RlR0E?BMn9hAr6_L*my?}63)S#$FLKHRq2 z%;6DgHGj{~v*w53snptj0G4Op{RrHg?;oS7=l8;X0#@^0bx;2c?B$-;_EUwXr;9tS(Qfw3d zzk${D{|mJ|_trCD?=Ab$rq3U#)y2WS7ejOY&C#FXEDra(t-AeehSmXA_jjv24?4kWK8O3e`CVZDe5>byF`NrE z{gTTlaGA?!xSD_NGCpJA$;J6v0zQsh)Z?=xSUs_q0w=b88e2`j%=^;d=DaV1rk;6U z7OdvyZT8D@aP`dl3SckiUEA`M6)DcUxVZ=P*QQVMUkU6yn7{MA3f$a0Z;hj$bG|CN zHv9C?Z^-@K4d;I~@bT2@nTOTEnFnLoPEFs$UISdN!Q0?!?zQBz7CiZ=C!e*^wb`$K zeng&p)&ZN3dh%HpoP3O9J2ici&scDo&p5c6Ymj`_hbJHP~Hv9F@ugH_nhTxOP zM?Lv$1WrE2v7MT}$!BA5na_B*nrBh+*%Y39@~*-4+6-Nr{rcx;M8S2`=;53a*yl-_oXL+nlSOJ=u5G@|+FU_U-qJt+Cn0vu#_jm-||M8;Y8H zSDdkL2hP~bwb&kATl{t?{LZG|tl0!~ZTGN`&0$BdZPjn8^V$ilZeF`kdzqK|&J;EC z5+~lS;N(@V$!_S{; znmLQJwv)iFpSgL4?gP($(3YI{17~dITuw&U=GyN|Ezf@2AME|8{w$wso*4&#jp4ZM zN1x0NVDYB@K|?*w@A)1LfJ0w=$6?{=eWGrtq5<;m}4ZS<+0{CdE~NPhabceD4blkaqJ znQt##tvpxMY@c($TK*(^*1c?<{$^nFb34x&eQ>pWPtm4k`#kIVT>3gcx!Q^1pK10y zSeD|o6lF)oJ%)Pe8v8u348_>XQNKZbGIiSXwWj*JXQ-;bCnD`vfQxOvVu4oz+n0a; zkAF_uSiO|dHP^qd=8jW+2F3nd6Mf~GgPCCa*JeAp&%};v7R9mIMqAFhQ^49hH%_JY z@~l(ur>L2u*trs`CFWePF~jG99arwz`QYq1ea%fxzjE$9AN+fDyjG=T?)IFcP_O&whO^(#t>&ut_k+ft$D3Q zF_(3y^?MI>>9;=IYb?b%Uzd7Zf!71uegkUfU;Fzh&VQbBKLECk&yvaQ60mxFE(P0H z>GMIjdhXv3fo-Rrc9(&TXFKOWu3zj|fQ@}QC9yvYR*%n>U}Kj)SHaa2`y*i6si)mX z!N#_ovE};3{xPtz-%d&FkAv0Y^9iuAOP^1|)f4+uVB4vu-PK@Y+s@c>{bIisZ0z!9 zq1VCH<8wXO*rm^>;p&Nf1K4)zY4;hhv2AB;xqh*K7HsSrDH-=oVDKO zv2O+2PCe~z0~_0R#+K`69QXC-DB9i2;&Lx<4ENfQlJjFDu={sBb^cz8JHYPW?B_2P zdR!0HXOGDBN!eCka(xoxTi`Or zLvXbRDT(ngSUoYm4OUBxN5IC&ev|8y7>|O>7~g@bJw{24?}F76<8iQBVtfy5jEgC* zkzAj|_yO283jZP4n0fd3Be0rtoi+OjSpCP8?CGC^?Ms_++)HZd_vc`>@Lx3eFX3g} zU%}O!o5cM!SlzhUQ@;VLKS7D#Z^8DjE&2TpY!2GY&%Ldde4YfWW!%39t35@D&mX}0 zq|ZNs)$~oDYVrRQxa{-KaJ8okpTE>T>GQ8}HGR{kTKfDO*giAXzk}7Dp=8aU1-s@q zQtU&npM9)J{TxNRXNfpx$(G=4DPCJqJWIwVqa-*^Z&AODSqlKXEwgzN9W z@er=R|Hebf*DARFV+*eTdIjGAykX6=UeDv>yn2sjo&OD1&wl(5Sk3#;HFeI_;{PI8 zE&J#tu-e%aea!d26xY)Bu8Caidg$---ze%=DQi&l(WZ}Us-AeSgUfht!qwiWeT=sl z<4io)RBm6cr~Zl80XLpL+VpWxs3%?*xQsU%u9okZ##;iOc`2ww+gt7w;Eh6?~aYP z20ZcHlXClVKkA=&Yih$^A8q=$r_~d0ZEzWHUAS7_4;ybBJn`Jqa{F>W>z{b*gN>(; zHhsJ&)Dv$buzSeQsm-W2hO5Os9_*UOz6o3{_D#XAQS6(6)neZq?3~8F1z0V=*SICv znD%R1?cRU>e$Cbt@5y{d%Jom*+klg_GQ8(rU>K!S@ zyqqyRUvmBI({p&I+U^-G&KbQpw%sURJ5h2*?_A>@sCTLHj?}wSJg;}JaQp8o+m-%Y zU(avPt9=UYd9`1IPcFFrhZJ1@X$^j8!Sz3;;QAle;KvtS|LFzSe@27%6wjf~UsZ7Z{kLV3 zpZ?c2_;m%>|F(kbe|v*}zTo=bU2y&Vw`IzB_tZR}U%SEGbLHpP?r7@y{MrMo_ICW- z@9x1$V-Nb7vnO0FpI>`{)%^U*ySs_{)N%5;=2@~gx;F1W`U}0RKkqT)h`qmTms}46o2&l%X-}?)gUeixfUAw7B-bPLp}l8OIj={dYfG+2 zgVmDjvEbzDJuWvt?{DLXkEga>ayCxpwPQ=UUF|$>`dW zYY$j0x%PsS>vXW({5)HXBlcXeU2>fXHdp=i)1F*sfy-P^fvc@bNv{3+(B3^?&g-e@ z+LG&OV725r8=PD{+vMgqNNpUkXPWJj>m0DT>aU;nv?tehg3Dar1y?(Zl3dT$hxXS{ z%6UBpU0ZTJ7p#_C&j%;h^T2ZRyMWp_;tQ#5ms~Fbo2&l%X-}^22A8?M2d;J({>k;d z`qa6W^ZGt?ZOQfhV727>0dR7?7%VrxOQ?+_zLeT_$@PO^bJbrz?aB2+;4;_C;cA!F zKAw{w1}EkfV7YOwq}D&@{YSu#$?uox|D$N?`d>vYPhKAbo0omsm-e*#IJnH~lW?_9 z)IR2Q4LEsS4VD|{T5A21*Y#lY`V@8g|1?-#|Ldsb$?FENdD*9ZX-~V)fXlo-3s<|5 zlC`~AAKKqUUGBqM;M%gbw}RE|+wpu3oblWSmYd)0)W#8ip4xWF^$xJP>aU;nzDMYb1nDbm(jH)*So-K$@MGX_IY+6iqEtAQRn&f0s41d^FHFi27jpFKHq$& z;65)uUT|Z4ujbiT_rlGme7?C4O+C*y_k-0u>$5IjhnstTFZ3I5wWrY@;(LJ4D&K^w zXH6W(x8M`0om}k!uzw%yI%#t*<;n90V86fE-ger{y)hZv6pGgY6!*sd z)CU&$Ah3JjU~1P~`w!{IxpT~0fqz8NFKf0r_{X&#pP#^Or;oYH^~t>b6l`3di?XIa zLsQTDx1WR6T)+Gt(=Xuao(V6}@2|jK{;W*fFDYt{L7aQ^*Wlzl3T({HX!itITl{`g z_!%R9zeU$}FLSUY`tQKDRrhDXiD{kRt9TM@PR^zN=3~ykr}i>ueV(GInX@>#`~jSt zmjx%6KZ3Qz?@xta_PS$x8eLm*{xjIN>gJr7*2(!VU~_UD`kRkA|Bc$qob~xDO*`6rrs_R7D&YUN&e9^BLy0+vs25eh(=Q1&^GnY$%&C~hO-+c7V zyjjP0No{b~!gluK_?D*ja(wzMMNxBn;_TUFz!_im^|EN{BY%z=tY+~d)JXf~(X!{$ zeg!o3v|kadX7RFp{%(eq(BvCZmILdXIb9i?zFq758RS*aZRh0~te;xitqM*%*COp! zLr*;0>8F;tSRLG)i#5>HGnTi3)hzr^FALkRiIBLNi?z_y(|&ERnuY(X_L+-y5S)wg z6yxZdF|7+u-`*b?(^z!dc{wKi)Y5JoIPKiOX}2DF;@M6=HTPNeiM2dumvzpP^|3iu zs84tFtdfIIQb{xhtU%7sgQ z6>dKn>oj=Asx5wpf-4(iGlo32!@+9VpGSbzeExTC=1?CASN|5zZ1SVPYWfV-b96LZ z-T5rcqwadhvmV}C z$5XtHrMRcvzsFHrj}xfV?{wO!=kMm|g>xiSKabFM9`j*7ePC@fD4wlqGr{_Lw(bDd zXEym8e->D~xrxi%PQvb0=5`|7+`6fg+iA2@Pi_Nn<9UB-OK!8l+REGpYhOPDPX+6f N+=jr~%}s1>{|_qA&e8w? literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil new file mode 100644 index 0000000000000000000000000000000000000000..4c2bd233ca3457688916b6937bc1b15e1db950ba GIT binary patch literal 4644 zcmeHK4Nw%<9e;bfx4Xcxx5pi35zsw8P6U#!gbx*L?+XDv^$7Tt+Q}X$!35~>5hCg| zd&j|D2*mR%AsXAm4?@#SLK4z|$vAgFEFq{!oQxmTfqr|2OP3*_p!!y=SZD%^u zNoVqA{%`-k-~a#Kf8Xuf{k=1;(;IAizxmK~C@MNc_Ug3*m!$0o03a?500LgKpqZd; zfK~<#gTaL)0EN&BmgaHra;N1L^zz60LV+Dn}!ep4p11b0jPHbG4XA%q0Fs6dM z-r&8cmI2TUxez<+pI-@Mge`WbZUD=}>6Asg5Qi*YN=D4Vo+}bm*%JM(5dC2~MYWJ* zmItRi;>vZaSi@4+5fG80pU9=$J7fCIx_n#nc>U!jZ2nR_A1+S3x5(x(@n;~kl8(^E zDbYo}U<5Co7G1TXbn- zgVwh%mTd9~63@=}Hh13&+D3l(SkUHp&D@^yLRch0ipfAzc7z~4jiyPh*D|8Y@16MR zSz~D*t=QF-pRF?{fQ_qmx4XhL&opF|#dKL*WZwCXgKUfn^j2dYpj1_Poi)5Oz0_G( zib6m14Kr0fOl^fdkZ|pIF&phCJf_$xGSwzOKjzdnqhpBCTeFg zc?!-jlQ{`P9s=f+k5O=*+cWr?#qv0}IyTz~YJ0)ZN=7&rgmRe{P%=;EHO!F&_B8;F zSvc+AesIk|RwtX;4@8h}%hQUiWVh3Ga6S5Y($M%zGOE z!_R|A|Mn7zx3sX6Lra;{%0wtk?&cs(&T-5jiRUC{*`|DKlbMtzn1zJP`Aw4S!r&vu zquP!ZFnj>=){rmeTuLg@ zJZ=ZZ$T0nSmxU*V{;mSf-;6&ae*$2Y*c@m+tqNSX9b-KE18v;=RD`|ab>JaF6-wFJL^C? z?&zJE=)HOD%E`&|$NFzx8JyIR!UywAD<4X}@!a*Y*FSr%rR;e)ctch{OdJ`UaJfEg z=xJ%&d*aR9>iviI?`Hw?Y-+~Qo*;=^(U#QrP0_ZH*kb+K#;?+HzS68s*MZboPi?NK zsa9hA1$M73;$>v^Vc#MA%B@L_K62V2JN3as%_m+rRV*`~QTJ89WC)2(+Wq9#*rek9VeF=XsSOi@lNYZ}93Px) z$JWNfu}8DI-(oY~%{a;(g;Vs*N7g!Jr;ZG@tb9m*^}!P@6&1@k{3meS@rzeSuMAGW zVby_Wn-86bNg=9#JgP*6EO8kYI{KRR0#UbxsBW#4O#=C6H1&l*JuK7D>GegQY>O)4 zr2e$}$RTvefMH29x@2febB&eQMiOT|Iz>&H zsfX$Q=!@#8v+C$x%bXJK+06IrGlva}y3nlm>$4na=FpU?mM6BA5*ek=D%cB7tJ`Vi ziP{9Bc3KLjeOhonMmj4QXH~1SuGN-&eTuq1M*Yc4UE|QAfwriLwx}V0KWd2@R7dsO zq6aPbCvL-{0mGtJf8X7=aF1c33(efq_XAk;mOb+fn%UVW>sJ%ot?1SSXRY9jIBlS& z$M6Z3`u!OGB~9J*QsZv<4YT|UuHH6)%pKK=g?cT~q9cC2Dp~kFg;y!SZNH29%Yhx2 zEnhRC*h!BsF>QOrvESZEjUZ)5`SAOOi{kGa-k5gZa7p%k!=*0_%U-;Z^)p~p=#9s< zp`x!1mn!yB-1I>Cj}2)Xj{R{OS*lrr;{FV)AU=>Q zN)E`?738W!2e#r}BP|hEE2_?|Fu##gzNT2z&w-#BJ*|Rbe7Bx1tqOPWMnUSprLWfS`C6A->x^E@*UgcaMfI#pbqjJL6?q^1s7LG;{B|Kd|)Ymmxe+=H*%M5J^46K^7X@6N=GsjtdFs_p7dm67d_%N zeTFq$0%G}OKrGApI&Ex9%LVfWhJWfm)F}SnYSilJl&lw1G8Xq_b-XJ| z=X@REKUaLOJ>P53|5|%2ECA=>d#J>6lEnkH_&*FQ0fVoo&}D8~ad0AdfhQ{Xpmxx$ zP`1-*oh_I%yXiKq4MC;B4D>x&gU@)h=(2;`(PbGPE1KB01r0vJNND%LJXVMiPgFVCTqe1wX?NKk0(!t6D^&e##Ji$u>^()#` z_QhWY>JKy&kcs1g`o#}CqV8yz)oCyE*DnrPha`1K58kQ&n!mm$++W{gf1w^X7es2; zTS3}8$hyycd;>CfWL&<6nbJjTEf9R7RzYBkHj3c9Mb#MU(nkmu)Sip;X9dQMFzKwL z7(m*~2= zrF9eC$T#@nuItuWxS;LK8paee*A$)ExHqZr z*Rb1ssxr2X(&e%7`C;)sDt;Qh%^Q_4&6by-v4m+(Q)%4+zEBC%e0kiw%CLFkRQ9a^ ZzflR(kYx)KY+(tXQ*UE9Jf8X6-vN|+=PCdI literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl new file mode 100644 index 0000000..5f7d125 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -0,0 +1,138 @@ +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); + +ByteAddressBuffer _139 : register(t1, space0); +ByteAddressBuffer _151 : register(t2, space0); +RWByteAddressBuffer _238 : register(u3, space0); +RWByteAddressBuffer _258 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_scratch[128]; + +TagMonoid reduce_tag(uint tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); + c.path_ix = uint(int(countbits(tag_word & 269488144u))); + c.trans_ix = uint(int(countbits(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 2u; + uint scene_ix = (_139.Load(92) >> uint(2)) + ix; + uint tag_word = _151.Load(scene_ix * 4 + 0); + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 2u; i++) + { + tag_word = _151.Load((scene_ix + i) * 4 + 0); + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + _238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + } +} + +[numthreads(128, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl new file mode 100644 index 0000000..91e0cca --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -0,0 +1,154 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct OutBuf +{ + TagMonoid_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(128u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_scratch[128]; + uint ix = gl_GlobalInvocationID.x * 2u; + uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; + uint tag_word = _151.scene[scene_ix]; + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 2u; i++) + { + tag_word = _151.scene[scene_ix + i]; + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + } +} + diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv new file mode 100644 index 0000000000000000000000000000000000000000..f1d8679c961cc8dc0b74b6dafa1cb84cc2a7e80c GIT binary patch literal 8300 zcmbW42b7#u701713nA3dONbkS8fg-xo=_5so9JRdf*>Np?CfNBWOinmolW2zltm+; zh`oyvD~gI>S5yR1Y}hNJVs8j4N>k9^@0;)LzRU^7b9``l|NFn~mHWLNdgd(M{;OefJ`ICC_Vf$m-!ZyUaAicl?Kxu$m2S~x4CZJWVM~)y+^h!e5E-)?AF@l(b9On zu%(mj)>Nf^(9B8?np&3ryKMRYbRcgZXF8v`T;Oe(hjZMYpMR@5GF8dTE}dGoT>n1q zhjU;Z8=5VTC(p4nb0x=p8E)A`v({*rD$S`zJGUQ~*v|qtO1Z}*Ft=Mj)^HKn6Yk^g z?2FNQX|UcLF4tE#);BBVcCFc1ePSNhmtD#{S*da|ojdV9FN0fIuQy$f=dJFTJ=5Oo za`Y3Ljdiur&i$%88l`HzI$mwGS(*1P@BOmg&MktyFM02_xbsG-U8`5~n2yc+9jn$x z$J(V@7$cwetb3{4S!D2j6I^o8Gxhu7C)$;I7^1$t7yVMCnu(c*t=wodXJa~h*&4pn z!<~r8We?>2I6U0Epa|;c{N1NLG14k;==5D=oIRhsKkK!Lutd)7h^xpC*DABNPR{Ma zHrXy$&I{}8+3pbb&DPU7Tl>%Tl(t~=&CphBtV{OZm+xaIY}|!kFz3|a?Ay-WTD!yT zl5?(nc0IeobK{B7vZ^K9w;H>Ugjm)r9D-<#cyZOv3WzX|VC-O-$C4|~bp!#y1e+@Jpq;t;c@FE&EpiN! z-?!SgVj;2$|9E$opgZJzPaNHxY2-YE@UxbjZ&`3X1?QU+oH6?OE(GUWD(ANx96QcA zzq#OiL-mU_yFTm54ItsSb;0eIaPtdpX~MaGV-HL?*D7~N!udAJ9hz{yiE>9KobRCA z(Fx~1(i&v*b~Y1sjV?swq`-FG9kFF|*`Vc#F@9ztIRwhz>; zdkoUU2z@2GYY6=Wa4#eD)#xi2p|3&TkrDdo=mU(<&q1HV2z?aYeX6^^I^ubRem>Y< z3;*?Cdt!I|U%>2|-H+$@LT2|azr4`>8xZHA?=<4j{}N_x@{xZd*z;OM*2@apqHLf1 z?}UD-X#amC&GA^+`o1lfBd+yj%&tq$y{TVOaB)^INB8~H|H{H<{(Z<5Z;yF>Q`Gf) zL*XZ9UUlD<=o#ZTA)eDNu-7ohS$z^bjVvVYQ_Nc-a_0RsvwbCRUUh#r$;I9HUYS?_ zn+lsbBL7#x^KkIn^6lBuS-CggAnoqWw?^HZe_?hXa@JJ$y^)JG_};`C?kjBOGH<*+ zi;6cU>dTvd4uR$}f%$U1;0|Q9_Mz{HXR@h~!^vfg=isn+AB5o*%X9dxUxauL=2XA5 zOZWG2*!@=gog2FTS9R%Ecj?z9y7B(rjd*o`?}l!^n-g8TzhA?y?(f%Bzq8O+va3tL zFG4OxrjcXVljrziM1CVa>Mtp{jm*b#DxQJdO04$sD-iqL_s%|#Jg)?6yQRqUDzMxV zBz#^CHs5sN^BS;z^49Wv<--3>V7aLMX0YFcwY7!cTfo}P;hm_5?QLK=ZQh6aQPlUG z-i|Clx}@eXEt!})&B?On|8MjYDT!>rvo>b@6jj_av)4fFdDInVWB za;U!(F_*F8h)Tbzx0*gg!7v#}S|<7}=6 zdp7d=dtM(wJR@`6Q0QX&RR5#s5&b@fY>VilO&{arJulDBJv+x*J`PUz_z85mg^2Om z9bi2c^45$oN;?0TKO|67^A zggCT+nOVDY)V>XDj=2A?faO*q-oN@65px+Uz6*XMv*Mc@d;1!=dv9NdlaG4e09)@? z_SVmQJF*FpkG}aP*f?#mu5W?meH$NO-``<&xJGT?M&w+hIM(z%aQB+N4<{dM`T;my z(+|-%A@Z@NAAyb27WeXFu=k=Z?(QdGdGE$|Pd#iu1$(dB?qF6A+t0xIYrB(KJ^J+L zVEfbtfQ`9#QY0vOb-$<{{|bgKVpo!ei8Es z*qB}vkWmtT|e(G&e!=7c621-@06|3?Ni6r$PvWNW1gSjZNT2mcFg`>(Y`Zc zthKji-VHIvvy8jiz2L)V4|H>-K6|3eNB`{wHcmd`_6A!o?nzz0u7lGx& z=W$?br#=Jd@=<#+*th{C;`Rkw+c?iuUB9sJ2e!6%8?_$~mJgpNfUTYSJP}51&K9)=qs6Lzj=*hl7oikGLbi){Z__*Uvik+>wa(N9jp%+RF>j9XlZQ zvc0(@V*l>M9KW}hgUuEFd|aX%b5@sLF7(*z(dg#At$1IK1AX$2Q&HHhxoqes;BiMp%6={lc|F1H$q8qWaB zM~xw{T+}!jY>oH^)b)uP&jhD6PC=JjgG7x}!SYe#Szx)SaT?ef(QoSdM2)rJw8pd1 zjwF)b)uP&jou&p`Q)5X8eE0^T2YRb=+AAEPoCXJ$)|N zT-xjfd+A&x@(zRLLa%h`Bj{<}D!QC!6Lr^t<*ge%H3pU+MZ&KJHovx5-}Aw)L7VHd zx8-6z=Yi#7-{WApIubq&us)Hq36|40a>|AO1USvvLYF%~@tG`qB4-<2PT$BW7dfZE k=8U~=0L!gM;?6Gsd*{m#bExZQ4!{2k5$(PuV&9Vg0@&-L+5i9m literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil new file mode 100644 index 0000000000000000000000000000000000000000..77f12e6db0ec75e4d05be227c033a9793a2cc1a0 GIT binary patch literal 4716 zcmeHKe^gV~9ltN{<-HI>@!Lj`5ltuOp4TA)EF1+m)$ zB1tuBRIn&@3;r5iwM}bRp|+lcKt+lco$aBZ3|3^cQ%kj5Jw0ySeE~L4xB1U{&bD)U z&$&N7pZmSv_r2eDb3ZSJT%G>UC%?3usruDB<99Jv>50m#2mqiY7yttHDXl4J&%>A@^pIwbw^0GJV~I;6sjM$r#t18KFxj0;;Z z`Q0XMR-I$4;o3HAK^mR~gXLlib&PV`A~d}ApvEa~hP=!&Oc&>%_r72r##rW%@JrlVk8SHFviL9 z7*eH711o5I69P2HL@3ZzV1k!x8{hw>+vdDW7B_Cycn?Gfws~2eYxn659d`q21?Qd& z*bo`@Kxb(-d`L7O6M=21AuN~iZ4(;LhD-9SuJPBV8dq8qMN(BpXpY=q?Zdj2L>Dww!=dE&|$=j*6k584rHu5y^v2RYa-*RHe|#^IF0s{X2=G zv#6v-K*2bti}IFI17?p2Js!t`*|W;@HUNa0wgMjKS%k;T09LgM!#pzp=zkF)@E;bA zctZokuv5=#k%vf)PDeo0^7k-3FOue&M1@%be!9uS+xj?bDl`d?K5R0kYCAOTJHWs} z#6@%OxCXxKXJP*X)aXp($qb8?`GFP5=-xwUq8OkRxeK+M+% zI_qlUGAjb}Ge0IV8OH9XzUDxumQfB!xDjswlzSvtxh6pQ;BbGHcPWo9xq( z=x-B(+$u{V=|O?adT0=oC!?M>VUs}t zN*!I}<6XDT4350lHF&1`)@0vAlz=rv8y{P=@bm3AN*6kAY~N9`6F%E$owl63FflZ1 z2znlxHoRH#s^4y>Ew|;R_3QRu!H{;l{oK~h9c|6$SCU|R>Ve^~tB~zTKhk-TV;q8? zgpAA`enR;%hV-@!G><)h{^GmmnqP0vtT@~{%se|nz>VYi0534%MHal+iEnh{B?EYw z1%HOkHs_ywx*{VY7TN}5IlWWsulJ2lTpGD@3^T~oDRWQLiKo?_3$$4_(FcX%A3Ba3 zU;T8)aYyM2wn^oy*d5zi`^w(X*w)tIlxDbg^44fy{go>tlUYkHI|jFn1c5U_ik6hcN;HY!@9@9isZ;LyFSV4bLd#2 zrkbc|DHlxu$qiEcB`b~>>E?BL!!NXj714aZSwm<)x~x~f%z-ZJuXa>gh*|+rlWwzE zh|LzXs@i6uY$5$p@pY$ojFyzql24|@Qzmgd*&RNj3L8{~cbVrE(Jv&w(~vx%U-~AR z@=imF6;1A+l2lP{ymZl9x9MQpL4n@C%Q1?Bd_*et!a=h}(QE#8!IR#HZZZ<)knZZS6iiNX8u|kgk5lXV-{~IJ9G8(gb;z~ZD!7Wr}HB3J6*8&zSBji_nj_o z91y+udCJd#L8>zx(*$v2%^be8Sxn#QEgjY^UVrBH9RyyK-H$zHfXoy12^^y{0GX6` z-c43f+>*am8Zg@4u#F|?S{IC% zqG&#C2crK;&09n#&uuvv*nfdv^amc)JpV!QH{ZSHao;rmUh_277&{~YW4yLxRAYu$ zq5m$-fc<}}b5MCsYLy~kenlwWO+^XTrZUqaH}tPWj=HGWH<76IvlPJ3HT8a%ur`(JK0&XOZJP6oD1Y zWvC?k^??zwd!ODM5u4u{Ocg>c8Sk4I`E2~XKBywA{z*mnGr9U+H@?@6??N}s6ae$# zXS@u{T*&?J34j$V0>U^K)z>CRg@-l51lNPYqvI=OyQ7pNj%}0qJ(N^f(p6;q9H}|vs|cJ@EMb! z0vo4LWt;|(SVOz~=BS95qFgCp)l(o%L;}o9>G;b`4U8mB#&ZRyuu>EQTH3y*O)^AH zHC(iU=rjp4<8m$FCkb?B=yJ`xQP-S9%|w!gzu_7p{;7Fg4U6WwAWbQLy=PH$bi<~^ z=wl6!|45lgHRXZEAiGt`no(W|&J40iRs_-x29)^tO3@PRIKWSebfiP6gvqW$#BI*N zSjqFQz!UCU1H|c3Aq1Yl*EsMl;ATbxU)S)J*{pV^i4}dJ5x+0gr!=7_yGY5zz;U|U zw=@aTtoQ>rk`;~8B^vS^fQd%cz=vBcA`eWo5~VX~m`Mjo^l11XpxgvOlOM3t;3Yrs zgzCcN5jYOtJ3G+u@RvHT-r@^dtARoDrShn5q3_XNy}N5la+^9%-$j>+VnyGWBpf}c zpAFF;Y)*`;t=}5=rjn1VEXJ*v00KY;Rftl8pdu= z3b@^{UJi6G=Qc7n2W18`JWPjbkf2)%=|-m81KLXQ6cSM9NW-PaINiu%K5o9leUB8nt5N>J=EB2``Khb9;CM%`iD;!&k;kl^Xg|B zdh$Vy$-A=Gu0`R`&RBX;MoXOJUh3@8DhjlUge>dd`EscuLE6-lr6|l&EH*3Fa|xpYVZuy% m0GbD{7Qku!6ZpbAZ;94Tm4EwIBTe literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl new file mode 100644 index 0000000..7ad806c --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.hlsl @@ -0,0 +1,115 @@ +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; + +RWByteAddressBuffer _78 : register(u0, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_scratch[256]; + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _18; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TagMonoid _82; + _82.trans_ix = _78.Load(ix * 20 + 0); + _82.linewidth_ix = _78.Load(ix * 20 + 4); + _82.pathseg_ix = _78.Load(ix * 20 + 8); + _82.path_ix = _78.Load(ix * 20 + 12); + _82.pathseg_offset = _78.Load(ix * 20 + 16); + TagMonoid local[8]; + local[0].trans_ix = _82.trans_ix; + local[0].linewidth_ix = _82.linewidth_ix; + local[0].pathseg_ix = _82.pathseg_ix; + local[0].path_ix = _82.path_ix; + local[0].pathseg_offset = _82.pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + TagMonoid param = local[i - 1u]; + TagMonoid _115; + _115.trans_ix = _78.Load((ix + i) * 20 + 0); + _115.linewidth_ix = _78.Load((ix + i) * 20 + 4); + _115.pathseg_ix = _78.Load((ix + i) * 20 + 8); + _115.path_ix = _78.Load((ix + i) * 20 + 12); + _115.pathseg_offset = _78.Load((ix + i) * 20 + 16); + param_1.trans_ix = _115.trans_ix; + param_1.linewidth_ix = _115.linewidth_ix; + param_1.pathseg_ix = _115.pathseg_ix; + param_1.path_ix = _115.path_ix; + param_1.pathseg_offset = _115.pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _210 = ix + i_2; + _78.Store(_210 * 20 + 0, m.trans_ix); + _78.Store(_210 * 20 + 4, m.linewidth_ix); + _78.Store(_210 * 20 + 8, m.pathseg_ix); + _78.Store(_210 * 20 + 12, m.path_ix); + _78.Store(_210 * 20 + 16, m.pathseg_offset); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl new file mode 100644 index 0000000..65e3741 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.msl @@ -0,0 +1,146 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct DataBuf +{ + TagMonoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup TagMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].trans_ix = _78.data[ix].trans_ix; + local[0].linewidth_ix = _78.data[ix].linewidth_ix; + local[0].pathseg_ix = _78.data[ix].pathseg_ix; + local[0].path_ix = _78.data[ix].path_ix; + local[0].pathseg_offset = _78.data[ix].pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _109 = ix + i; + TagMonoid param = local[i - 1u]; + param_1.trans_ix = _78.data[_109].trans_ix; + param_1.linewidth_ix = _78.data[_109].linewidth_ix; + param_1.pathseg_ix = _78.data[_109].pathseg_ix; + param_1.path_ix = _78.data[_109].path_ix; + param_1.pathseg_offset = _78.data[_109].pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _210 = ix + i_2; + _78.data[_210].trans_ix = m.trans_ix; + _78.data[_210].linewidth_ix = m.linewidth_ix; + _78.data[_210].pathseg_ix = m.pathseg_ix; + _78.data[_210].path_ix = m.path_ix; + _78.data[_210].pathseg_offset = m.pathseg_offset; + } +} + diff --git a/piet-gpu/shader/gen/pathtag_root.spv b/piet-gpu/shader/gen/pathtag_root.spv new file mode 100644 index 0000000000000000000000000000000000000000..3783b49cb1351c02c3e5aa12cbfd4f26c93e4936 GIT binary patch literal 5836 zcmai%iFaL96^Cz|p(%yZBDF2J#xcTfejaZy@v;vJeFtH$*d;%XC>`t)!ypo@s&9=g~S2g@!mk6zrEXw-(m-TcV; zS$9uuI3Jty&E~k@$#Oo{td6!k-D!P0Auk#0xQ~c!jEpp^c}8?kwh%rqkCw(;8!FX? zs}p%G-@6_f{|b7UWtGPGE|30L-1=rdHC@U3=1$T#y{Y(b)7JkR&|1GEI{{rzcrkic z!pqQ=gwIFY75Cc@9<7%K>Wy9H`ryRwMx~tB8WV%tTd|$lAi54&Z`obhi_vqt1lw;p zpZ0#sKUu%^JyE~q+xso|#QpBfhVX4K=jCnFBkZKBWkY4daOy5ryF|bvejU2j!kw9e zJXxM9k0<+0kFz_nec0vEQS!R78^O)7QnNBu&MRZHz1t5*?}ltoTkBf2-iJSG9RxS> zvFa4sIldiTtbOQlc1LzM_Ecj}Yn>VJ>^dFUVQ}pC2)MY$(Skid4exdlvJ$zFmF%O> z+F3Qnc}H^gIO4n;xt;>&y~rJ3;LMR*f>`q+bT@J`;#%I1k1;+2+<}~H8SmYkj`)PV zX4Y)k_SiF=n*5I@`xNw3m>~DLhSgk8yo@!i-;4OHO7Fc~yO`(s{4Vs8XC1M!hP``E zvAl9dTD$eF!`?u){*w^7c5yFHh3`e?A?9l@M|`Zg0__^^E&Qv&#rSpDo-O=+V9y)& zX0Z3Loc~{db}jv`|3b8D$!|?|f8z9SOE~JQcOdp7dgHuV$M~C)uekoLaPBke-3IO88ipXYOC`?npDzm^#77bVU)JfE)FBe{s* z+2UQNyEQ+TVmnj5Yx<0Ho?eRRv$lJ^46(n)U7BiL4mQrmIM4Vp#8~63uWel9yc}!} zqcE5A3a~k=5p%SSi=0=2&2b*0&a1%YtV7JvHZF2r4K~NQiJaGf&GEZnj<#`;a|PHO zXD)JH3pU4ZhdJ8DiJdQJ^_P}>aJLYK$9P{LQ<~UDHL|^nePaJBl*VAC%$(S8&<05A_*ma7tvj^Lpm>q57 zBIio5ImOx8i)~KKj<#`;^JcI)#o2iawmC67+QvoBTfycOXXkC$=EUr18@Gjb&py5# zIT5jk#zoKH0k-Gfu|3u{);V&foCS5<<5l3|JzkA17d_D@SDaI4D(YPew%#>J-0?fX z^48yn);4|{`d!F=WGNDRcz5C>|2^2|nR65Ry~vG-ajvOtjM(q2Yq`JoqdlM6_vnM@ z4Mj$5{JTv~n8~XZr}a{cInFlaG2I16%Kb z^xk|NTtejUg!5hg1Y(}Pe{jYFXdmBeeV;@QAnr#T^*;q}um5Q{`KbRHaIyYpv8^v3 z@Bin(=IM(&_&nG<&=>dg1+cvLa1gB>zAu91^xclu4&NPM?^oX;w06w(m%z@oIr@w- zR^B^2fcD(#C%cC`@wNBi%W(3shr7V88|V89*z*}@O>JYW=?qy@ZGJcUtB8G!@88!F zw;FA}KDpvtIz#Py-oknd;J=P6M9jTA*>SJmz-~X!H{s;(NHg;-u-sC_nYkD3G z*&m=EKz#K75UpPwz5fw-28nm}$6&b~i09S5A8{>n#gY3H@JzDp`%e-3?Y@pA+cPc# z{|p&K%!xDpJn_aK1#8FbJ_vSpT}PiWhtcv;{}(O(G5eR;Gf2$QLtta%qvpe4*NNYg zzXD%^EJuuw-y^>U8|&j49zp*GG2U4BqitN|{1$A^JS1{{2R0{WS=+eC`90X2`AFpa z0c_4X#5&r>Mb00==5!#D^Cz%5eTX^Q#zoGb!RB-#k@FX@Ihzr4w2gCCoEz_39sNEA zo@~ literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil new file mode 100644 index 0000000000000000000000000000000000000000..7759910ca527b4ff12fbe9ba3ee03fdb6a428ca7 GIT binary patch literal 5132 zcmeHL4NwzT9)FwN>~2U18v)&zKsWMHRBD%~d=$Or18G!jW1$L4uSrCT9iZW(6x8c& zLQJA^HP}?+M|(kQ%Uo>_Ew$3Iy#%5KiykO#p`{iq_;Ky2*RvISJ@4KkIK9r?+|AtF zOz);|=D+*@{Jr=7?+w2VS@}AB*8VRVZXK%3YtBCyANWvt9034Gq5vS^djXUjD1}f; zpfcGmak#V}vcPQQL_v#wm5icElk`5R&p(Y^#N%7YTIR#kvrsj!Gx#rD1A| z+u(^}^x5tMK(5w}7^EIqT*#AKikbWaG=>=J>TK8ix2b5E?;qxbqsXq5*IQ zw~X;B;z^}Lyb{4-c>pYiHGE$p?Is% z)Rdft2C*ZQL-~SAgpT(E)=f$3$p9-b3hZ0d0BGKmXyj8E)d7CzA+HU9$_pfAzapRW z%l25^S9-sEszA703p9L(OWt00^gGc8vg<14EQt@Ti_e12sC|NX?qG)9Pfvjz%pO?Q zG5S!y-f(M)VRp*R&a(Y{VbXEj05pZm$#EFqK4#-OUYfbj(e+u&<{qbNM*O<{TkLKl zFxKv#x5&%hoPEq3*%YqNGMved7Rs7{zv5W{ZztS$RAs`} zs%-*HGhqOIEKYKhhI4_G8oz10LsFN>0DK}uf0eHpzni?xj&yN=nTVJ=I}k)OVFWzy zp#sx{!6n1_BcP#<_0e)!j;BGC_`nHFFDM2}OqicOxsMEM0-`W#U(`uI8@9^dx@5nB zR3QGt8uwm=sTiNT4&Jv75Mw%;#Qn3&4 z8Ha(%ZjvataI|r)LQvMhniQLlw9I=;q?2!q#JJrI_`r!soMu#8`xsrC)FM3^;*7q% zh56^EeL7J1$dh-{D8c+2uqqwcv^rx#=e@sUlVl&!W z)omgHCzCxVu2Se{@7coudHCETZ?SJ@OkG!ARb3T{Y2q3$WT&NRLF$a^wY86xK4~t3 z3s|FD>AW%jfg6b{Pwbm>>R07(_tlDIbrL!^9i3NuMic%a7ydC9J~$LU#7IX>(n~|H zqM6O@CyGD1uswSP1WX`*;~8%2f4=L~fp^-LiQI`rwR39Br8R3SbM5=2@)zG%uA&Qe z6`oZT$UpIdnR@5N87ot7thhUEMQU0?%dE!Xww}@8^TM{acX~JVLWon#L`Uv}7;DR` zE9;C;t}Cyu$`zzs(m5}xzgclHdu3u8EJFc*=F8!k&m}o!XjoYsg)DUI(;XcRI*zET zC8~TCk_#Z>kCgN}C!H(N#kL1yPB$rwS>bqEy`mdk*r{LGfG+H=ZK$>q8%Uzg;IdkY z^;Wc|)@5Z}@@|>*OOJGzjVNa$KEEctW|YpQLNVu5%6?T$yE(R)-JE%>KC?%k(TXlO zR=>c3W_DkTsxc88N{F-)S0#LMF0037H4!yQM9r8G!amKp9wS{9w5!tRs`J^BuUwN} z8Ipb-kbcRcmd+;SNRzUA{5fb=o>VDAwwRM<{6mjEqf?*Z8-E_{NcZZ~-Dsw_<6b!F zclOLaG_$2c5>gQxt?2qBR}JTq_v)o%L-+_Iy*Pw_LrK31NQXTUSB(+Z*?LR_W#! zzWe8~zzFMZQH6gy)>)>|&E-7^wTP4rAoIrcGeCAB?yWLOpY`VH&xyR{)bh)qe%s)Y z`FS0qncMC%F4AUwIg;goo_h?u=iV%gm8p6pVsqG8zC;n~Q%QOa?z4s!O!&^L3vN}z z1UHW$E8_F-kC7pg^GvE~#JCMQJ$ttC99#+sk>})3AZvV|lk=c|l9SKC*#8A5@BQEBXFQ29s7k`9WS)1{DJHlF)<@XXM;OD6{}X-@RCkI>5wycs<7yiM|sBVPxfr!>ME ze{>%*v8bJ4>wAQ+@WE;EtA(+Xi+XK{FjEKH;nanm(gZ(j0tuTCIBrk!YOjUu7a>;d z;SWNSAS=Z$d+=Z~+*v@_wd!;HD|reFC=4B)c8fU*m^J4$Ka0b02;g_uCt=e-kvOLF z@lXJ0D2ca{*)$~Ja@x?Fw>xIDOJAW1K~FN0VnmE+35iC7d2T0bL|AkkZ>W9F$^P6o zVvp^!XYlj7g9vOUdzBMAGa6_m-)}VH2t`gT{E+_MF=S422i(v^bZ*2kPjkma*>%6z z##*d=({mP%+1sUYIKl6?AH5groXEKWq+2s5it8iZrRdan6&oaNLOZ#FZ3r#!x3?fN z5@nYfC3ygo#h}sLda5_YV&YMsy&rzd_btTI1_NRI87s4muQ8!8r?=C^2~2s`Kynan zm&EfXnvgr?GAHf89aAKn>K3RUb`He^OgoU7_F;TgI>GwZ8hn0t)lpf!5NKJS@w~E9 z6|b|zXV0Kh()rGd6GGZ)2FpAZm_-xK;I7s%)Tt0?C3Apfhqk~mQD-_I(>|ml50zvWZ6H@MKajj?e0{k+@LW~`tzQB1xlr5nE7UZ^y8k1b! z+ReQV;7hRLvp|#9H|zw7aIwQ}b{UGnI?9aFsI^-jZf2V zu|G`IanX`ub0i`RG!NZXzC$y;88{3N`ooy!r>vsa(z(sA2Bb%vJ2_cB4gs%)_9a|V zkz!SH`;6r4O+%){*XJUs1^UW30O>5J~3l7~hI9v&bbK?(}%GLUG}sD0>?TZ_W0Ra@Q| z87FH!zN&T6>efw*^UPD(L5eA2-!~QnlY^U(@&CuI#`DQe)BAogA}^Ic19y5)$6~Cg OWo=+H$A5m@8Gi?G?1dQs literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl new file mode 100644 index 0000000..73e0a8e --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -0,0 +1,264 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _70 : register(u0, space0); +ByteAddressBuffer _181 : register(t1, space0); +ByteAddressBuffer _257 : register(t2, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared uint sh_tile_count[256]; +groupshared MallocResult sh_tile_alloc; + +float4 load_draw_bbox(uint draw_ix) +{ + uint base = (_181.Load(64) >> uint(2)) + (4u * draw_ix); + float x0 = asfloat(_70.Load(base * 4 + 8)); + float y0 = asfloat(_70.Load((base + 1u) * 4 + 8)); + float x1 = asfloat(_70.Load((base + 2u) * 4 + 8)); + float y1 = asfloat(_70.Load((base + 3u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _76; + _70.InterlockedAdd(0, size, _76); + uint offset = _76; + uint _83; + _70.GetDimensions(_83); + _83 = (_83 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_83) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _105; + _70.InterlockedMax(4, 1u, _105); + return r; + } + return r; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _131 = { a.offset + offset }; + return _131; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _70.Store(offset * 4 + 8, val); +} + +void Path_write(Alloc a, PathRef ref, Path s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef _241 = { _181.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _241; + uint drawtag_base = _181.Load(100) >> uint(2); + uint drawtag = 0u; + if (element_ix < _181.Load(0)) + { + drawtag = _257.Load((drawtag_base + element_ix) * 4 + 0); + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if ((drawtag != 0u) && (drawtag != 37u)) + { + uint param = element_ix; + float4 bbox = load_draw_bbox(param); + x0 = int(floor(bbox.x * 0.0625f)); + y0 = int(floor(bbox.y * 0.0625f)); + x1 = int(ceil(bbox.z * 0.0625f)); + y1 = int(ceil(bbox.w * 0.0625f)); + } + x0 = clamp(x0, 0, int(_181.Load(8))); + y0 = clamp(y0, 0, int(_181.Load(12))); + x1 = clamp(x1, 0, int(_181.Load(8))); + y1 = clamp(y1, 0, int(_181.Load(12))); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= (1u << i)) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_1 = total_tile_count * 8u; + MallocResult _392 = malloc(param_1); + sh_tile_alloc = _392; + } + GroupMemoryBarrierWithGroupSync(); + MallocResult alloc_start = sh_tile_alloc; + bool _403; + if (!alloc_start.failed) + { + _403 = _70.Load(4) != 0u; + } + else + { + _403 = alloc_start.failed; + } + if (_403) + { + return; + } + if (element_ix < _181.Load(0)) + { + uint _416; + if (th_ix > 0u) + { + _416 = sh_tile_count[th_ix - 1u]; + } + else + { + _416 = 0u; + } + uint tile_subix = _416; + Alloc param_2 = alloc_start.alloc; + uint param_3 = 8u * tile_subix; + uint param_4 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_2, param_3, param_4); + TileRef _438 = { tiles_alloc.offset }; + path.tiles = _438; + Alloc _444; + _444.offset = _181.Load(16); + Alloc param_5; + param_5.offset = _444.offset; + PathRef param_6 = path_ref; + Path param_7 = path; + Path_write(param_5, param_6, param_7); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_8 = alloc_start.alloc; + uint param_9 = start_ix + i_1; + uint param_10 = 0u; + write_mem(param_8, param_9, param_10); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl new file mode 100644 index 0000000..961be50 --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -0,0 +1,273 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_70, constant uint& v_70BufferSize, const device ConfigBuf& v_181) +{ + uint base = (v_181.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); + float x0 = as_type(v_70.memory[base]); + float y0 = as_type(v_70.memory[base + 1u]); + float x1 = as_type(v_70.memory[base + 2u]); + float y1 = as_type(v_70.memory[base + 3u]); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_70, constant uint& v_70BufferSize) +{ + uint _76 = atomic_fetch_add_explicit((device atomic_uint*)&v_70.mem_offset, size, memory_order_relaxed); + uint offset = _76; + MallocResult r; + r.failed = (offset + size) > uint(int((v_70BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _105 = atomic_fetch_max_explicit((device atomic_uint*)&v_70.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_70, constant uint& v_70BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_70.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_70, constant uint& v_70BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2, v_70, v_70BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5, v_70, v_70BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8, v_70, v_70BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_70 [[buffer(0)]], const device ConfigBuf& v_181 [[buffer(1)]], const device SceneBuf& _257 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + threadgroup uint sh_tile_count[256]; + threadgroup MallocResult sh_tile_alloc; + constant uint& v_70BufferSize = spvBufferSizeConstants[0]; + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef path_ref = PathRef{ v_181.conf.tile_alloc.offset + (element_ix * 12u) }; + uint drawtag_base = v_181.conf.drawtag_offset >> uint(2); + uint drawtag = 0u; + if (element_ix < v_181.conf.n_elements) + { + drawtag = _257.scene[drawtag_base + element_ix]; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if ((drawtag != 0u) && (drawtag != 37u)) + { + uint param = element_ix; + float4 bbox = load_draw_bbox(param, v_70, v_70BufferSize, v_181); + x0 = int(floor(bbox.x * 0.0625)); + y0 = int(floor(bbox.y * 0.0625)); + x1 = int(ceil(bbox.z * 0.0625)); + y1 = int(ceil(bbox.w * 0.0625)); + } + x0 = clamp(x0, 0, int(v_181.conf.width_in_tiles)); + y0 = clamp(y0, 0, int(v_181.conf.height_in_tiles)); + x1 = clamp(x1, 0, int(v_181.conf.width_in_tiles)); + y1 = clamp(y1, 0, int(v_181.conf.height_in_tiles)); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= (1u << i)) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_1 = total_tile_count * 8u; + MallocResult _392 = malloc(param_1, v_70, v_70BufferSize); + sh_tile_alloc = _392; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + MallocResult alloc_start = sh_tile_alloc; + bool _403; + if (!alloc_start.failed) + { + _403 = v_70.mem_error != 0u; + } + else + { + _403 = alloc_start.failed; + } + if (_403) + { + return; + } + if (element_ix < v_181.conf.n_elements) + { + uint _416; + if (th_ix > 0u) + { + _416 = sh_tile_count[th_ix - 1u]; + } + else + { + _416 = 0u; + } + uint tile_subix = _416; + Alloc param_2 = alloc_start.alloc; + uint param_3 = 8u * tile_subix; + uint param_4 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_2, param_3, param_4); + path.tiles = TileRef{ tiles_alloc.offset }; + Alloc param_5; + param_5.offset = v_181.conf.tile_alloc.offset; + PathRef param_6 = path_ref; + Path param_7 = path; + Path_write(param_5, param_6, param_7, v_70, v_70BufferSize); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_8 = alloc_start.alloc; + uint param_9 = start_ix + i_1; + uint param_10 = 0u; + write_mem(param_8, param_9, param_10, v_70, v_70BufferSize); + } +} + diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv new file mode 100644 index 0000000000000000000000000000000000000000..dbc02a8e5b33b11e2aa77b3e30bc20d732eaf669 GIT binary patch literal 13360 zcmbW637A}Em4pAZP-l0kg3QBC)9MuA~b()tT<8>ZJ?THCJo7wr>BIkh-}(P@ z{{7zWw2eKeGt0(gf+#SCP9t&RW2Qf`8j$lSfK+`VK;_rg=ly{lH0ha2k# zs^x)Ny;|<84OM#^wbfPYjZ5m)>b(OaeQAYuYNUOTk0-~xfq}tZ$_{!d_ZeKie7M@k zP&(;sf#zCGI=zH(bfnmHz-jpiJm?E`NNj`a4I$vkzq zF*MTKm_E<6`DGm~u2`DLceC$y`r`FRe8C&#ErZ*HZ42wwVS$ zhv|$ua-X!$5pd(EyC>v6Jw1bKEkko{8pIl=Y|kcTM^oCaoKMfS#R{F|)LzOjUd+TOx&fRhN za=ErPZ+ARA=5;A5*{WQtkm7lG1sT`=o1eFZ|@e`tsU}=32;o>gDP{b!D~Q zVDViGb=RU^Ud0X^u2LPfGuK^Hf;F3rTeS375$A-XT&SvvwHL58_m;> z-b1K3?@_-4eO04(ARLi{mj`Q<_bOc5`aBL_rCuKlN5t9zy)jg&4;PMz<^61|e0y*6 zjmyV50iSz(#H;>5!CnC z=h2({dj80Auwzp5>1DrLdRN>h;1NGext7aM{5W`BfuB_8B2<4`J;$Z&S#aFh-Zjs| zJp^9j?aawm$`u3Ug@e77fz#`&DH^rG`sw*~XCGg{U!1bMk!}2Dp-uO|V9#GQvdv5Q z^Zn@s&Eq@&*OW0o>G}WNs7}51>@V=Md#m+ob4v0#QS%vAr*w>CJm>`;2aY+}0UUEO zUcH!;3F^h1><&)nWDof0{aMQ9=e+suvrmiR`5n+ZI8x`bbvzfK4fp3aNx6{GJF~KN z2OE`v?Ol1FEBI=8pF6UvlJ;?jxIWjQkM^ZKyB41N`QEDxHHg!mtw+oIGd$AMyyx4q zI|?oC3S-^9U93{Jv4!8;!XIeikF@Z|w&U&DW_WX~%^9$7Pom{LZEmrA+)tr3$6d;v z+b%{a`&kSBdBVFH!^zB72j)F|8qxBN3HF=FCp4PnT3XLL-+JD!-Rv3Lw&l2#zl#q9 z`N?V{R;PnkU{nty*ht>!rEvqnf#G>m3}L{Z;dx49$0ons-}h&VibDP-wnW z)b>j>$EwzqXue0(4o)=RA!<_-&3A{I-+Gb5zN&G}Y}0&KsLe_=-w|qa63zF5+N%=H zajLy0(RM8~*C+DsR%oXsn)knL7bco>q_#BCy#LkCNi^?$weu6ryI$?WM00-CE=n}- zcD24l^G;Xm&$ZIFdpp)A?s4n*xc}7Ja$L&q^mPAu$GiXBOYX<5l&;6V1^3RCyVm;u zOgVZkO8LEsv~7RHotfrWi~M6ql;)S4e=L^ghnvr_PefIpLTlTL_n;5v@4|e#;O<9F zeRpcP7uDT=>hj4%aPL`X3ja5UJla}!GHu)w>b98zcR#3E&zP>K*g8`G@ceXP#-B|oH@@RNmQu|)&b7Z|c41?%X$ALO`Rq@ulPT@{GLYwGstR7(5c z89Sli3kpqtcY$qVe9JSi$R~d-=41Uul=jUviT*4BM}7NyF6MX+qkpc=AQp8m1>2wC z*MYqoBKM7n-`wjl_nvdVky33kBllhvr>6h^V`+?+!{+xKo7Z{x0;S_nb6(}1AOj>pw*4yfxRpls!{$*V=ush0->T)%w4n9D{Aaj4$7cspy|JEY)_Zy$J;U2$UMJq8}* zT3EDio}1==JJ0&qe#AeY(lM!-@AZ_a=(XDUTNWe?}pzy#qZtVmVWD|{HBC^fBW4Ve!1Vg!7csfO}XE^!L9E% zZ*b?=Z{FaJ@3Dld`^_7EOTTwh?ze7m5CapT{onEdM>``Sz|okv-vl z{s&C`BzBDZLmu`25&R;S*8eBCy7k>Z@_A(Gru=8j`^k65xkVXy{spc-_`iZ}e-(YR zAHTxXyicwuw0{TNPTx}cJC*W3Fm?00U*+?NVE+HYBER=gB;EnLm_gmK*G zYGU_zI^N&I?Puip12}Rx&dBjcu)cf9;l1$Rm~GX6N8CSA`uve`==&c`%`+;FIAd8e z`xCqkZvL2)65KlK(WiE>_ow~QA8k9p`eI!2Kael-TI%y0+g9uu4($02?AaAZ>>c6G z)rMlcJHge`xqC%Ron7E+-sQIy?REv*R-f}UkxaXEAptd{1Q0_P{sr9W~V2-X*CC68POfz9RjP2@Tlt{%A# z0js6C4u$iRe@p9+T(1P{Gnaco9=WE1&9y$sbr@Vdavcsob>o zP3~TewYPLk&i8b%dp7p}3~-#uON;(^ryPl{@3NxZQDC(lnN{P=1oM-BGaJ+WZJk(0 z{gLx%uxBH3&H_iyV~Tp_oDJ3&IgbUaMb6{E{Ny=}X-@0Jdg_mybHL7fXVU*!47)b9f3^E&Qi~o%`@F0IP++8|*xV|1_{#_)iBrrtqHuR$Iz@>u~bD z7Hm%A+E%~&d?Do`Y(8fH?YrFi5qk;P+`*TE9m|!pcaNP3wvKw#ISXumf}ag`e9`6{ zuyxdZKN{y;u(9;TI9>-&;myeDg5wSLSyU*3xu%h=*A=-z4C z#qfxA2{;|&0NmJ{8KZNs670RJem9!$uCN4*mnA2-bcXByS|9M8mu0C4cIx4cm7(in)PBm)STn!tL4kh(>i=<-`@l` ze)Q>5uzj+g@#WSrzU%2+SVsRY13UKU- zJlg)9_-td_YboW<;STuT1y*wo?4Nt>-Ej5W3;rIknqzz)rH^A&e=nxy7{$i*oAx^J z0?LT%zIs2JdVG6*0IU}0GU8bNhC9x_Tn{$4pAH>w`hxkv~m>jFc zynh({0&U_Q@ez2uBfKZ0-bc~(#aX`*Y+H46f1J|C-10zKFyn|L&Zoh~iSv02T+MSEXW>@3XF*@I`wUp!b-#&H?w&EPIm{Kl zScC9cMx5KhwzuwjN_qJ11gphf-T>~$)T7Q_;Kf+f`7GEv>h|B?2|ovRAI3V2N4p2E zZr-~o<>HN$<`LgZX}r1Y7u$Rud?=;$t*765cTnmxUnl-AfYluI3Z{c^Lix7Vm4h^^7r< zdXIu%qI4{efaUu9d+H{z{nGasrQG@O?<$XjkES%X?ex0`V!vA2hh>ys#oR+rV6k>z zEA&{qufx@Rro9OyB6?5<{u$te4#(5H~7J0u7cF*g33M{wZ zPgA;gEq!-?m-2g#BWB+a%c&7sUF7e;H?1z7| z_HTEN{dmka(XSK1wvT;(64*Mi@8#Bsd3tr>|1)vL(Kp{~z7u_R!=gVE!S-Pf%J}=? ze6W3xFQQzWaP?&^yqs{yRVlcA?gpQREx^+KeLCFSFOkVJz7VtC8JKIR-+8w$`eHto zg4Keb2{umbr?bG;Q9q?PJ7wd+VAXhFm=zV{B?!qJrv*F=M{RqkIsi1U)`}e z*4JasPuy{G>xKV<#BZN{N5276kMD&G3oXX4TouRiBpE_Tk16Zd)# z*u7|b>*=>o?y+9XKF1!CTR-~TSNI)M^tlRF_dUFV(#Nx}z8q6?ABm0Qy7yzwyK#P1 ztaGi{(;0Vj=k-i?t}SE!ra@AbFTt-QtpizZ(mB|4^l>Z z_eB@xvmX}i_b;&RCnxv-u(764#kIXJa61iqrKy8tyX#i#{G%;KL}7D)4m5nV5Y%hBDUkYH+Ntd+YjytKZPVKa_Cm z-<)vk-_pWwO}O>%NVxSkwD7wM9&>mN+%cqc_zt*wd_!CdR&z~VPsgMdbNo)QTFmjg zz-r#d@xAhHu=8Uc^=SJZu)e7GUU1YqBdPa3u)fW#eFx?HF}3TkCkpKYVB_gCzH9qI z%=NP0&b?ghyxUK)b8g#s|9=?Vg<0Qv`lG)e0jFdAC|u3H#uz>ZjxpQ_jx+7y zNb0Rm>U|QdFUEQkSS=mvr{KoB35&6Q8XRMF-^d-Gd&Yi>-7mI{v3>^Jg<0Qv`lG+M zfzz?x4p)mYIlp&ek<+~=H_rx2>&I9>3wBPHVCi1F8?Mhb?oE00?Q`HR%(%wWpWY#} z;d3ybV=>QXXI?hX;aqT>!+XH`~4hHS;*%@~Ha_ zaGK|va5eW@jOSb6$fG~nJ_XhndAJ&dE6gz>pG`D#GK3EKLR`E;6Dc2x8To#jp5&%-Pb?CjA1+L>Nk)3PM__p_ft&G KcbC|A*Z%@7(!8|* literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil new file mode 100644 index 0000000000000000000000000000000000000000..f9f31e6ea1417ccaabda3cd5ad47c1b5642a4e85 GIT binary patch literal 5664 zcmeHLeN~{t_2wpo zB*mZs(W16C_|Z}8d*as<)cO*@ii#GLzQ}9ZrKw|<4fC2WrtZXG5UbO7I1*$2`j&Hnz4SCCzN1$H$V#a?K{v9^X%PROTb}ekj zVV6#hZ~*9J_h$iYtetIB{;%pWXk+`7p55EaZAw2ChqdM9Yw{`RXapS44Fl^7vUAxN zP|)^_06?qr+>`SGxBzwj7TUKaF|={b5sjK2Je#26QkxJfKaxWRh2qL+qc2 zt45CqY@@7*dB82A_!%oAE@nNZdCrPpApqHM4!c*e#i*roNgEGA!22`+bD=ISd?Als zQ-|*iKV*fE`_@*MBrvz$>2E!hOI5`Ko&yjfiyYy3=uV{*G`n4UB0y~%ZH?a5O6%X< z4q7X}FpZ0UqME|c8*NmyX%ntdg{~(x^~r({epwhGA6LkK+dUF-uEEbrX%DwNlIzbz zbLK<&NaIX0C3kOlPRnz_KC)aIu5C&>7VRK8jKQ|v1^c9Fp2nvI~ePXvIGzk%ZFNdR>Fagvz=wse2WS1PQ6UmZ>ZSW3Po9W$r_ ztS4Vnj+|rwJqc)kwk}1m$Az{?gG9M5CogKj=eX{Bj@E=T2A&g0bfV_WAc+rn=pQLuj9d56^bMkrP9kwlVc7-?lOnTW z@+k7nE5nK_Yb{K=7{~9psB2oKP6uXPTC>5FG^=)u>guDw`zQ9u_VLib@`cjK@I$R~ zMo^0_7UMVXkN!#=uVHoz{Fki6RSAFk6(U2+FiiQ4k+Qhpay2l<}dl*HMnbEng z&x`$W+YjZj&L6fllx>I0&bLl@KDznv&Xl9$^6Ux6CuP5z@>`wW4IOV?8*wtFXCtl6n3 zUh=`itmJb@8kVpSODw>WO0ncBj3;!ZxN1|~wXe0+rg>`9)lb3E&d=BW>0wjWa_AmO z9&-(M4DPym2i{$rj*idz>-u4c8;f}tUxP79Hf*k_HoUiP!{$wSmbgb+ z+qZE)E&n!aMO1QfWFLEV0JcV!z%ia=Lb``T)=H514qb}1%c*6!)m7Y0EgSg{LBJ1` z@F^pV=W9bdy)$pNhOAX`d~UU}UTl7kZoU(n-&^I}Y~t3C-0FpPlZm^wDfU47nx`={C>2R!IEOrAzJ6rMCELT`GLnDZIPZcY>-BSrAwK$t z)SuJBu6I3E1(P3YYL^{Lf4qO+*JDYPbo1ral`EVdZrG=ZUw!GHo*`5wDGEXpeKNc= z^I!7}MVu?=eQ6rMd7qoVgc7BtiiV+vdyCchi;!GBnb4&jR41@gvPKL$Ql9mcuQL1K zZIQ@B3o|Od%5n)l_%Z!2oAtQTQ9H;q>gGdblv)XH!A-!nP?NXNcQHeG|3_DX?`|)- z^mgjmxdS(Y5?PMy5lnDq1+1i(IkOGwr#N#RwEb0_DV^ZV?iZX{^j|nr6!8*gs{U^| zlg(RPxWmVp;t0r@;)qGkbVAPT&ZS&lz8Vb&wWQa@;8UbY7jFmrYYa_BmPAw5-KnT` zZ;kEx$8C1kbE?!mMf7mwIpOVE5~I?qB}~xbyhi>TtDjC>pMSj1V;URU2<9YnL-DdedovlI$T1Z zxglb-;b26h?vZ$CRQdmtLyg8K`9ty@>=Bu4uMj1VdO@%J>cEK)EY?#x7OMvA7be@3 zBkwc5PeT?Mdvpi_x9e*j*A*0HCIJ&m!SZAIQ}Qx@`dQr~pv^JjZfU(c_L`gWb1}9c zaBD4a>_gl}A2qXgVJ4jWzvW9vS*JBpzF+Ys8PQEczRWbfLYA^7%}PgMDGBv@Szy*m zvJ}c)L+Smv`E(T7cdAhSNgZvo9f&sS4CVAJe-K65)d%Lr-YB>9%@O1Jy)=E>W|r9m zcx(BeZRmC8`lo*#o1h2(CpKyO9x_pLNB`9an$He*^g||Le;WLaC;rA0|1D1x(g6Gj zPqr9e981AE34nb7{+DT`cyCW%Jb>c0y< zWDa&pX93cpc}lc=K)lmNL=Xg=>gv-Rni6|8XNY)FXs5F?*J@WDREme?z0tk;Dl*>h zWpxq!04IronJ#n1E^nz8Ian! z;Zfk&G!P6+ZP{f$|InxuwjAs(>@7yu-f|24cf_tv)Pjv0jybG=0fQ;$<(UV%? zP%(mh{6H`~8u(t@MR|x374A;9>TXkEms)VoB#c*augh+lqaGOt8Ra2Fnnc%^ z0-JzDgPB6CsEUAUs)!qOd2`5)vin> z%=XuyIez-mwIWZjts`X{d_CEY9O;QYG!V`t@+(`XdtoJPhKW|f-2R3>qf;4>QFEX0 zvYMmz*QQBeZs(Fe7@+aYgt5=!GltKD#)%cTja+{Hk^V-esY|e0O`PU)MESGz9Fx3c z23$PdaDj{lS<^Pl&$3Y6@EHputK^qiXl(cc3lq1>AFz;V=x1T(L3v3qa=fsm;Wi7G z{e%203)>sMgwVFkNQdq0RA*G0&0C#**myw#F{ZM^jNXPBrQ=*XMC4YddZ-vj*}N7= z+s#{hYhdDkf^K}_NHL0piOFXzChry8FXJ@MZj^e=?jH9FyI`4%d?4gi;4mZbl8JqS z$WExr)NQB=z@rnx$Kc<%#iE3uI~r#JWARzSB$%<12~|orkQEqzZIDwgRViUP*K#t} zL?qc{e+i%OG44iEEvVp+?S2K=WL?(>d>!2|lnm?In4lrE#6)3&S-KtkL>hOnk9Us> zCQrrsU-sQkv^-@`ML#Z2K?cRlN0DkG27GUl6}wTl@SKVT%mg^kf`@Jv7=;&9Hraq& z4_~jzw!CQ{W&ED&>h-X!=!nQb18~OZ^)Zg3Wa&nta;_+ zopivH7bt_)Bxo&z*1?IBcR=7fc`MNpH|h!8%x*JQ!(Hm7&p#O$z|jB)AHt5+NWzDok-`<6O+ z$yIWe-L<~Hv;)Eg9Mmnwc=G3``c`_!bm zd1t#{FqN7Q1zCv{BvTJJ6<$3MlG2Y*i7fvZI YbSjd#-=Vd`HuVZvB<7Xq)c-;M183t`AOHXW literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl new file mode 100644 index 0000000..8a3b3d5 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -0,0 +1,234 @@ +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _71 : register(u0, space0); +ByteAddressBuffer _96 : register(t2, space0); +ByteAddressBuffer _278 : register(t1, space0); +ByteAddressBuffer _376 : register(t3, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[256]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _96.Load((ix + 0u) * 4 + 0); + uint raw1 = _96.Load((ix + 1u) * 4 + 0); + uint raw2 = _96.Load((ix + 2u) * 4 + 0); + uint raw3 = _96.Load((ix + 3u) * 4 + 0); + uint raw4 = _96.Load((ix + 4u) * 4 + 0); + uint raw5 = _96.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _85 = { ref.offset + (index * 24u) }; + return _85; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _224; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _71.Store(offset * 4 + 8, val); +} + +void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.mat.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.mat.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.mat.z); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.mat.w); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.translate.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.translate.y); + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _285 = { _278.Load(84) + (ix * 24u) }; + TransformRef ref = _285; + TransformRef param = ref; + Transform agg = Transform_read(param); + Transform local[8]; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + Transform _382; + _382.mat = asfloat(_376.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); + _382.translate = asfloat(_376.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); + row.mat = _382.mat; + row.translate = _382.translate; + } + if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg _422 = { m.mat, m.translate }; + TransformSeg transform = _422; + TransformSegRef _432 = { _278.Load(36) + ((ix + i_2) * 24u) }; + TransformSegRef trans_ref = _432; + Alloc _436; + _436.offset = _278.Load(36); + param_12.offset = _436.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl new file mode 100644 index 0000000..fe45438 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -0,0 +1,287 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct ParentBuf +{ + Transform_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_96.scene[ix + 0u]; + uint raw1 = v_96.scene[ix + 1u]; + uint raw2 = v_96.scene[ix + 2u]; + uint raw3 = v_96.scene[ix + 3u]; + uint raw4 = v_96.scene[ix + 4u]; + uint raw5 = v_96.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_71) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_71.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref, thread const TransformSeg& s, device Memory& v_71) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.mat.x); + write_mem(param, param_1, param_2, v_71); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.mat.y); + write_mem(param_3, param_4, param_5, v_71); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.mat.z); + write_mem(param_6, param_7, param_8, v_71); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.mat.w); + write_mem(param_9, param_10, param_11, v_71); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.translate.x); + write_mem(param_12, param_13, param_14, v_71); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.translate.y); + write_mem(param_15, param_16, param_17, v_71); +} + +kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _376 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_96); + spvUnsafeArray local; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_96); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _379 = gl_WorkGroupID.x - 1u; + row.mat = _376.parent[_379].mat; + row.translate = _376.parent[_379].translate; + } + if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg transform = TransformSeg{ m.mat, m.translate }; + TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) }; + param_12.offset = _278.conf.trans_alloc.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14, v_71); + } +} + diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv new file mode 100644 index 0000000000000000000000000000000000000000..b7390994c2522a13c274a0c50bf0a1c8e2e3139a GIT binary patch literal 12972 zcmbW5d6-;PnZ|F@OTrpRLQs~_O;`dXkc1GD30orAi3Sn~j-Y6Db$2CQ(p?p+D;pvP z3~mU53+f%3d;+`!u(0o2a!j zl*1WMmbIHZhes>pweiIh?bgoW_R_r3(w)nfuid$Pt@XOIBdJ#{V#$x17Uzg;K74np z-mX1woIF$>%lSxS&7BsjE1L)3RUM;lZ?*t@ZL8Xt*w$=~UsKx#&BvndGc}f@vJ>Hx zbu0D8NNrDyXi_G` z67WQ>&&oVtiPdGn(?`xvV5s za|v@*#f(M*7w|_ z=P{g@0l1I&I?DPUad&nDxK-Vq-#PBZSHrgs*BZ6;J15V8n$Ltfr7;E-c#L61Vk{3n z*5NCHn-~7`gVVL#>B-yQ(&0`~9(zA!XRW%j`@oUwgWz=D9xCyJCH`oMA1d+3OZ>?a zf2zcvF7dCG_%}-Y*%JR|iGQoazg^=?`1hvquI&5pVa)ZJ&Bo}=+wA;Zv3;yE zFxDKZj%{r0Y7ST1^=4z^SEI6{+8&+AFO6Q_4(jgD-Sv_7DEDWjT_3CEZ92X#e51Ac_R;o~ zGvfK~v3jMr*hB9@=g@{c*?%wkj`r|aIHLbq>Yc2@HQ8o1zG|b<3`g{F7<$|Lw{S$P z!;4szp`qrUN#E>zoJZgrVY3(aO}vgK`|^D^JXYT^)mLW@eDCG@Pj*E#RBofH0P`E-28;4`1;xpsWMFZIzOd(a^hZKKXa{b?5SPoN7M9>~~Ag)`CJ4F9cV&Z%6OWP64O)(`j(Niul6l zv^x`C&vAElUa9W#5?@*3t4e(JG~Sb~hfj=FCWc$p_Au+w!~FVv$hqHS3Z-M8k7FbL z$me=+vprh#u68U{csiEhX}l{NgSVQysoR@>gVN8-o2?rMTFsq1yjYyiHgzW7n#A>c zLCq(=7vxI${+Rr(>B)8>ZmG7oOs3XC&4;|uJ9E0H(9HP`aGLX-(|A{Q54mrvtzLI_|1{o{9f0Tg`E5x5j_sk)_~vqB*#Dtv?YgrkO8m)$4^ZFx*ZavkZ37Xm zp!8YPDJC_VfRRiop+j?&r*LVG?m{0Kj4$b+iz*GHp=kI*oo+tB&_kHAH ze6@%_2u|b6jc-o&=Wi45y^}EG#XBV8sd-oUlr?3w7;C&M)EsNH?JDYde}-lo>v_k8 z=6gZSdnq)>t`_6958nrBk+Yipsr6ye)^~u~X^H0jueLbR9KYI4 z^x^%kwmQ+g)736YG{>oSS)zG|t6iRG-rZ^&63x3?ZBwE-M`~LV&AVIe+C=ltR=Y0I zyr-e}I)w*)ro!`gle)N9s$7eJ5 zqxYceabm&!%`CTH=REGDh##8m)tqPJ>_l+y^kEy|aUVsV!6S}&szsjORcW5`eqx3H zOK|gYj^=TiJWu_2^^1?A&*&eTEyUx#?P&gT@9bl>KI*zx!#|%0?$dt8Yae3kTc@80 zj?q3sQ#ZbA9r0abpTggFRE`vyi&pw_;7*99~RmpZeAoe4x-=yA9wzYzB5t!8d|!V|&k{*qGtJ3hX0S zzXo&O%_r`0=gRw8Zk!6Gan#I1K8UGBoFTAJ@KJE&Wj_rpt>1>b52OA~VAnPHK5!3a z|94S3udcs-$9*?ueEs)Q>UTfL4;0+JVtva$!?JAOrNRF^=iaBY^0sE4zrCu_u^qQI`iz9%;9i2|8w0rllvZ1(|=@vV{I(wpv|Uq{(Z-(EhKy2 z9pcpV|1y@wcs^|Z-iP*UuE$Zjc504A?wuIx>3SRw_ijH4|Eb_Uu;a1)B1+dsJ=XV9 zaE4j`GD_>KS$`d+dqLg) zv7e0R*nHeSKSFWu+o!P{hw~@*E;}~i-YfoQNd5k92=3hbyCJyo{M``Tc>Z<>u77#L z^{-60{jDnT)d{z~zXKwk_5Cf7@@k0>m$<(L()xafr`+%G;P!VKalV<;C_pTe>VJ3!u9(d9)8>V9WGzTs(G%D!aVCkXcx0)p0lG1eLr>N$H3J* zBbTu1uCdxWyx!X{0DH!~uRU84=Y?Q>!A}6&HsYQLR=b9F`-;9!g4<5t7V_~Odoozv zzTHo9W9sVztLbwu$=4984_`lc5v96*_nw+<^`8z_b1t0wGr;aY$6ukz%FdzL(3cqEHeJR)&kI|R!)pNmW&goBxAwLT~zUZ7Z63^mi@T{=QzEx0iw4 z*XoZHd-GSpw$t}R#&{j2kNN0(Ii_Yl;^_Ys;Bx=3L{pFcZvdzLzY1>u>bDg+y&7yg zea3bTDwu1aFV<t!x*>a z^R2@+eC2uAj;0{MCw)akc4cK{%b8sVAEzW^HwRB&) zhsrsxAzmNx#_=tqwC^#nJl3@dZeZz|*#TF7xY#qV1*=)dJu?CJanI;$VQP*;9I^L; z%du}lQ;+@dI2-vTbjek+=K%-P$(YR*~gxwpd|gJZsfQf|xv%Db@jnC+teI||+UcY@_{PVNSK zPK={3_Kmu85_wzdyN5FF$#;Sslco3MyD8s;`PlZ}!fzS-{k>pg#69^wu-Y8V`I5g2 zGnTQ%(e~HCwvBOp037phA6V`@_zmzu@M)NBqW*`#>W<@nN_ot~17PRDIQp#feoA#? zy5BwwR&&1@(|!37xVm-RUmq>%xWBC9K70^OeG7HYgntZdJ9YaxK&iHlFFL=&J`P@h z+0VhEG?w?pC*b;ne-b=9;SYnIqqt8#1-6cQe6KwMR!_gz9)%lQpWkbz)7N8Qb#r`( zQtr8ZobuCHnxpy5Mf*(Q^9-8rXA2FO&ynBGpF=x@#krMR&lvXmd9eN2?-P`Aea_t% zz|NPxCn@D?n75huo&qnWG`8*ZyAR!y?j_5Zx2M7BJbe+a7WaZaHQUENcTfJ5+hjH7 zZz#Y2j>dfad(?&GG@J6s0y|d!HfrDg?ek}p{v9UTo3nqf^f?BL_Wu1-Z2P$h_V1m> z@;9%)%j|0{Hml&)zqsIzQ~e^$c+Qt~i#}fhw$I=%1v{?z4RR%T0NaRJ*WA>sm(KkHxZlJ+{w>M* z@^_MV8Zz%C?B|cW-trFi;;TTtn&zX5VIfs(Y6NGXI#fF58rlh8n+Ht zi|+~V&evdrm~r(-+Z)09jO!fA!#562<2K=H4J`8A0UpGRt3TSl7Oc;>u7x~&6W}!N zPPm$Xhm5$pz=N1^^+(&?V134QP37U+3r^#{4z6|+7IAL|4`RmEA8lU`)@R&3l=ASs z0i4FY6|S}qi@3Le2QlO7kG5|F>ocxGvKH$2~F=ycF{}3-ccF z9dLGm&!9Y~z>6uDVBRn1QpWvqA9z;5ZTn!t4+B3`;s+CM{YMjS{X-@Gc*3p!WWufg zREa;GaO;09;nx2~i9efg>wh!h*8f(Ce>>sU|4zcK|J@S*Ucuu&`~ciJb$`T~-w#%g z`|yKcwT+nT=^U#?{SSfF;y!!;tQPm-0q}=0*UNs?ZR=V22&T{d)l2!&LeuX$ehhQ{ z?bo%Gi(SVD3Ve{#cJZC`aqxW1`qtAQxqbqi=K4vv+9>87V6G2ik*j*-`l-Sfxjq6{ zi(DTAN3QN8xw*QB>__bWv0dc)X|TCk-+KBZ*Ux~{Tt5p}YhaP<=fIJxdgS^9SYPD& zd9YgK`UP<0>K>Mx-&2(KBX;lFE^_@M*j%k|J^hjEm%wSRUxur-vB>o+;K)@ya{Ve; zU*!4>SS@n>IyiFm{K(DEvtvKvXDMwLx&AuXT&-_C{gLZ$fYV%m6RsBT7x%z#Vdm-` zmhRWzE_{*e@4(d}*WU$4uAW`F`FU>bN9>ukUF7w-$6&R{^-sW&tM`W7{Jb;lN9=uJyU6uV!RBgx>*-IwTl?V4FrV`< z@8{)|-u2H>#$DeBUWNIbk9pUxpj?^Y3&6I&kkb2I|M#(3SiIl<3T&IJFvqo!@~<)V z&3qBd|EAE?eP(ouyo@-1i)MdmoWFysN1VS0t7-Af1ZcPOKD4iY!0gYu5&Iv(JjJPz-pS~)QqJ?j{gp}KkMFF)DG${%2AnveO!zWpKC@!XB2XY5CCeRmb#7(WIZM?L!f z3Ao(%|DdTy-~S6%(_&7e@1LSY-|3n88M?mc`~SekQFpE!lY=lPedcqo5}Nrq7Wbj` jW}>M_yDqS8)uUZE*g3YHb1b)>G5XMY3cu$^?D_eB=v+M6 literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil new file mode 100644 index 0000000000000000000000000000000000000000..978dd98aca8a278bde85fc024c77ca5695bdfcda GIT binary patch literal 4700 zcmeHKdr(tX9zHj@xw$|HHzC-|`v#PPP`yOuAzW^>(~UaQVNQc?NI7ksFYF{EoH5vU3a_Z2GhEB|Ja$`o!LKn zX3jak@B5weJLjI)m#9}ORUfFCQ?&!Rq(HYORAOkpi2wi!!~pPNoer~>#W0t`j6&xS zbliq{MOHSAL(ZGBS1e8RVzz$j5uCST743jL?d|YL{R2$AhdDSFI~O){VJ2ZlVP+HO zLH6^YpLlP3FVB7ho7nzr_VV61tZ#)*q3gu}&;uSg0|Tg6XKUF(3FvEuyi(=5CKUj1 z3UYoX$}@H#^zn_+b&75@8zW??2M~*34i68GK=&0fl1z^BfS>X>A(N!wWUjC5X`99& zp;TG66ChZo9McFLbx~b;ihQGe)coaMG%gFvhabnzEz~~3E(2fAY4)$4Vok0K3}D4~ zSrc(Eo4AVmtO+y#WWzq}HwBvXN=l1Y`ydF|LjmvvwfvpzXlf=r|G$%reRnd#H(0Z{81@DlQxD1>PYJL~{CC zh*(28sz{93ge&$Y`yYO?FhF`QUiycwvFNrMUkjl;Q9mJ7x5n}^pxs;-j;CmkM&;Ds z_iw<<$SCE3q*JjroM)TfI3UUI7<*%?dYL6jAd+T>u9T=P9Cg)xAc0c`fs$=5>vp|0 zsl-}Y!iBc~8+pRAe&J47f`FYge_JEJoZ^=`gj>DF4zICQ&u1I-V_!eitU}J;spoI+ z$HobM#Y{`R5Js43ISyToeAuRJLFdTbBlIlin%->pIr?;UVCE^;^wOT)w@5bT+g zMbt}zMOug5CuZpfsOkn!X+W7gA3Kn!M9$1$i4qY@sMU26-5Q~D)kY_$CKwDX0A0_5 z*j`VVN_Z}IKr4i+b!2e@)r-htzw$KhoAC!5a;wU@N4B;jiYC9bhN`-`MRR?R&i(#q zyRkAcx7@Ed_frB5L@h@sst+rql5kW}Sh~f;q=Z&;r7?KO#nF8+&zhL@KolKz5Iss- zr2@0h7nK$z%{EUHo&AIha~osuf~8;Np2Rko7yF2rq54Y z8XrF2H9gTY9)sf-9@j3Pm+;NLTcxkx+*eb&AI?&j-US;D^o-eTAMS3iX*$$*g)Xl0 zJJ!%ZQM2|fY)d`c?!(zurz!U^r0A^d-28Xi=(l`02NIj&JO6t0Sk=M!;{3zoS;@%? zd8@xJxZWMys0g7q0c^7iD;vR{1N^NNzueWY2-g)BpV?fV9UTv&iP4qaQw5VfW8(wY zuAW5IfwJ_+-lLP=leNbw@~wi)>xEC)PitTQy5_XKEQfJQJIm|h8>(JA7#iQuATBrr z`%X-c^q8(*y*AM?0Yi+RL<_=Y>4#;>i`&zi+ZHCDZI5%4+hM=ps@KYo!EJ0155l*I z-RFF`&kJxStZOuEg9OR2sZuPRb|u5F+`%ue-zpdf0XGTZZAQ32pp58nhhJ(6+d%U? z&CH>_+>CBjhMk+yyTiWSz^}sjl^UzTz~5=$R_w4EC~I)9NI2;bj?w{J=z!}}!l^vr z0-`JYnlx-k8s1SDv4P%`cFvsEuS#v@rk^vXTexYxQ<4fjzp8|vTw>h@OU7z&SPgo9 zMJ&JKE)RzN4P$)K-4r&~6xQo0uNH>&NW;2} z;XQ@eCk|C=w<@*XQ@-1o;#8&BxM|ML`EbztrnEtBT1%&(OUmD6;O>mIRxsA!D=OjL z5p0YSel>#KA%xQ|;ix0vo4kPAwAt8=L|&D%19cRJvj$vBu$2NJ5!O5dX8b(tZ!gt; zS@=Eak6vnY&s`Z&ak=lg%1~TzRzL5(_^jn^@goAx`>EfjMSj$IJ23=*QDI(rDt+?h zhR>&y2y**{f{I_+U)<85SWs~OFLx0l6Bh+@7I|#=-su**MxxuweEw*-cj-D`mrsb& zQbnVX;SaEkp9s;a@r9krA^AeKOV(IkZOYy5vR@lL@MDq4MG2Q|`z*`hfB2j9zi(Hg zw9P!k)TuHc86&5`0Q?A;G>R1(&!Yav2~vazxMjUcIY@zJE*a$Qixs`cdzSex`S#=!f(!@xQBg>{}{}vU&6_FdFJzVDtmMvqQb> z(h`hYAm>B@GwyaUs8{f);{8C-!%+B~^|6HIg;b8^XjSz`zi+WR?h_@|yXD3OO9{z! zMF?t)J-hMU`#x3phtK$IoD&n-UZ$gP6Lq5A!B^Pk<`~lOH6V?8`0H%u9aO^lKh?V! z+;&Dm3Ve&pL$NL@249=0e?;s;4=C2y3QKUkc?62!rK1#e1x3~-C8%48fqw1BXer+8 zkLnN?I!gRZ)k-^CjFhbAv%CUQ|NP3Kp~>Nbf$pBG6DLtc5Gxq6_V(7~wm9U9MR57T zJbT~k+ATTyLHOt&QTav3AKbh%I=;9;Os$7rK`$-DmCWOb26f^D;B^YEdZ_6;7TWn`JXQG#L zhC7Xc^(?20@^ZR}Mup@!%Sp<;oFwuz64c^}_aiqL`K~+Y9Lt5Flc}xHRR-B^T31}J>@6$@yV z2I;#1A}0i~Ye`-bt|?q71w9mxOq2rx<=&X39rZV@AeZ4xP8O0SAOPxS26pQlW_eIn zAg7VkF%LrSGzQ(EN<1O?KZZ2y;HBewHl&bqCrn?9+>z)5kUIi>AI`luB*LZ>iv?rF z7s>962NE5%am7?*ZjXG)NvCm_T|QEbaw4ONyQI-|@x>%ZZAKXtSv4$Qg1rsBCEUBK zY1c*HWTuvf?wa(ULa*1hNP`KWQF(5^XV!&QV8^Nfc)@$oZBZ=fL$mv(~AmOXcVg(5b{v zKQ3~GU`OS9^g|4O3W=SVeSB3Evq+Hqrr6zrh*opyTw0(5uTZV3#5}}i>K9&+^C(|s znuHs~#Ns#epmWEJbI9m%Dr2B?X0|O8sOG-virNrVN?#?;p~1Z}A~R2^)%-~f;>jM) z;~ZZmKWmxhTNVzOJTPau<~tS+a}rqZ@~1R^W8oJj4-~J_yp#zxlM|dLSnsU0nm1WE zWny9BA-LuVaLqH;QIY0u`I1$2#$CNpa8rumrgXzi`DyWSbjhrC<1Xzs=)KN*{l|(e zp{|Q2WXY;?$gTuxWI- z*y5c}iso(UYGhAw!tl_(HazDTg-PcvSKN`T+f3GkCzh~h`OiuBmGd}*BswUsX|JCn JXP*{N`Umj%{0aa7 literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl new file mode 100644 index 0000000..bd14f79 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -0,0 +1,140 @@ +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +ByteAddressBuffer _49 : register(t2, space0); +ByteAddressBuffer _161 : register(t1, space0); +RWByteAddressBuffer _250 : register(u3, space0); +RWByteAddressBuffer _266 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[256]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _49.Load((ix + 0u) * 4 + 0); + uint raw1 = _49.Load((ix + 1u) * 4 + 0); + uint raw2 = _49.Load((ix + 2u) * 4 + 0); + uint raw3 = _49.Load((ix + 3u) * 4 + 0); + uint raw4 = _49.Load((ix + 4u) * 4 + 0); + uint raw5 = _49.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _37 = { ref.offset + (index * 24u) }; + return _37; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _168 = { _161.Load(84) + (ix * 24u) }; + TransformRef ref = _168; + TransformRef param = ref; + Transform agg = Transform_read(param); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _250.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat)); + _250.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate)); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl new file mode 100644 index 0000000..62da531 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -0,0 +1,153 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct OutBuf +{ + Transform_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_49.scene[ix + 0u]; + uint raw1 = v_49.scene[ix + 1u]; + uint raw2 = v_49.scene[ix + 2u]; + uint raw3 = v_49.scene[ix + 3u]; + uint raw4 = v_49.scene[ix + 4u]; + uint raw5 = v_49.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _250 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_49); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_49); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _250.outbuf[gl_WorkGroupID.x].mat = agg.mat; + _250.outbuf[gl_WorkGroupID.x].translate = agg.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv new file mode 100644 index 0000000000000000000000000000000000000000..6aa6b941f2f2323acd46ccaf083aad3cc57cf7e0 GIT binary patch literal 8324 zcmbW3d6ZmL5rb}1)vxN-ty}lrn(msr z-@GK5lgv;0k_B^;^jnzBg-Md_liqY396Dp@psDuAL8}fsM28)duGCMTotW3d*o{n- zYYh!oB3qG%k)4U5Z6RVX=O|tL&1Jk5rJL0Uhf0GRHVh7(SgK4;mZsVl)~ls@tx+wF z)LPX_ySAySUthtm)~M8{M~V(Tcog&KVQn^-oY^Wjrbe5siPNj2sIWaLFK&*GPF33p z%ntk&d`q>)NOkkFsdj6+(q3K+v3z>PiZ#Jg}9J z^P4EQ(R)+fdbG{4UT(8ZW-dr}W1LyE)T)+89yb;IyOLe;&+=H5>;TruuSht&H_v!t~0?{ z$?+A8S>Ao=Jsqo;2J6k?a{c7Sre>wwt~DDcyO#5=W{k6a8+cv4-ZYN)N8NAs{&XjA zM?b#V7_E)z)0^u0G)mQab)wp6lTh%Ud80I0ZjVn@afhFm`uip>sExG8xqqd0tzJ#% zWHw*jc(pb*-kvo^Jny~krE->J@V-qhLeMko_rg!MEA=qM`o5Put1!*X>BClTG@46jTdgr#*n3a958pEf&?+yOVpj(R-5n&}aHI*d8zE7d>A6u|2*!c>sRHbUQ7oHC6X(PPd2c zoYv~0i~{$j|4-S++0EAZgRSQDq>aiP`v`t&b)wn2(Dk`NF2*-eorq(c;Wf!uTdk&h zbY1=JJMd`?*PWlo7$p8dsbBU#em#2ha_`xC=7+u?y5Au4{r2c<(z#bIOpi++YiouDd&3)t~cjmzvg0} z)Guj61#ieoIOo$sK8kxxs3(Ro+fu@fm@t&=M*^mmwx9KIO{8SUV*b; z$&C~^`;^>x%5~55qBx)EP5StGALY7I+@1ER;(go#?|t+xc^~Z;o`=7qtZ}^Oh!>px za@N&(_cMA|1`vH7VDyW5<{=k(__m9AsGCRF{rzkn?$`M9SkU<1TjR$&kNCkwUPZs+ zn=lXO#yYuoK?R@cW_{6?ZlYT6|kBswId-h)B2JrPw zgncRd@I9#Or)@Fwy}tv<68x>Ny8BW0&Z@h1Kf3o-ANi%|-c>o{xt@2;_;Rl2JrT<* z=D!c)05S(TAlLiBe(LVU8aX%Y2Y}6CAM%YJ>D~{9Q+Mq{av!-hxo$snUc_Aq_S4@V zu-aTN>_>qkFZs1dasEl@o<;ckMy*xor-SWl#y^kIT39db?t2(9zINY~c5A7ANv>OO zYySpDb8tQFZ)BW<$h$vv`?;L zxft|yE%lyU_ingu^rQZ~7e$@r_1`Y{H*eQdx3?JcKu*~^V{fre-f`t{VrjQte%`YO zV0swS7>rjS-cfb$(bWar_k3+Z*Y5An@VCGC`!jUyHxzX3Hx_j5{ss+y8@4)(NyO&Wt?&X$-N3d2OMDd(1r^tj)Ndg?iXd1Q+8DqRX9xMBMe@h^sy3o($Gz zT+dWJY)=Ijvv=5&XIS`amE_r zn!|`|s_SP=d&P4R?e-CIv5)M9?zcN)A6denTgAlyJv1hkhA0S(Z6i9tde6n!}AGy!#J3p9KTPXyP? z2P9FV1cf$&^+Eku=s0hp^}R~#bHaxj5nH6RMN2JM%A-@XJZiht@y*P+QD$77wPwv) z^Zs~iy*X>0bM|lVeNOg0d*}S_RTn5#wF`R!UvB+lcJY_fpI;cxzkmP$MuP!h!#N#x zE$mxiFM)jo_F`zLhkZjQ*aST>7DTy+qs=SDMeNjTP6XcDbqC=UiIj}wWaG+dk$khtEVafwJ(mg5AF zBq}F0d{=X1zfO^7c1~Eo*@rF5V&%bL>DWS@QD~ zFeU+x1F!@ZDZ=rRy_6HQbhr*j0c#>zAJf`Ns^2&OI;%f6J`(;!9E@Px@DMS^N=%Uu z{yMJMm&$wlB}0((QJnN2`zNE`-ycvTlwx~LF$!hgKk4|TL;T%v1VOtgc0~ufl4O^< z_~rA$u6dzc&!!!QSl>S9RxM}m*0ZaISW^VM`l+X$4|jO#IR#A}Z0J)u&WHOv+rh6{ z9i(Xw)Gqyf^>UD1eJ6GDP{NAa9m4q$Qk8#UQ! z0(9BYP46p05V>CjKpuko>nSnZEs_YdE+=Qn+AEmqHdATDOgfGeh*XO0sN4Vx5w;q*dkvfyp;}iKL#SRUE()xap=aX`wdd4UqKC`7 z5k+U9ti855wlFrJE%tF+x49-Er!uf8=N5qpvF(RRa*$;pQ*caSC@Hg;l0$8%bP2xb z>O|nuzq6RqgqY@*hZs;2=qJfMkJOk#cu^p%SB-)aF`j*!(1zNi?`qCBoaoLaez)CX zN*7|-ZEAa;$4su!exXx`fKq4gm=M1);|%pQ-owVRB82nPNj)mvx{BnF6T~xpnC-Wfr+2QRfhNW zEdHo7Y$wI^yIDgA(F~s|!--}L?s8Tc*|j*kMq@V`*}IKs^)9=Sv`YpB{23R2f(k05 zg1(;R&+7O|ME~MD(y&qK;$B1ePHM00qE$AeT6G>xzi3UbM`eSvqG~<6wwRq-Y_EW$ z$!>Jnje2%<47>Uv6K?xUll@iPUM{v*xa~D=bNu~T{{3|*bB_oPpng33t#oQ^-?;nM;rCjnRv&yHzinZFAA(>qPy7#@px*hd{b zu^YpyKfdv1O(@R&K)>R{xU84oj~nMQ{(047S;Y0e2MLStBMR%r)9Euu+rOGiA;eXe zw^skcd8BNMB5CWTA08sa8eDJ-S?QNIiq!e5z2i~um*SrqAAR&SJ3o&Q$W{p^poTR} zYXSv`R*fh3DaGbwddVjOftTcGR(z4-;=TQS`oC4FG0I^bH8ra;pmK?v0>dzB05pmX z8j)WDVCvOPQQw81-*f4^L-8vs%g@pLq=jTb^VKKtK7F3AlA(T{uLhv)f5lhgr+n4_ zpL|vL-}p)p^&DR%{CW8*9zJJcq{Giw!YIgB!l-9_<%E3IuO%EkTsb2WSn(d037aP@ zvgiQd4wy(hdPfXV|JEu-eOs;V`ma59*At@H)*v@0ttCWXD;8nqm=9jN@Pt!~Uwf7F zTJ(~L?$Ue`w~!~?F1ErEyUv^r0Y239>=eMnlniZ&YSS#Z?|_55_k7X1C4^Oim5o89Rxog^B44KcrGVOzd7} z&a-nr-oan)Jfr>1xBJgHOV^oNqGhaF zwb7RtkR%8@=j?q>JyO0+`uTn_32+PYTh`mbCC4}j)kb2+I994n&fQTWD>Tu&<%d=2 z2-Y)WW9ZS;UDnIIpfGRGUE5OrU44096uZLM|9&4!J;e12zT$@xiX^bMPF5nvZh){q z-t!=WWYlv(`k_@;Fy@*-!jH^8)fs!AEz0wTGiUrmw)ZGD6R)6j2<}b8&;=&sff9lk zZyXKKGc?e6m(hUfguv{@T!Cihc3=sZ8PDPp0P8XBZpbiN$Ahwj41KZ!&l0i>$!75Z z@Dr|YqK6zw!{P)vATTs`;W{M9k}uB1O*b)V8Zs(0Z5wm3yUFEcK6INqBz3Z9g|0Dl z&b09KW!|rb0zbQJdug`edWSYMq;xK!_)ycDcj`(WPZs5ieOFIXuBN;qGD3wv6f3<~ z2}Y)km`ym%ohT1Bf?1sTXaA5Lw})VXLzt-)~(pM8i*yG_=qVd2y763z{27w$Ks3IPPED z*&x~m;lzv6G~8r1*J%ULN25&ETtX~N+)bF)utZl=E=;S{C(q0xXA>CaCIMdOl#dq~ z#-ZEyyvd?3LVZ`MG*=S`tJ9L>MZL@jVa&t9VEWk??7a1AuIV2FF#U5SUhtUU5fjSh2*M;&tifF)H*2L428;_m~Jt9n9LpZ4BMnm+837 z>n@MDeAhPS4Vte~1&i_lmc&aH9c)^BsBW-pvS@8|L{97@_xfvB59VZ^+&cGME-*$! zW#*&NAFf?J`bs8i&s@S?YZDW#I|I;%1#<7z-ti2%k$3KMvX&mUtl2x}P@u5F3NqG5 z{z-5_qu?@}TP-tE-)EjkS>X_U=Y_aCa0ix4@V~{lJ0&eI$l;|6 G+kXJd(PT0J literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl new file mode 100644 index 0000000..d447db6 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -0,0 +1,94 @@ +struct Transform +{ + float4 mat; + float2 translate; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _89 : register(u0, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[256]; + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _23; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Transform _93; + _93.mat = asfloat(_89.Load4(ix * 32 + 0)); + _93.translate = asfloat(_89.Load2(ix * 32 + 16)); + Transform local[8]; + local[0].mat = _93.mat; + local[0].translate = _93.translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + Transform param = local[i - 1u]; + Transform _119; + _119.mat = asfloat(_89.Load4((ix + i) * 32 + 0)); + _119.translate = asfloat(_89.Load2((ix + i) * 32 + 16)); + param_1.mat = _119.mat; + param_1.translate = _119.translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + Transform m = combine_monoid(param_4, param_5); + uint _208 = ix + i_2; + _89.Store4(_208 * 32 + 0, asuint(m.mat)); + _89.Store2(_208 * 32 + 16, asuint(m.translate)); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_root.msl b/piet-gpu/shader/gen/transform_root.msl new file mode 100644 index 0000000..8b4b2a1 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.msl @@ -0,0 +1,129 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct DataBuf +{ + Transform_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup Transform sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].mat = _89.data[ix].mat; + local[0].translate = _89.data[ix].translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _113 = ix + i; + Transform param = local[i - 1u]; + param_1.mat = _89.data[_113].mat; + param_1.translate = _89.data[_113].translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + Transform m = combine_monoid(param_4, param_5); + uint _208 = ix + i_2; + _89.data[_208].mat = m.mat; + _89.data[_208].translate = m.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_root.spv b/piet-gpu/shader/gen/transform_root.spv new file mode 100644 index 0000000000000000000000000000000000000000..1578842ea8970357fca4c1f6c7d85bd116242059 GIT binary patch literal 5336 zcmZ{m`vop#zCmKNB*M{!)p5li|-H^iX+Vs5~${JTP>2xw>IPdAz-;K3cBV8l&ZrT5Gi0 zu5BFEuP^6UYgFqKBYB5jJn}vC($zF-tZr2r<7=C(F|ggJ^Bb$Qk*B1xJ+<|#SK5>! zaw=!Xr`c%MM#{C3(MG%0-n0-Ewpr+0o!dUH*sKG@jUtXA5!W@B(w>f4j7 zhSyQnQ@J;}0KT^?urpBS7c)?P?+kj9i_ovCv@2&#tR=2Dm5m!gC%6BL1$<)rt);jp zX~8#CS{3K+-AwEwmy!;0W!*Y#y~!5v`11q3=WFlSM=m8hkz38n(mcAdQ=ooGA7|_raRbazT?x|P`5poJg*?A2rm}Ogj}K+`H!}H9_$*$T z?`%HV%qKns6!mA5pU0odz8&ShOxAZE@-(diA9uC5>3{%~!|^Eu0m46B!p!N*#& zpd{&>5%NK)oFt2gHvV8pAN7)W5&5inNTyS-&RVk}W3(Mi4bR>WO@geU?7jHe=UljT zq_8gm>u>y0ct7N)UHSM_>c%ev8|zuZz5;B#`B%cV+pqFDnQR_s`U$w$JJ9|~xUuS2 zWU~9DPWz`aY)sVqG}!!euzv>b9MtK5KHU21y)$<)`k6~rcTU|ru)C#f+-Kq5vpV}w zcHh)#zcj< zpUJ7+xpN`+(A__cGcSb?L+Wq9NBO)==kA@()SQvJ@V^AC?o8(H{(KH{r+kmv^mCs+ z4{5WuXa54^eDymIo#!Y+`uXYSJN_c1uYT57*3Xzecm*tJI; z9?RtG$mOgXkUIMdpC;J(hP(l6j!qWy6|g?)W47a9XKOy=ror2gdS}qeuRv)a*m!uSzqtUdvo6MnAJ7l{H(4;Ru^@(smt$z_ZIc816yx16mz^Dtls(^xU&Af zr>{Xf?>w2R2Sy+`&)4 z-hsBbr~AR`y@$Kt%3*r|tWMi@xN_JY1a}~9JK)N(*AIc+Yh$$O(}An^4&&VN_q&IO zu@&d=2s-uA!=vC16zBUX*z@UUO=W$o=?*z-x$#Tkk3r5ceq$cbbW7mIYg3osOLwT) z^K$a}o}YmBgN)so$#JhwA{Wo|Gj!@7&UWVKV0FF)cjgyxKX*pkQ;<5(A&%TngNwPJ zL8m_U;g?`_?n%t+S77I*EoS;_u=+TIvgePwa&6DT?Ni%t;Pxc<&HfJl9OS3{dAN3Y z%>D(iIpWQ}2v!$wR{6J(xr`M@?C-(GM$dl$J8%2i4OjMz`@{bTt%i&V|377V{r7;C zV|QNyySwJmrq3?8`l$ahcrYiw0=A#nqgTQDsE?X|2Ae1TFZv7kVrUMefBfI`SFpZ* zp5Zn4-yr?O48S^^)ACNJz%gXwBAKr~KmB*R>3C^G4U&!hrUYol7 zeS0@=Q}txXzdL;c?!Vu3$QmDjzXP8Q&-*!JzZuX;oXNX7AcqeG8~;JLf2V3c44Mjg z#<}o$kTK51b7(spQa?;L%KeZ!$nE@+?4BC&f1zAl*nm*7`8i z59u2{egs_X@kn&)Blkza>Ll}(ay;@ZL|5$bD0J#0&(UCYlIN4mB}I?NfUT$R=B)2U zV0+gV^%jGjckG<9eqsL@xOlE((WxJ18h!BNAa#ZA54V0BW=G-{uUE^7OG7`0CWYm3^agUzGfS=rASkUeR$ tpOq=^_TyR1rQcbZKH@%(Y;0HGja!B6jN^?d>t_z%(%F!9-;db$^DYGj3U>ei literal 0 HcmV?d00001 diff --git a/tests/shader/gen/clear.dxil b/tests/shader/gen/clear.dxil new file mode 100644 index 0000000000000000000000000000000000000000..a79182a020b90a039563546fd57809dec320a586 GIT binary patch literal 3076 zcmeHJeNY?K7T;u->;|^W(nMZpQd~eXJgjvIG{8Wed_ke4H5lxxBdr@K7@Q;xBt`;` zh7hu8n~q6=5z1IYMF$_G+}L%54Khr$1#fzwR7XXAuiL2!aqCf>2;ff$D+Q0<{3W z3e+rsARka;c_od3`I=N3cjT?jIeYg_bAxdY-3sFL+Jc+x_dpWX$G}Foy>QA0DglZB zT`x(i6K=1Ztl`(<>+!}Kb&V6efPoUAeE_K2TdC(3kYHUch-%n3eT2Xm>=HU@Rh36@FJ^&^rYK!zWY%ivK21sLlB0rIS7h)1+1;>G4R1wsB& zgpBtGH^JRv>ZfLsd{LnK2}m!wJ@HoKU{n%lAg-4b>3!m~6n!Wtl<$~Ew41VNz5#Ej z#u6$Fo61y8h{w`lc#0P(=DwL5e7pU?Zi4lpa=u8nO;VraD1lGi-aSy&?tuH7$9U?& zq>_R5aBf|0;_=*XkB@b@^Qs(4^;PeXh{!kCN8!@~6Q#ruwaH@h(lV(Z&X8lG=;H12 z2fZ{U-Ke<_t79-mJEtdKV5cqyV@O7ma!V9WnHtwsBNl9EgD;;qj;J9eBo$1)tVAyQ zO~dp?tEtH*LeBP6_!-1xP~z{>7AkP`OQpD1V7N6UZ;bw5-ssE4sZBPA(;@PiGQ!K{ zg*$vN;h!J5c1O_SrD^>~vn#JJe{^MOcJGB}$IhR#GLsD1nMZXF;;{u>P#}3>7+yUVH1z(cW{rgwO2+Yn^U;gW6_0=rlP&3Wt&ZB9V5Ys$cq4ZTf}#k4xX3 z?^*a%0}_3+=fm=U$qNg!RcrO%1?r!Vd1TQ=S)-V@J)kY}O$9X`)ZL0Y`kL`&DES&m zeB~kXaZUQ<#O5~!WsP(~9M&m~!P{rF+k^1-SZmN}N82&fUBTMzXqz2&wX$}KO^c-v zS3|@ioou6%-(MkC3`9N|-FziOb}3`?q$$0TK2-8*XUUwlcoZ&uwX@U*m&8`YE(UEk zqlIR+1&kin9%Ahb>dHY~*Aqb6M?CBU7~3piTl!dcU&ofuR*26Qh_A0_!}5>k2z z`udDjs@_)n`j^*v_lkAV-Me)lGTocNfNLkcmBUQ@z2AFCJJ0GKtGBQ?H2C-34Twgpb^^n5()1tj9@$S zJbGae#IgiJ$jd}(Sh6w}Q1d;MBqueq;-36E2BI(l;?idcFaVgVJ*2wh86l6Jr;!8n oSrVHF@HlY6CZpjEvj*y!m=CC75Ga*(BVn@ayrPPtK>E1+05h(=djJ3c literal 0 HcmV?d00001 diff --git a/tests/shader/gen/clear.hlsl b/tests/shader/gen/clear.hlsl new file mode 100644 index 0000000..f6a576c --- /dev/null +++ b/tests/shader/gen/clear.hlsl @@ -0,0 +1,26 @@ +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +ByteAddressBuffer _19 : register(t0); +RWByteAddressBuffer _32 : register(u1); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _19.Load(0)) + { + _32.Store(ix * 4 + 0, _19.Load(4)); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/clear.msl b/tests/shader/gen/clear.msl new file mode 100644 index 0000000..d89853b --- /dev/null +++ b/tests/shader/gen/clear.msl @@ -0,0 +1,27 @@ +#include +#include + +using namespace metal; + +struct ConfigBuf +{ + uint size; + uint value; +}; + +struct TargetBuf +{ + uint data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +kernel void main0(const device ConfigBuf& _19 [[buffer(0)]], device TargetBuf& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _19.size) + { + _32.data[ix] = _19.value; + } +} + diff --git a/tests/shader/gen/clear.spv b/tests/shader/gen/clear.spv new file mode 100644 index 0000000000000000000000000000000000000000..0e8d1d74b838e81594c723455a21f2ce3624694e GIT binary patch literal 1212 zcmYk5YfBqZ5QdM5v8i|Kt@RQ&rmZcC^h1?GX~7Fb!EgFq)~aD4O`t}R{wjZif2vjx ze4aH&bHdBayz`!!+1X98P~CWgQ`yxfEYLbamEuYkQJD52=RS0Yy?mtC7H=XDy*<9f4wk~a5S zSM6VECu_I%chQviS$n75I?o#0m*@CSlJ~>E$o`&To;iD8rOiu8`%IeTSf_$LO8+#@ zIxltPJf~?V)p{q<;nbPhXA}!yk2v;yo5wG6)QCM%;~c#jV>RxfP7mw2U3)m=GVsK> z`V{8zBK`pI?q2sfOvj$q^EqO@1i!bQvGtt6`k$~q(>pN9Tk<_sx$S-ZX*BjrfF~j5 z@O;)qrj5I-;JY(p^-Eawa`rP$fLyG# zxMwx;Vv~1gJ>R3p-QB_Xzp|%0_J5(>f^|ouz&*OVA87152IPNILA`OzjpNIC-yO^X z_vUV*#_v-8290}}1KyMWM@_$V&prLj%>y}S)sJn)h(>TTJdeA3y?} U$NN5moVx$ikA1WcEcY9D1i^Ydpa1{> literal 0 HcmV?d00001 diff --git a/tests/shader/gen/linkedlist.dxil b/tests/shader/gen/linkedlist.dxil new file mode 100644 index 0000000000000000000000000000000000000000..231f0f6084c0cf640edcc1fc14548f61f856fba3 GIT binary patch literal 3024 zcmeHJ4@?_n6#vQ{y@OtlEA6&p6ug2X+d#b*C37Ot#D8&q*NucMCK`>2Ccfmo z?|tvy_rCYN_xekVw&@MojTcYM;2gjKSD(6eMZ%vnJBtnKo?U z%G|!0whMdFbw+U+Ojf8tvTs^=7FHP3FZC`6D#FyT?SgUxi0cQ%&<_#r_gM9e8HZH` z;Bf|kRJdx=OmWQ4J!n^2pBIWRy}f=#qIs`R;_Wk2mUOTNDCY1`kfv^}6+dE$u6JwjI!hFWGi=fS;eZHxCbMKbis?H&HU zee2`X^xa3XHFd@FU%&oByYq4St6XH1vYrN`dC1HfCo^$Y15Q9E7(@Eke{{I27f9f7|H>*pXlFHI^6e{=j1{h*QY-4h9l(SXWdRS$0JEF?K6NhJB z8maQIjNJR~xdCcSPaz-YNC#AoTq=YBIV$=TRMU9zXOua)U3J8Ca?jDhE!43GJnW5f zFL#dJ85wXhH=EB}idKS}oT2%t&(4fw;1PY1`R4H&;+5(rH+7#^em;rXtfs!f(;ujN zGb2BDztGy1Sy^^y&X||S(cMz>SC6LBUN_BDPIq!i6}HOPb~}sHGo3H=B%)hK7q(v- zo1Hr|HF*pzl52FUPO-VCG=uB7GM@No7kQKKW%Kci`(F0dl(-J4BF@gtp4OMTk~4dH z5_JdR+4;rkG0)`W)cnvq3^8{Mtw_=64r%gk8`Sl`o|E_H;JWbHJuuLvS3cZ#++15* zVu8~x)Ori7bvEu&4$mm76rv3shJ0_tr*~n^O_(!KN6dkQuPO377nx1yQ-{JSANDG% z*x0CBTk<%vVbrj}hin*c@-;ZHRvc?K@eT*p;y@akcn8BLjmyYOesYFQsAUtrTp$-L zWHvRNGNn>}s!AEMrB<Ea!J-eJWW z*J6!7#KN!-yZHNYzK-VW1AKG9oppJEygW^QA0jWY$d1uooovH7-@9+0WfjpCdDkt zX-{9!9scpTWC2S9*RZwfI&5_*h zYvFFxRps_SD~kTkwb1wPVk!i@6C6#%S2~l$!%PN#m%$nZ)Lp*|sLbh=PtWCYfk51l z-%N>jMJ&>Y*b`X0+aN0j4O{;l{JH-a{vLwxH#RvxyBM50GdT~TZ5zl zfLz!YIl3uJ_%A5TZcY{08_Y*a)WO-3ykJgApJMFFl^_tw;*U@wF22BJjsxl9IN-5{ z?4^$T&UCm%E{3K)_iPP@vpCkG#zWae1R4vwl&MWAg4WYHY}()!P7tw(XSO))D{Co&7VXWNl)=v5vE$E)Po$F~pGd t#TepvjKoN+2530+Ai@Sc>cEJV7zuW-p@|&`P2{1g=y46jx!_v_`VAHtxDx;X literal 0 HcmV?d00001 diff --git a/tests/shader/gen/linkedlist.hlsl b/tests/shader/gen/linkedlist.hlsl new file mode 100644 index 0000000..614791a --- /dev/null +++ b/tests/shader/gen/linkedlist.hlsl @@ -0,0 +1,39 @@ +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _56 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint rng = gl_GlobalInvocationID.x + 1u; + for (uint i = 0u; i < 100u; i++) + { + rng ^= (rng << uint(13)); + rng ^= (rng >> uint(17)); + rng ^= (rng << uint(5)); + uint bucket = rng % 65536u; + if (bucket != 0u) + { + uint _61; + _56.InterlockedAdd(0, 2u, _61); + uint alloc = _61 + 65536u; + uint _67; + _56.InterlockedExchange(bucket * 4 + 0, alloc, _67); + uint old = _67; + _56.Store(alloc * 4 + 0, old); + _56.Store((alloc + 1u) * 4 + 0, gl_GlobalInvocationID.x); + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/linkedlist.msl b/tests/shader/gen/linkedlist.msl new file mode 100644 index 0000000..0461d79 --- /dev/null +++ b/tests/shader/gen/linkedlist.msl @@ -0,0 +1,36 @@ +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct MemBuf +{ + uint mem[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +kernel void main0(device MemBuf& _56 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint rng = gl_GlobalInvocationID.x + 1u; + for (uint i = 0u; i < 100u; i++) + { + rng ^= (rng << uint(13)); + rng ^= (rng >> uint(17)); + rng ^= (rng << uint(5)); + uint bucket = rng % 65536u; + if (bucket != 0u) + { + uint _61 = atomic_fetch_add_explicit((device atomic_uint*)&_56.mem[0], 2u, memory_order_relaxed); + uint alloc = _61 + 65536u; + uint _67 = atomic_exchange_explicit((device atomic_uint*)&_56.mem[bucket], alloc, memory_order_relaxed); + uint old = _67; + _56.mem[alloc] = old; + _56.mem[alloc + 1u] = gl_GlobalInvocationID.x; + } + } +} + diff --git a/tests/shader/gen/linkedlist.spv b/tests/shader/gen/linkedlist.spv new file mode 100644 index 0000000000000000000000000000000000000000..a7232834d87bb475ba0928dea6e121c1ec952af4 GIT binary patch literal 1936 zcmZ9M*-lhJ5QYz{2r3{TZr}heD2S+lD1w6MfPzbW05dES;w*`R*WUUp-uMi}_$J=? z0B(ti-*@Iza$-*^_1E8jSJ&yTo~Fi;LrKz*v?p!Ji=;lflSY^%=}1~rUtD{+Ht}w6 zYhr48QpK^PDHW>eMAL$A#ww-VsuLYp7rb1z1VXi16Te3M0YM9j{JIwA+L(J=-C?dR z&9hT}RV>#wO6Aq+``Tt{Z?{%mT}Gq!IDR+9&B-`^V}J9__Fg^j6mh9su5D)XGx+uG z%F_PpB(2QYQQ5Afxn}EOitmAI<*lShb$*L>7Bim4w_?`4f;#ey5Ic(0CFcpLH}5d! zh> z_6X)se-!VWe9U)cT*TJ5h4oU^X}om};0svfH6Fyw=Q?B0#U6-w6l}jl-U+Z72B<#H zb`b0t8e88jyk{Zj_ZWMwa*_8o*rEOo-nsnu+|9VquMYiZ||vdqnEVGc#-6R5 zH7-z#lXb4%Z}*9< zE%sOh#~#kbH!Oi&`z_n^0a$J9>Ac@=k2@FZmcg!z_#s$L?Dq&9`>8dDoVr+31e+ti zX$36j-tm@J!T-Od$8hqo*AsB;We)d}Q|B9rGuIpYe!K&H)UpQl%|tCv!E*lJn9n)6 O{G0Us{N>MQ7W)TUwQb1& literal 0 HcmV?d00001 diff --git a/tests/shader/gen/message_passing.dxil b/tests/shader/gen/message_passing.dxil new file mode 100644 index 0000000000000000000000000000000000000000..2be73da19d9a41131935c2c68a6e89477f90dcd0 GIT binary patch literal 3116 zcmeH}eN0=|6~M3k9)8dGVIINj>|n?Pic6`LJq+PvhRk0fW|+>zjHMH7HUwrgK5gpp zQEmRhrir6IQb%(Q|Y{l)X-Holbd&*yN0043V01WCPCn5&H3(dj=r(I81McH8;0RGiq}fsYRFCiXZ;ktzFG#DeS2gwgoY)%3=ABhIv#pB>=L(Zd&iCRYJ?u#rZuw>tr~J)i_IhCL5O z%8*EsU|<4#*}+*8!t%M3i$KG>P$}H_6H*pXH){qWA1%IVrQc>1PwY>Kk-ZXxv39 zj;|OEV(R7G>3g?s{$W;zho-f1!;5?0Gr$^X-OT#*>g>||#^JHa#pT6v(&y8iPWq$t z*rnY3OOyQRaFcmIjU(A&kBgWO5)(NpPgYZtqj_)Rk@}VnlY1mqa^_ggsnzP6!z%+$ ztHyV?_M5W}Kak~TWd4O&n_L*}?H|0d?{fd(g>pWlIMTV-R&Q^8*;ZF?H@VO4lAICL zq*TAe46h{Bh-+Yq)q&uU5-`0k8w;_&Be;ehOEF=gw8Q~fo-$KXV-V$yZg*oI*@~u?q z_0;5POG*R%s`5jRa!Fe_ffRk{De@x9g)Nc8jP+Qte5>CEd#AsR@wb^VM<(WY90Sw- z%<2Cz?r#_SZA1R9A(#BYmheGX_<2ycNh7s$qtf+J=|ZGmwMgevr8BPNc?<6oMq4Lav=0Axak*e<;RF}JVeZ-Vz0Cz-I;!e(=S=p3Ll4g>y+?rnD77~#y_Gxt~t)0RTaC@v?ZA}*n8bh0gwQ-%oz8^!=n9kA(7Uk_w3(qTgh7@ zpy%g6m!Q#=Q0(k)$%LTJtICmkWAvdr%GbV=oWLRe6i)w5Ge8k44rT#BuQusL5etAc z4*lrw+|W95>66E^IgT2Zz2~=C{8PC3P8JXBU~w&+`=4NO<$s;UmRR_1zg8^XK5MA{ zZ5H$4V#s3hU$gj23bPC@hOz%CgCPVjsKrEVojr*+LqP^>%@OuCe97K%i)5fz%zLMC zx7iaeO2Prx$#(9Cu)MCAMPf98Gtm}j8?*az=qBC3_M|CH>nNp1r;nlrLKKSIUk zj^ME;-w8kY#dq^Z2-p0RCr$$&r1SW|Z4OtLnO|D!Q*)h^xImnKxGJ~S3@Dt3{-pXF zx%su@tX+!5m6Hc<_PIDPNL~~Acw8TItctncH^r`c)}l(TFn<3>$$`q?xtM+;t8c>H zdXRVqpi@*K@}9(FUYRw|1c8bqPUd_df8k data) + { + uint _73; + control_buf.InterlockedAdd(0, 1u, _73); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/message_passing.msl b/tests/shader/gen/message_passing.msl new file mode 100644 index 0000000..e48f48a --- /dev/null +++ b/tests/shader/gen/message_passing.msl @@ -0,0 +1,54 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Element +{ + uint data; + uint flag; +}; + +struct DataBuf +{ + Element data[1]; +}; + +struct ControlBuf +{ + uint failures; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +uint permute_flag_ix(thread const uint& data_ix) +{ + return (data_ix * 419u) & 65535u; +} + +kernel void main0(device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + atomic_store_explicit((device atomic_uint*)&data_buf.data[gl_GlobalInvocationID.x].data, 1u, memory_order_relaxed); + threadgroup_barrier(mem_flags::mem_device); + uint param = gl_GlobalInvocationID.x; + uint write_flag_ix = permute_flag_ix(param); + atomic_store_explicit((device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed); + uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u; + uint param_1 = read_ix; + uint read_flag_ix = permute_flag_ix(param_1); + uint _58 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed); + uint flag = _58; + threadgroup_barrier(mem_flags::mem_device); + uint _62 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_ix].data, memory_order_relaxed); + uint data = _62; + if (flag > data) + { + uint _73 = atomic_fetch_add_explicit((device atomic_uint*)&control_buf.failures, 1u, memory_order_relaxed); + } +} + diff --git a/tests/shader/gen/message_passing.spv b/tests/shader/gen/message_passing.spv new file mode 100644 index 0000000000000000000000000000000000000000..e5f56d6d6a95620755d8f6954abdaf4042835412 GIT binary patch literal 2196 zcmZ9O*-jKu5QdKoGk}Od5OD(;P(VQx*+r1m0hud(fKD?sWRh9PEH3xHiMKw5H$I2a zD__7RQT)H|uGpDl%BufAb!t1OM^kO%9nM{F7hR{zyJ+>hHZjh1yG*Ea`B(YbdgFL@ zVet_jmt87!#HUw2Y3X)J*)LYuxFzWm57!SNM|_c#{f_E|>QX_g{MDpj@rJ z^Xi4_S>V-!vR`Qw3w8OW<&}(`b{(qKxwD{FZZ-n%q~xD^#kW()xC!Yuwc`KEL%!WjN}u^PzZ~NS(zU=pj{3;BdBbtTf?=Ht z(#dnn!pT0?gp)Pz3GY-Zjas!tYw9iI;8-VqvDB;ub!_^}Nf#pDu*H3GHotM4w2WhZ zaNJc+e_xT#NIE6>*!=8VPVWGYmxMpF9FwpNXCoM5=GrYm6VGZ+#)GqX)Siq7CLVPX z|4^EE*Chjbk-W*QPIDf6P(IW@Dq)QZcSs}fSz&s^hMSK4dt`r562ra1=jYCiXrI~P zj|?jlGlCnD%`Cv=xhfsi;M`h_nUB?VE%b|eArF{7SVO`$q{%@q#Jef2l0u$%VCD#C zbq>pB;XfkH9x=dKX>x`R^n!es?AN@jjRzj#M8#wz)0Y^vr!(j_0z;0Y{Bs z?hMXyzK~7Q8J(g>?huYzkR`nf{CR^c`u`~=Elo`BX+nZFJlTTz?vnfSEtu~uxzG2O z#C&hyEoEhnw;b^SC={!7h@yEIt8H2~oYgiX+iGKvdZ-OPS)0`}E55a!hvMMh$2H8! zhT~0qj=9IO;rU*ui+g~xx)x=_vBw>Nt*0f~^aRg7cLrx?S7gIkPph)6C-&&$tER(z zo=DiY-iXH?!zbr&v7d^kE{nY`+hViFom%V-+3Y7{^Ijr8dK=G#$!GC5W#jWvbK<+% zmf*7~`Jvi&rCGoAw%FT|!13;}xfk9T-(pTeJ>T{2==DGX&zrJ0_DkpNJo=A5xD^TC v7H60EDYBpu{+n#f&MQM-T7ag2b1d`H|C)K literal 0 HcmV?d00001 diff --git a/tests/shader/gen/message_passing_vkmm.spv b/tests/shader/gen/message_passing_vkmm.spv new file mode 100644 index 0000000000000000000000000000000000000000..8527c2bae4567003bae7f8e8db8578114729a358 GIT binary patch literal 2300 zcmZ9M*-~6p6o$7zLkJp3B{5Dx$V4;|jb;=R8$!}xP%~6{+jRm>s>>dzraS7rZ{n>_ z;f>GXaODd)RB8OabIz)qVpFTv`v1N5n)W`7kDZxH>7{fy9ZC;Vv5uxOn3UAN953{* zXnh_(_^Nq-<5Bb3sPlE!Yj#`R{@}Z2x4+Zsq?t6CpKEMA-df%t?kr!q`hgLTr13nk zzq9z%&=W{EYxmrE3z>tLJ1RkP^DE>0jiG-6Or+`5*viwFdH1*bd#&bvtDE(P?d^U1 zYIs#+*U}Vqr?l4^bVtKhbGMT{ZMMI?FuJ^w`%bgBlMOSt93KPU?XWr`EID(YK(FJoHrma7#I873!|P;;n@@0J)tINfKkYOdo&J-ov)Oyr-_C~Zes6O< z-=9q9(ccW(FDlRZiL`{?%LZAu#14AU%65uACeyXRWy8R{&gg??BEsq`$K~jkp3Y7BI)obp}zJGit8JlZ$xPUX3RCSh>hAg6&OP{$lC>3ib~q2|Npa{ye35_MM&n$|*8ABe{8OXCXGv ztLUPJlvYaYe4?(`a=)S%^N8)kbrSpr+8p*`yf@L5l=H?DJ4d;w^AxsA|9P}~#t<)} z%`Fap=dMrWKU=~2okRS;5c}_8*DH7#+qV+&-pBSE3H%ZE_}q*0x4_mA`?s;zkmIoT z(9Y2NkDBje>n~@|V&6bD#+P85xWp-E=9`eSH{}`~=x?1a=lK_meHqhxKaZ#bFCD;s z$JKqm<0|(1tzy65Dz2A!m8{;&+lY7PechslqThmF<)n9Y8u7jOCKgKlJLl!SC(qhn z#JqrgFy=dO^0(=F2`#=!4D-E<97g2bcLs7%&qZvxsOJ)P)Z?BtSdYB6cVDgNGWcLU zAHvChU)FF1Th4d#bID!BmY+qe%h}6CT{p1h+;i^Y=;Pca=@abe z$vyk{jp;a_PZ9U!e5c0qUgaa+Dz>#myfy5I=bm>L@$O)|Z#?hLcy+XV#9PNUN8r2I zb>v58a}V7>^pXFQ^P6bbKfEFLHV`@Ax3+J?xAZyUcVj(&@V@Nt3q<}L5^w2m&c%84 yzkFCX5WgYMI9t~tV!Tk!%nta)^Cwus9+imz z5auEf3P}YCgY;;T&NPI%3Iz_CqF>5q?aay_wX=Q$yWu(x00+Xr2T;+Fx6&jyrJ${| zP@b*#-_no(oP;u|ou0A}!vn;&_*dC}Y$Z-qPA` z=cK@*bQ#kiFpmM;o382;*a#lm0KiH(Mual}y z$DOdc<81RPBTaifQs;3KRa=E|j2n>QMu01v7R-k^b7KIgo*<~)e7Qh56t* zl$Cy0HRnp-=BnF^ogrzahX?MG0W7>G8LRyP#DJlg{=R;RyUm0& zYmmQYjs8uP;NeoIN<$V&O=X49BI{yl0JCPQsJ=Dmrt(N=G2?AKpr_<8+4{F4$ZSJ5 zM>sDLkhBY0lrajE*At%b?0HNtN(GA;k(n%Bi`AQXB1~oG>9^PYtEhexbzSP@WFFvs z6QQ3$b%oGl9n1$Cv@s7gd2>WjhNo0prB#{e`%4bCuG(FN9+V;R3X>#9 zl72YSZL3RJRVCfJ>T?Q4B-<9Z#v6=D3ktCXGmkNrXdSIE-5ccPY$kE1pTIKSSgjb105>> z49mD>{s!A{r!4THMWt;rY|cCK1=e;cfzYL~;J8e$B7?a7d%uoMR~{B~Tm6Zq=blLZ z!@h6Il0Dz7O8(6$VNNDz@A!D{w;zw(I^R3;@ksx~J)-{N#rnH@uDmhy!JfvuR}KxW zroi4cEk_ITZq5Dj^MWttsy~mNzn|$j^I>Pof!$333#Eitz~i~X{-AZA}r+R~oxa1meTAMlfx0Xa;|t^@K@ zjziy(jfP|)T0YLnsi$>c`{gG(-v9iA?&Gf>>b|l%+@4fa`$SD~W!3hbM?`9pR^P;n?arYcX41 zzO&j=4NX+77oC#Fyt(SI`i+7G*Eb(ge>vLBkv(vn=c60lR}Ni-!|qS#!fKccsRSR} zHx^r>M3%et8P1?5n-|yBimO^G$O#bf4JCiT%Tvkh1-*g!m)l}Xn6N1|hk6iQ-mhQo zL6;BKdaAAB-GsQ#=(1YHyR2wUt;&SAVnS+zc8cw`>n|SFqS0$`@m(}aCa^jjqan1K((Cmx6 z>v_Uep>kEWxawML$-keJ|2`!D&M&{upoRTyvEyyAgH!dWC3Zj)+h?0UV8K82>X-HF zm$gjQUBQfIeTEy&Yz{sNo&IRgypCpe1U>y4@k>^;l5$m9QDH5d%i8aHv0c2YC6G1F zN0ig@Z_VgS|&)4RGwV|cYzA|Q6_V2CdQQPUF4!qO9 zv*|CrcZ;j$MPQdt44wJ&8wse5oqH1N5jrkk_?Fq@!9_0&b00jc`KN0|G;_~>n0>0? z<3D}xA7|Vh8rcuu_m_RarV0{-T0toCMN_@VMx|lH)BaHWnKH=@>%+@;#2eO9ip*t- zF(@If3KFRTF&T)oV0IpxE)2h>Q21$iA(|uq4%$>B0bmrM0T|i!My0^O@`45}z zdns^@+Bo+V?nT8z+>45z!95Sey*?A=4#XlVG7jiPk-$Op!WXN^zE=x??m{e*eQ|m4 z?wS$3@ns>ZKHXvhBnv@HfzZbvU1s;Gr10)yxUza$>o2*+B`FX4%ia<$91#4Hfo{WE zVha6u=SdqMJihpt^Wr~uG!8dt;udayYp>mH2IiV0X)BtUhfB^{qS|D7gYjxkbeN(I z_{y4qGEzkEl<$8$jY z66$sdFt#GJWd{G&!Lk2U{7WGC)2xb=Y^_q`eKf?s9PV$xZ-*V|qA>RnT6d$NzIKji zUW@SnFS|6E+{Z8n79whA<4s*gG%!)5eFo-E&%#_Vom3NX*Sciq>XA(sG(-EFKH1#5 zdETgc%VG8We$}sAV&8hVGd<%&t+(y+A9J0~7CH8|Q1YUe_*tx;v}=jDU2DS+Q`R_iSs*oW^?HHUxHM?BMekiv8`n9_{&1F5wfUb}3K{u#9h1 zw_cR4T21iZ$Rz5HKnJ2AP-YcFZUD|<5{>4cBt9_*km%09eiaP}JF-!1p~9+|BNT{M zb$rAgL${9bgcIoq$~k6~aw4ihEv4`HsIAA*B;t}SbbXe;LFU$}_Qx3@`7_#~yw zD1)~H{4_Zb=~1P@)!5ESYJK?b4#|b2lf6luf#iU87|}=$xG`-ZC+vef`9Waupq5$9 zjGFNi0KX<{_eyek-i7-ngB!w~%YN)E3#4eWUzFS*NM=($j!pKv5n&t6J)Go4D%86z8<1*h zoFe_nY})j6nw~tFo#auegsI<=zGs#idbkmuZrla4NA$Zve=PT!U!91Mm+uv^2kT3Hm^I@cs?zfN@m%od zkmtEM)9j7V$IuvvtciTAl{ctpysno#5!d8Y$_#aBF3raK6B)vwUJ*jt7sgJrhetL?coD6XDq2A zLX&cd!HcNtPmVY?P?1dGu#iNXa<2!MVTmb`iMGhZHu-6+2)fMJ8ge81(xwk}sMGmO zC}?$L95txCCa|?fqjKissfmuN_G-U4ua}EjqA&xGglJjBIBXJKB3e(DgkR=-qDe*S zw#(dZN2cE(oEX~Y{(D|hcsqxjC!Ckw@B>}0P(nj%S7U3xIJH!S2W6}gt{k0rl;sh4 v5ft82oAHH-j>m(M&4Je}k^$}#CuL(wW2!Y!=vavrcNEt)`rzL>KbQXiZB= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) + { + _43.Store(part_ix * 12 + 8, agg.element); + if (part_ix == 0u) + { + _43.Store(12, agg.element); + } + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + _43.Store(part_ix * 12 + 4, flag); + } + Monoid exclusive = _185; + if (part_ix != 0u) + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + sh_flag = _43.Load(look_back_ix * 12 + 4); + } + GroupMemoryBarrierWithGroupSync(); + DeviceMemoryBarrier(); + uint flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _223; + _223.element = _43.Load(look_back_ix * 12 + 12); + their_prefix.element = _223.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _245; + _245.element = _43.Load(look_back_ix * 12 + 8); + their_agg.element = _245.element; + Monoid param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid _267; + _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0); + m.element = _267.element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } + GroupMemoryBarrierWithGroupSync(); + flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.Store(part_ix * 12 + 12, inclusive_prefix.element); + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + _43.Store(part_ix * 12 + 4, 2u); + } + } + GroupMemoryBarrierWithGroupSync(); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.Store((ix + i_2) * 4 + 0, m_1.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix.msl b/tests/shader/gen/prefix.msl new file mode 100644 index 0000000..24bee60 --- /dev/null +++ b/tests/shader/gen/prefix.msl @@ -0,0 +1,264 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid +{ + uint element; +}; + +struct Monoid_1 +{ + uint element; +}; + +struct State +{ + uint flag; + Monoid_1 aggregate; + Monoid_1 prefix; +}; + +struct StateBuf +{ + uint part_counter; + State state[1]; +}; + +struct InBuf +{ + Monoid_1 inbuf[1]; +}; + +struct OutBuf +{ + Monoid_1 outbuf[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_part_ix; + threadgroup Monoid sh_scratch[512]; + threadgroup uint sh_flag; + threadgroup Monoid sh_prefix; + if (gl_LocalInvocationID.x == 0u) + { + uint _47 = atomic_fetch_add_explicit((volatile device atomic_uint*)&_43.part_counter, 1u, memory_order_relaxed); + sh_part_ix = _47; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint part_ix = sh_part_ix; + uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); + spvUnsafeArray local; + local[0].element = _67.inbuf[ix].element; + Monoid param_1; + for (uint i = 1u; i < 16u; i++) + { + Monoid param = local[i - 1u]; + param_1.element = _67.inbuf[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[15]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) + { + _43.state[part_ix].aggregate.element = agg.element; + if (part_ix == 0u) + { + _43.state[0].prefix.element = agg.element; + } + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + _43.state[part_ix].flag = flag; + } + Monoid exclusive = Monoid{ 0u }; + if (part_ix != 0u) + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + sh_flag = _43.state[look_back_ix].flag; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_device); + uint flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) + { + if (gl_LocalInvocationID.x == 511u) + { + their_prefix.element = _43.state[look_back_ix].prefix.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) + { + if (gl_LocalInvocationID.x == 511u) + { + their_agg.element = _43.state[look_back_ix].aggregate.element; + Monoid param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if (gl_LocalInvocationID.x == 511u) + { + m.element = _67.inbuf[(look_back_ix * 8192u) + their_ix].element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.state[part_ix].prefix.element = inclusive_prefix.element; + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + _43.state[part_ix].flag = 2u; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.outbuf[ix + i_2].element = m_1.element; + } +} + diff --git a/tests/shader/gen/prefix.spv b/tests/shader/gen/prefix.spv new file mode 100644 index 0000000000000000000000000000000000000000..8e7db4a4638e3f8ad517015323935706f323bdc1 GIT binary patch literal 9828 zcmZ{p2b5LS6^1X)48;a2RuBgZ7K(_yi->@rsMtlvVHjY9nSq%%h~22E#`I!(HARgf zU6W|UXiPJS>BS^!OjcsLNoR-+P~Z_Sxs$dmm%d7JH2-iebf; z#mM5?VMTpxS8Rc$D7Gqw*Y%tQixy1luPvQ+$f45>*tTe@#~Cx4m=<(1(%s(GqhlO0 z2D`THG;HAtgVMxYXmLM==3-PaXF=Qixo5U@cXs#oUEbE;(Yv~{t-rIoy{FdI(NA0p zQPtdX3NB9V?dk1WTGUsUJ33c&c6avFm@~TA8s5>{y`-zBv#q-xyI+5;Z=j<#b#Ug? zfrAb@`u{D=ZK4-N(T<3>LNBS~k@bBoU)8ptx1)X4yq>kFT32t+ypwPf+fwXvq#iz;(Bo%y#qb9&b~q4V7$5e%NpGGWStEAJp-Lh={p3T z{T&ZpWkbr?ndo^vgA2PCxudIRNxc~7&l)u0Z7SyLLtN*4H0!i?mwfRM-c-05_D^g} zVGO`1y}R$)WG*>;1h@N5yfHfuAZ_lZ9{86 zd`N6_apVx*QXB*K`6#a{oogx12haE@zL=g6ybsai=ok=yH=ahV& zPc@%>BDqlYeAfUxvt5xL*oFI;$ZY3}cVtcX(~3AI;2D7)v^CeyMy^a`f1^t5nJLcS z7Tt=t=H!x(20L2eW6;UDGu&FtZw>l)LpxXd(IxkeC7-e55r^@+qxESY%Pv~c+V@6# zF51l__gkdReN95^n^4|Uu{LFQ{NBHPkqON3PUzQW?gP;FOS`${-W6^7r}l4oWI{6*Ot4s?_Rk*^I1#Je5T~q z*pGHSZO)VXTxpxa$$C$HuH3U{tQ^Lj(r<2ucl$XkEogIij{bhI-f(|EgnOR;h6p!) zeuew2o!`LyU66SF%PQRY{tl@6Z)o5*HSn7&-1+M(-2DC)NIw1k4yf`C4cu>i{Lc4V zU*%6U@FyF%-}q|0-}fqip@I8-ulmO}aKG_Yzu)*OpV+|t##jA**Q?y`dX@VPuX4ZR zRql7Z%KeUq+t2n2w_kqCxE$Jg_m^)29o-_Q`zC^DbYAXtxKx@8s$KMd0+` ze9p1|7bE)8>$cK{+Suz$z>AUeT5jBtoUQA&BU>Y`Xm?Yn1S@6_3M z2e`WLPPn$ENX}JT^;|uJthWN}ddrdYqzkOw`ODC9<9+6Pka5U(BsKJw_RL=mcb;?B zp!<++#5i-xjS=^w&1HQxwEI(g@7AI(Lmc|oq4le?_T^x6%S82`r7lnybA0*eK~_ygFOR%Ij7fvwR;XXpylzs7OYL*O=x+1 zuLFC&`ff(cbFW_y_Fg+jpE1U2_YAXd_2$mu4fq=S@J4LfQ^T9U=FR@z40eCUxu)D0 z*YpnAYqj%-qu+wq$Gk7zTH2PPov%+@^yo=rkHr`>aQ_$~6oHH76Z*t?Z z-Uq<0m%2X));1oo?hm6K)~)YDh&JmM=S)5VuGaZccd3hq#CA^A51~?DG>~Z8@J$f*tNt-<^my_bIk_>(Q>2 zoSy<$d-rL$w)F0^V23&NeFo8HPH}Qh2fMHM)LG+mVAlx$JlHyB@+I+nzksCA#-@MT zjWuufp?1x?(03!r`9-j6++M!Jz692mecS_fxDS0_MzpyPaq@fxY@WP}?gMMHXRdoM z`o0oxK%3wC?nj$T?YsX0^n-{)|JTs^)z+Qm8huY()U{u|h|=j^`;)|RvX zHrQcqecwW~nOmIweFy9w();g%(|djPKC#~e>v!J|qvh@;dDV^Y!0KI2p6?UqeDgel zmN(8z><{o6lYRaWtj+u7v;1SU!+q-e5%PP)eTvh=pMV>C_)~1!lly02ZRz1Jzz%ck z`#GY`+~U;uOK^HP8~iIIJ=EuW!q`dZUnBbS-gvCEq3X4cK|hW>ir9O5B!3Js#|ZRq z5dHd|D0zIph2K^BoT%f@|}6@I3y6 zcGxq0e?+v|GjY!2&){*0&r5Ru1*|>i@mH|6oX6k64s+}K8=}qJ;^g@UxOyJ{gy%f; zEiGd`kAETha~{u>Hq^%RcozILlJk%om!3ag`t9d)V0nE12Io9p0Lyb8{{ed*&e50i z&~E>;H+A;*BG|p}dBzvXR!=QqqKxN+L^ZLUwX|2)a?R{-SMUbr zq`$ktwdL;U)0X&gU~TEgc(^vt%zjJ&JM4$P-4Si>QJj6Wg0n~eW+abqPjH@(iC}s5 zxEI(xI!B*-G*)|R+#Bq@N{#!#wdKtA1v{)!-y~!|#2Urs^jkC?v2VufGbU^74|a__ za|gh+rEmJQnQH^swX3p-x> zUg(>KXmh_}YkB~`T)RB)4YfSKYmCc#;~-)i-y1WqX-_Q&gSF+oaVXefE&2{Yw7Ir8 z_u~j~?uUOrl*e}zco{kDyMIfR=k6R0_U<@GpE3SDQhVkf3r=qTt}V}*9|xYn{KOs) zcfP-qW@A4AoNtP;w_?k?XC~Zuhdr8tJ`wJmynE!v+0*P>UH!e3{?5YQ*x!?|X-|J= zgSDlpWZ|2@n%uX=NL^AuuTC%?z% z!?oo(Jr(S*UVRG?ZRQo{98Lp!R-UJ4C%1;gsuO!UIOn?%EKg6*05|sZOl;ckDF4l{ z2&^qNo(*VD0Yr9JD;X zi@@$n-(s{pzBaJ&`YuMx^K4uK_StZbK4Z+I-Ckx-YJKhKywjF|-LKkz*DOVMA`a&+ zEB$I~8IFB9*c|zr)(Wt;qwv{Fc?V)HbBirXnFdu8f+iTqtBQwwDzn&06v76@U?L3$v5;eurb=R<~p!>@|)#ya1HTUGCuE# zE5OD&+=K6lE8)iHxsw}bZ|$@FQKz1(z}0%LhHJ~WNuM_7duGnJ&o36k`1}5_`u}(H zH;a96j6x2l5B77*3f>Cr+-=a->hG!T5M!)0^=x0-6SD){xz(7laP8^gj$r3$Kb`wv OJv$-VUt&$Mwf+whEVcar literal 0 HcmV?d00001 diff --git a/tests/shader/gen/prefix_atomic.dxil b/tests/shader/gen/prefix_atomic.dxil new file mode 100644 index 0000000000000000000000000000000000000000..45a7dd8759ebdd7b2ce08e53aeaa163ef913d810 GIT binary patch literal 4884 zcmeHLeOOahnm;$W++32Ia05Xv1n>sX@}<253Mlw&LXZZG6%>7-b+$Jk6j#s)t>Z`A zx%uGBi8fWL_%Sm0Q8nbo#Cv*$)|-R?j8 zJiCAFA9EhwbKmox_q^vl?|IMpolsO+Xt;R#8UInozIPt%=*oTJ{l!NR0DvkA015mS zKr%sE1E~QL3OgrZ2aW@{ZQ;B8d+oB7s~H!RGfzhRSLUyDG6-_ zpuEt)-_#HQ{1VD!HacVqA>>;(P zJe*o8)vjnax>U5Gxcyxaqb(daDqL?T4wUK3trFG1>KX_ za zRkVr(v!J!=S{k?xCgD4d>qKD1*>%0&g>T0%z8t=8?z}mDjnEa4@)0TnhnHwrANJijpq*(8A<3eMB9|s204K|RcZWU|h;V)hoyEIW}0|n~H z!YaI)#aTnpMCQHapFKQ;BHTPYn4{-<<+4BMin5uj*+}xax(#b{GW;u*e%^EqZ6H(D zi^tXgM$fd%`BrQ2l-&2BnbPhmda3fj=VD0BrtIYZ@GA}<@Os_elq;4H#%-# zd1rVT33e>+I<%(p=Inp|bj|0p)t|=2?_|!L{_s?0LbbW6XmDk_xy8^9q4dT^Uodsh# zY@Hipy%@`5CUvh*&FkX=zPO-1p0mc^&`0Ow{5oe}_6cq7nN)pQ3-R$s$4na*+#7Bz z)n#k^&BtXmC*jUSs$L6Bt(ge+2L{Iv+zbqkyoVZNv<0*KPV}8#)OY+uU#f3a2k~*s zf{0@of#G{Q7dQnI8%p_4ppDe#BYQ5{w=Hcxa`0fJQ$Jfp&1i`xx(4!^UIlWg50Y3|xYOn``cq~Zam$RY|8`hD@2yW{GZu#lQv zJtSTnFf4Y87Y}W5zG{(d$0b{hZ5E4Un?<~NOPhsmiy2ZXCS8g#CSn5AD0 z+J5MfZ0qtBjB^niXvO_9#h6ziFkln?7_Xv7(KnkP{L_%)XI8#STM5cXbD#gwn0ev9 zA3QI%o?Pq2PVpOe{iXkQZF6h{dim(^>2H5BPi)oC{s28AbX>l0pLxTHiCzh^zk0O! zx7XIv%pKc){c~&1{qZ3`&a`_q@^2mn8b8-(2@-@{f-9>;p`>r+EgPxU=&~i81*ZS zDuIC|aBXQ-{Fko&V{7^!e0~3?8Ugd*4Gshh^ufF6X$+i{{wD_Z!oL3z22zhPaNr3B zR{u8)R3<%zftmle7?=sys7-7SVPI4e#K5SeX$*8i3>+|#?Y=mKA`*c?6bbBjKfGi` zwjEjkbQj`~!i$Ujx0jC?jBg52b?Z`bkRb#u6T%>OXrawb$y_i*SC&22`b()XJ@ZkZ z@rdyFfIyf5bQR@z2njjI2UgDClX}>3@n5_h!B$OT^135CZ0%*BZ1aKaC7sNp^t0xu zZn>ezc(ph>OxX?Gjk|yA7Ve=)J1{el_QES~{7=&RwUVm#!u;CO2RL55yR zLW9^EM+5kh3Ki;pkuHRFLW>#i<^Ua%!{iIpAjnKZH%mZDIV6aQs;4or4wSN}n;JZf zOS^&6qef^CRt!ApfjfT=p~e%h z^~yhLK~`VT4Da0a@k<9^ioK!!!G3jofO@ej?#Rzi<>Y;+b#?FSekYnt>vp^4ddfte zS?32>HG`>J-y+Vl{xYnV!B&loOn$cJYGCljo%c|EGz5@eLl8KXdpZ@l;(*Yzz&Q-j zrr_TWgmrGzZSDLk zG8aByp&CfS!hww?cj^;g|JifZEs`enDRTa# zQjR`c;}&@(iHnK}0-!!#*^*mtziS_DRrhj4U5p~h^FsSv;^>qF3FAGZT$K%~lZ1!L zWB`eZpNXF26s>Z6E(Hum1oO+`OVXDiUV-}MYvDtax#Y^ww;&0|u-9NrF8_Nn{;%K$Vp`g0a+<~V+wnnA`4&)EAuvT@(NHtb{5GgL#WGYRvZmpgu-F1PFc~PY79F89hO0d`G>aDQQv&D z&m}tz_t0e!_VCKIGv~b0pEK30;P|d*KPa3}E(A3LJkXKEp_kdPRn!K&0f9kGPV1`e z;@wB>(LJ`+-1Ii{*645Osg^4a;Bi~4L=3edj~A4WW^~nhkNqOH$|e1Kq!_{y>f^MM zDyoHxD~=MmZQ*s{$E;GFH|UP_0751|dbkd$3-?;3eD8T)?E%j9#Mvi}hFh`XjeAG; z*9OmKd4^^kvk_^PZR$*jXsGxf65bq(wrvqIZLD=c0?(e&C-AgK!-?$Mpu|0!XZM=_ zXp0>Lt?Ixi?B8qsSDRUEOTH`Q&e=3^6IUhL=th~mBla-)tP|Eqe(ccYE38JHf=t@8 z7F7mAY?4w zy+mTwUP<&rgiw>|8bbGn{sLpzgPBCyj~2KLWs!!G&Z3e-{tPQcROr4Vu|Ff&<6}AM z8CdKX$naAxec5l>h!uJxKYb|Ymq&c5i$F@<1ID397n~35M3S)uSDh3E)5m%%@>^Gg zUw(^=H0!bu=U1&PSr;?pCW779#}e3Z|1xip4q?)0b}vt0X`q`>X}jj;*{xDw8gehR z-;;k$MhS<7;}S=ZG*^c@vt9^y`ZL9%Iif?9Mf8lNaL!O~I0ctYBXHUE77W1#gnLK0 sqh7+2Q|$Bok)6K1X6Yb%sZ6GiX^3gkK%t=ot@YMgI^6KzJ5S1g0DXc?y#N3J literal 0 HcmV?d00001 diff --git a/tests/shader/gen/prefix_atomic.hlsl b/tests/shader/gen/prefix_atomic.hlsl new file mode 100644 index 0000000..a75448f --- /dev/null +++ b/tests/shader/gen/prefix_atomic.hlsl @@ -0,0 +1,229 @@ +struct Monoid +{ + uint element; +}; + +struct State +{ + uint flag; + Monoid aggregate; + Monoid prefix; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Monoid _185 = { 0u }; + +globallycoherent RWByteAddressBuffer _43 : register(u2); +ByteAddressBuffer _67 : register(t0); +RWByteAddressBuffer _372 : register(u1); + +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint sh_part_ix; +groupshared Monoid sh_scratch[512]; +groupshared uint sh_flag; +groupshared Monoid sh_prefix; + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() +{ + if (gl_LocalInvocationID.x == 0u) + { + uint _47; + _43.InterlockedAdd(0, 1u, _47); + sh_part_ix = _47; + } + GroupMemoryBarrierWithGroupSync(); + uint part_ix = sh_part_ix; + uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); + Monoid _71; + _71.element = _67.Load(ix * 4 + 0); + Monoid local[16]; + local[0].element = _71.element; + Monoid param_1; + for (uint i = 1u; i < 16u; i++) + { + Monoid param = local[i - 1u]; + Monoid _94; + _94.element = _67.Load((ix + i) * 4 + 0); + param_1.element = _94.element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[15]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) + { + _43.Store(part_ix * 12 + 8, agg.element); + if (part_ix == 0u) + { + _43.Store(12, agg.element); + } + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + uint _383; + _43.InterlockedExchange(part_ix * 12 + 4, flag, _383); + } + Monoid exclusive = _185; + if (part_ix != 0u) + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + uint _208; + _43.InterlockedAdd(look_back_ix * 12 + 4, 0, _208); + sh_flag = _208; + } + GroupMemoryBarrierWithGroupSync(); + DeviceMemoryBarrier(); + uint flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _223; + _223.element = _43.Load(look_back_ix * 12 + 12); + their_prefix.element = _223.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _245; + _245.element = _43.Load(look_back_ix * 12 + 8); + their_agg.element = _245.element; + Monoid param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid _267; + _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0); + m.element = _267.element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } + GroupMemoryBarrierWithGroupSync(); + flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.Store(part_ix * 12 + 12, inclusive_prefix.element); + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + uint _384; + _43.InterlockedExchange(part_ix * 12 + 4, 2u, _384); + } + } + GroupMemoryBarrierWithGroupSync(); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.Store((ix + i_2) * 4 + 0, m_1.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix_atomic.msl b/tests/shader/gen/prefix_atomic.msl new file mode 100644 index 0000000..910e842 --- /dev/null +++ b/tests/shader/gen/prefix_atomic.msl @@ -0,0 +1,265 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid +{ + uint element; +}; + +struct Monoid_1 +{ + uint element; +}; + +struct State +{ + uint flag; + Monoid_1 aggregate; + Monoid_1 prefix; +}; + +struct StateBuf +{ + uint part_counter; + State state[1]; +}; + +struct InBuf +{ + Monoid_1 inbuf[1]; +}; + +struct OutBuf +{ + Monoid_1 outbuf[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_part_ix; + threadgroup Monoid sh_scratch[512]; + threadgroup uint sh_flag; + threadgroup Monoid sh_prefix; + if (gl_LocalInvocationID.x == 0u) + { + uint _47 = atomic_fetch_add_explicit((volatile device atomic_uint*)&_43.part_counter, 1u, memory_order_relaxed); + sh_part_ix = _47; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint part_ix = sh_part_ix; + uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); + spvUnsafeArray local; + local[0].element = _67.inbuf[ix].element; + Monoid param_1; + for (uint i = 1u; i < 16u; i++) + { + Monoid param = local[i - 1u]; + param_1.element = _67.inbuf[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[15]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) + { + _43.state[part_ix].aggregate.element = agg.element; + if (part_ix == 0u) + { + _43.state[0].prefix.element = agg.element; + } + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, flag, memory_order_relaxed); + } + Monoid exclusive = Monoid{ 0u }; + if (part_ix != 0u) + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + uint _208 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed); + sh_flag = _208; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_device); + uint flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) + { + if (gl_LocalInvocationID.x == 511u) + { + their_prefix.element = _43.state[look_back_ix].prefix.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) + { + if (gl_LocalInvocationID.x == 511u) + { + their_agg.element = _43.state[look_back_ix].aggregate.element; + Monoid param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if (gl_LocalInvocationID.x == 511u) + { + m.element = _67.inbuf[(look_back_ix * 8192u) + their_ix].element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.state[part_ix].prefix.element = inclusive_prefix.element; + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, 2u, memory_order_relaxed); + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.outbuf[ix + i_2].element = m_1.element; + } +} + diff --git a/tests/shader/gen/prefix_atomic.spv b/tests/shader/gen/prefix_atomic.spv new file mode 100644 index 0000000000000000000000000000000000000000..d7dac5bcca704dbeee8902b63dc1a54d731f3c7d GIT binary patch literal 9852 zcmZ{p33yf25yx+MSwK)g!41Sj6c-i&5f|JdA|NO#?xOLLyo5-S2gwWKPSvi~?u*@5 zyQsBv*S1Fnsti3DPJ>!7ZMM{`H5{vjG@*8 z?fu0{`#R?r67Ngox)EHitg9Z<+&9Csxo^qwt?;O~`R|68Hn;Z-4s>s_FAY@jK6FpH zyuNL1v3-5D!`d5?A?z&G-9K!rPaeU>{;DZm|55mk{Ef+D!*il@wHiE$J-g3sBX~pd z%KgU}}@TO!Tyt^;0OWVl3A2lMjJ~?It zZ%B@V`+THVmC6;8RoME=SAgr2^T64jtr;=DKDl%RFC;~Ht>1;D8ysg`KO(L^>CN$e z_^QE5^^$r=<&N@TrFu!-S0%MAWnaF}d=IFX5g*U+bhtxXJhOS5X|(wci#VTGZKDzM zAA)W~d}l3#`}{g4BU!#zmDiqE{X5pG&2g>l9C>z5b}vggqw5q9_u%t>0OAPV0(P(R zlhE$5c8>bt=X!5!o~sq@h&`EKTjXC1&gPe!-(miZb=mwodXmj&pCX_A$>x){kPB7K zcOAfZ`qxD=nOFOp5K9;{4sv&4_ExO!;`Q zqZvK{9Xa=cyBG7j2mSk^ovVFO%DrQe&)BJm!}$Hs`m|5t6wPSu2cqqZcJs*n7HRWb zGtl~`rT0{_wQh28Z*2T`n8$+Ftqj3Zf?1EMVtOvDRz!EP|rc?GTa{x#)p3% zcr*dQ7s5$4%#VdyyQ$FwYhKk;?$;pNs65t zz3TwiWq23Ze#Sm82bd&cR<#EV~*dPr`SklcAj^kx+;4sOo$t3k%b(2gZ8`2Y zJ{#}%J&uN3`|Mj!Yq%0^ zKh)2%<{9W!h(rIGX#MKg`&nRfY-O!x^x23u&u_2grz7Tat~lnd20J(SIVry(Jf7((?wwGWIpG4moG-NTLa&B@~HnM;Hba(oMZhjMf63j z+fo~9xYn0}*C0`AxpBwPTh}cj;}F+0F7~(<>>jO&dy*S#-R*(3Q^&d6!P#?{;MzKn z=&QDDU+qDxw+`%jT}afV8?4>=ooKo7KJ$IZ6l5wAdnl*&n7;w;Jm+jg_anWCapsg8 zBOXAT%l%c*o=@$)+l0OXap>QS)~}AWuLPST?%ma3ZC4@QJNY1DF6WB#?}a$qHTd#p zyB3@FSnoQp>phs>&FjJ5OYL`IYesKDoTtxs-VJDn``Lni1)|OUh-3Ym!1?uGiA{T~ z|0-~H{a3?XUwb?+uK_zxU-aO$V0)l1`t&-mcKdK6S{}aFgSF|q87&Xr8^HEg-z{i) z-0L@jz1PmsXNXVo9<;-Ipzqy?Huolu zy}cJaKjls6_aU{jHdVh1?}yJs+^ezbco%&DY`nw0&O(0>an5+ev&oH%^*#i4z1a7M z!P=%G?)#%?hx^v|5k#B&7DrD$2F~vD%K0g6A-Vyu!B-lBzPq}gSAl6gM zqi*Ua*rU(D?MU?J4)7GjGsHRX1Z$6TehRED`txb9!*lAp3(@8|#nx^s+O;C*XTaIo zeHN}QYWI1t!<_m)hiEgWIC9Pfd#>=QV~sC>U89vRhyDB_5_Q#Qy(0EYVE4F<-W%6} zxEJl_kMpQq`)>3-NX-8-*tPCR-(_C`Ym4*T3wC%OeP2bic^+}(`5M?f@lLuQtj*fF z?tSR{Q#^z=zx#UtZ7#LnfCteJArAfDKZxCJ+Nnpo_rr1J<(@Rp5jf@i2ebhKYHQ$A4%<=De|lH@5Jog zjC7eJo>R6 zERTNt7i>SAqc8fQ-TKGb)N!^K!JaMNOD}=7c_!!U(@oogS*g7rlo$APsuJfF|iuJ8sV`Y1Op*53_0L|mM8JY1W1)Vngof@5An3*RcoQ3%NCSzOl}U*va6yJO2Ge9=^T6k;lKq$m8zp4fgIhM_;U^ zJ-e1`#`*RE4>2d|yDwZ@+#P+|B7O>3ThwDJT$??!9@D@M>!EKyM4M+6$2pq8aYp}! zBoE&K;CMb-!16fbbg*Z1jy}(5toGRBfne`d?C~JDw&>ZxV269uHv>5YagSni`YoD^ zSU2PK853(93U-Zn<_?2vi@NF4X09QyYiDzrQy#TZ%i}pw@9g_G6JPdQDc@P_EPRi$ zcGO}vT$^XJUx$Mo)TSi;a>DDK(x8GIPS;M;J6?DJy9OMW5J!|ufa^tLNoLim! zy%hCbj6Gl9Q?O}|`Yr)$i~24FJFKt1QxR>}R~+YD2KJosJ8(HX&ZW=&`deJP_uac4 z&r0M|dyaeYnP0uLJ9-+it{1=0SHQK!^Ljei;lA~)M6{V#9Q`;0Y|regJ(RnLh*d}I znc(R2DzH3idloof+q1E0zcc+e#A>j%*yFiihkMj_4x-IHiX->=;C${2uxXEX$%SBT z{(f@aM)XDSUbMbA!^L3jp7%VoJbagcJ(s>UXnFYBz{cyl6fKWuT!E)<2np}P06Vc9z_%c|#=jcVtqYfLu*1i>`Pw~KXfj6sf~4%Ty*3?2)1?yhL}>hG$Fh%xRp n_Op9xkC;8+&dtV5f@_Z&P6j(q`LW{Uz2EyVsWiAhx-6 literal 0 HcmV?d00001 diff --git a/tests/shader/gen/prefix_reduce.dxil b/tests/shader/gen/prefix_reduce.dxil new file mode 100644 index 0000000000000000000000000000000000000000..0ee28e84d3b111e42ef0fbab465a4bf7a9a73aa0 GIT binary patch literal 3764 zcmeHKeNa=`6~8a<<-L~>!viuJ46=`}Osb4V&;Wwle2E}1hJ{(Mvf~R72G#(9AoAg~ zA;BbKbc28*)ZL)yX1g;jT6Qbe?SzlDsMtZr*(#!iE@Iblce1W)cd9e(eUa=q&b0q@ zITxoiLh!sLD~fgTrx5mJ6LKFZQ-bogs2CpBvC3U&fzD$;mmT2XicW` zG>A-6FJuYbov{--Reqam-gdtO`%xyI4-dgUo5VH89+yyZyvyuo z+obJ2ITVMWvO2 z^6gz`oZ-@+?N2er4(pv{_Kf!=9V-H3X3PU3MD;#L1LIIsI+`m{C@=e!PFOuHY=I*P zZIKi89>Po$)ox*3P-qGYbqv8eOygg?BdU4uUBIA{;c)C}{~Q_6(c~R2FLxG*k5V zWhj1}ssylP0wiBJU`F*SZ);H4DygtVqQkHX@!XhyGI1R4gdRjqZD!sAB z3KAuDG)79UxHNwdudo8SBwW3NEM-VWGo4GFb4?VvrmkTFBZ|ZcG;G|b&Bu)7;Ij5) z#D@)#3LwG9&MB}z+Vx&}c(uN&CIY+GM}un^Vp0h1%8j)1&^HQE53YM{K%O=-baCB< zUyppe?rg?|b0a*C-_Psn3c331WP1PZXQgrdr}}9?g`S0u!%1+N3JdEBF-tUS`pY^O z6|Yk8r1B#bA6+j- zlVoUeDw?FUZP3{^GB$-7cMq1_p51w3w)pmJH}eav{ek)zQM7zv z@oDCYOg2SIg6|>B(LHkg@_p4Or$??|%j5lSElgBayQikXP;V~aqz3LV58h!iX^EPc zkE#$Mo1L1}_HmcmLNqrK=H5ENA_)DG5KHna|3QR$rQSIQnT5G zZk}p#HJFH2l4#Cym`p^A32kh0m}p1jRG9FQTR1Ndt&xX*?ic!X!enY<)xDUgTQRG~ z^wAaa1L+rS>C>9DVKn2SEu$SxpYn?u8KSk4NU3zx!_nd}xg92kXp|9+-*I8uy%xtV z(orXN)b~1?d)pGf@(aJ36MpLxK9ZxQlLJu;15s0faaJF7B_?X3ZPgV${+U~oHmOPL z4UFf!2Mxi$SJnm#n{nv5a#nb3O5VK$-cSr~_D(y`Y|wDgWaDGV03ZRY#Knj05?lkwO<|lh!KkE3RYy*{Q!#}+Cn`bAaXq#%q6u)o&@rmDu zT4ZYHq|h6XVFx;S1HoY*(|GMy_eTHN(_wYqx_p3ymSi_;+2tf*S$QItk6|a-kd#&r4ctKgJf-KRJ z+?+dJRaBB4fB#6T{Zit+3>Is&5Tsb_xenXt%h;0$?Lq9B{4c?tM0n1Bi9HQJ80?Wd zfuMlBmLDPp$yf~fp8oR~#D07Ha|}{mP^eTy2^gymi=P?0F)Jl!l2xFO$C)#V zhxE6U)Mq`_GB3Y(c(u{kSiQEY+)&d{VD<1}uYT+JY{^Xjoo-7d+vQhWs(_6?LC!Yl zse%2SJ(o9r(%Ex1kLON;4Z7Z7uH92#W`G^~17YO5Imaa@N}_M>?33J`?N*}y@7e#u zJzGx$up8EP1hzGi{SUy0j8y>sdQbeeUnGK2urhP)NPNS+jiqitllWn=*7`snhvRU8=|!F#5{m(8ZPVkFxDecRA{+}X)*4J< zh@y`I8{0<@i)py7YR6~7jHh^r4ng)5rc&2o~wFCML!>{R0C0%GVf2poW$EE?ru5UXA7n~CVsS(Vc#6S)4!DAqn_XcVElSN z*ZjhHXw1X+Q202mr3Jo3m0X8V$_;`-5RYibooxs_0qxZhdv$|U3D*F|m#Jj#(L|dZ zTO$}^_#7@%X!0eF!YGm>as`f$Jm&A4BgXOeaDgS}LtvY?&xQJ!qiv*h-mYM{@?L|? s#Z35sk`lOn; +#include + +using namespace metal; + +struct Monoid +{ + uint element; +}; + +struct Monoid_1 +{ + uint element; +}; + +struct InBuf +{ + Monoid_1 inbuf[1]; +}; + +struct OutBuf +{ + Monoid_1 outbuf[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(const device InBuf& _40 [[buffer(0)]], device OutBuf& _127 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + Monoid agg; + agg.element = _40.inbuf[ix].element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) + { + Monoid param = agg; + param_1.element = _40.inbuf[ix + i].element; + agg = combine_monoid(param, param_1); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Monoid param_2 = agg; + Monoid param_3 = other; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _127.outbuf[gl_WorkGroupID.x].element = agg.element; + } +} + diff --git a/tests/shader/gen/prefix_reduce.spv b/tests/shader/gen/prefix_reduce.spv new file mode 100644 index 0000000000000000000000000000000000000000..b2e35fc79906afa27664fd65a1ed6084c7d96a0d GIT binary patch literal 3472 zcmZ{l`F0dl5Qkf`5QvgQz!fkdicv@;%4%2y(IgrSh>CldEF>ed#K{ceN>oIA760-X zJU)m|;dYK5zpuN?wBzw|imG32x2kSUr(@Ib<|Mf**_!ku-zC`@NH(EKlHQ~{<&!g) zX2#Z2ta7Fub(9jBjW1 zC~~o0v1g;+&SW2Wx^}j86>d*rPq{Xq>a1lXQ>Ar1r42Iw3HWNMQL5(n)W2|7@)CG$ zWo~VuQED!%WcB+7)IUSr?d7NWFJh1Uj5(<{SC$%Z@2kLz_xZ3z{5g2G)l8eQ7qO?_ zYNpND%gs!MwL4iyrk2a~#{X~v^qsXW4Zpfqm9=JC3-X|Y|tjGF&*DR(ZeNVCaeQ@8` z9FN28U!8Y~M{=F^Cv(}@xWf;U_wFF_Xs);S7jk|0O@d9ni#$_^_O8yn)9`G!vU|n; zp43I}?uhs3Kbzak5&7Rl_8kJhjchN0-$Cvn!ESnXzoLKlNNnCO;Kr!)Ua`BRF6vlE zjyi7VHs^YP>H1c8Ap3GI(h>H+g3Zu=y6-u-}%V-EWKd3{|;Dk3jC4`*o6<>~9cKe+wV+UT6>+f=>h z!P~vr4~;ezYz{)dGt+Unvz7ls{%7Ea zAdmLLaP9J_^9Zsze&Bxh_ZXzkzWyMG_-V*o#wthbab$Z8{4BD$*7IH_ko8fok25|A z*^9ONj^dcnY2=uZvBo%~=OAq{qcgb<_RoyYBD)7MBe8zRnZNfw5Bc}nJN5H^>rn4( zoRj^@qn~rg#eQA{tMj)VvsYJ~y>p5^nnCt=Tim1bVD&MNSCFm4m|6G*$ltzx-XYdU z*}XQG_q__YUb%1m8vJ$0qx}uIc6s!38QC21)^8!J^V{KDi!VXuGFJI#^nObuzai1v zyU6Xmy@yVH-1k1R_x+Ub!yK}^sy=q)3bJw9qOKD16=)LrmA)6?9&6M#52>?8<)~>1 zxxJ>V=+s9|%gDu=R>0PzK58l>8>cPa>47x>wFx zZhSYq1v$r<+4Wo(zYT5bio5Dw#an%Z++O1ibn3s)zeOJ-t8?ewgHPce_dwexkUDEq zj@(}&uS3Q~y3&$qxgp*~38_||-btgpxXx8dJH`s?eg#QNE5)Fc0s zzm&%y{|0Ub`>viX&{5)g;9CoFFS5Pd1NW|e_yDAjcSSGv=KAp24mP&vvjeO?-e)JW Taq2HHKkK;yZ5+=k+#4|aP&VNsC{gdz`T z=O%b6wTk||BKqg`sdTW9M`_go$wr?b2N z?9S|C_M7?6`ObHL=R4ndwgU*G!^1OULq06@Sx7pf6z zDb#AH7ok${LnhRc{6dC=?FA_;dGP*)HeWyVowliDGs8kZv(Vu?^*2z2cl!hY@VP9g z9NK(NcZ+zwf!*aS*{IvN3I~7%La-lp&U>WL$OoYyMhX3Tga4{Z0)QR*#5NkhVL%*I z!kW^l8^#K8O1=63Vwa=}NktO2ubh*u6X*|!^~Wf+ESr?C7pWh16?kN{A>Z{JP^k6O z1yWCE@{mbaY;}iQKW@jC<>SRLSw6P}_Y^-ZLM!NxcFgc5>IdWa*4w;^c(@&??hbE) z0ssnOAC%9<#_DM!>4-uQu$KlP75dc41yS_2Hteb7V|G}4WbgCH78%>!V*4>8watRD z9u8wEl{_C5F|IBIXpc)!ptoS;Z(1DhJUig@+@Y!*ZQ8)$6tX?Q3A~-JZy&r9)k2l7Tn;_VTwfBU$sI5>TB#<9VkC5crPCTx=B*#+8$4j_Yw zI9Z+FS(#D72$bnxwnNq|LAdG0hB7=OrvOvCAW^T`(FKj9Ql90b}^}yI>7k zqM0S?$M6V6G|VAcDU2|OjKEJ`0%CSeNnxD%2(z9wG52~@DeLk;gL=l}oZV2E9{`-{ z7YL@EHDO^$LP>6+;m9SaZ-J5~B@N)om=*c0JS~97V^)~y9uCm+4O)yvLsOm9EmOoA zJ?^Nq2hU&zK`J9KNh*p-q06LAE#~M7(=f8`VahJHUn0R31rV(Spb746J?y&#HG1*| zad{8Ow~8$}=*!EW>RsPrL3?U_h^|kZ)7#RSQJx_>obi{#K5Ntc>n-A~>p!5dI1KdA z;4EH7XOWn$th&Yq(i0r0Dvf;qt?<75N*lCQZ%$l26P$Q+ z=+?E7NE*o*(v3e_k@?4cH+N;aZ|>V)-2unWwa+21osV2v0D__W=7677zch>Ma6Z!g zbm?O~lNjQ1xz4uv_V*n;_i}-y_h@e~4HoTPr8(t`5-_oS%R^t6w}?}qQxMaQ`p8wiUONLkpqfSRx`p<*tac%T_T=XT@j6TpGJX868yKF#JVO=@FCXV^FPap0iP=X>Kq+XWcn+=i(BH84g+O?_RH>1TX0CAJe{7--`0 zXA*wgGdbDi=g$JauNz>!lH`;_VP&NZS?e-n+e2I7OmhM_f1jO8=q+fEyrk#q>Sm_OqG|a@* zFfkv^NM}samDEt;Rh4o=l^85bs$}-&oNmn-Gpv3E%{|?kYe#cNXJid5(NaZds+@JO zjYvs>1R{YKl!CM7_@x2PdVME9GxvE z%akK3<&ZUTqzr%0V^}?GSlvBa4u-P34B0L;rz`Y*IOrW)&P6n5Amkoa5qr#N9p$u` zQGPD0(>(0l-A^=k2Xd#mm>ODo%OnjCNO=R)(f}mGwG)mBe{T75_YV#^uWwyNWw+ul zoO|u-XVOrs?w(QdaQN0UZ^v-yde^X|f7b3lLLV7NaCm3s^gG_!o%BO*yUjImb}tD% z%RRj32^YyG>%EG_LHros<=GAwz-wnZFK1b+2lssP`L9!wIh}F{cPl^ryz}y_z=n7h ze*6Wxk^=s2<7NEkH{68j)umTlb%_^y$Jg*Yat9I+&nw@+EAnn$nTGxaUitHX39sA- z=lBnKrSaduD~k|Q8Y5&}@CN=*<|C zPa&-TSzduR{cm-0iD;`O0Ux5%$Sry7EU833_* zX~kC{dmJOSNW=FSq|So@bT? z_;=<1@9O`ZySj`9pcn3G99ERczw1R%u}V7$NB!RLr;1Bk|rm>sz zkXjlyY4M{`h{=lN#1t#JzyK0UqNaBhrNR$81DUTNOXdO{{9Ry^ z7SEZ>r+Dcga*sbyr3V{XnA7|4T|Ar9MGdh!M^K4 zrn@5B?R~nKi^uD4w{ZfdGOEKW(lU8RyHmAz1{lv?vh9;H*%_4=xJ4v$M5Z-azc=a9 wp>XSrpZZ15Q@4Xt;y0(nFR=M*{Oq7=m=9Ut0cbfqC;b1y$MJh_!|Gf4D+hS`{r~^~ literal 0 HcmV?d00001 diff --git a/tests/shader/gen/prefix_root.hlsl b/tests/shader/gen/prefix_root.hlsl new file mode 100644 index 0000000..adf6bf8 --- /dev/null +++ b/tests/shader/gen/prefix_root.hlsl @@ -0,0 +1,80 @@ +struct Monoid +{ + uint element; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Monoid _131 = { 0u }; + +RWByteAddressBuffer _42 : register(u0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Monoid sh_scratch[512]; + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Monoid _46; + _46.element = _42.Load(ix * 4 + 0); + Monoid local[8]; + local[0].element = _46.element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) + { + Monoid param = local[i - 1u]; + Monoid _71; + _71.element = _42.Load((ix + i) * 4 + 0); + param_1.element = _71.element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Monoid row = _131; + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_4 = row; + Monoid param_5 = local[i_2]; + Monoid m = combine_monoid(param_4, param_5); + _42.Store((ix + i_2) * 4 + 0, m.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix_root.msl b/tests/shader/gen/prefix_root.msl new file mode 100644 index 0000000..897a6a4 --- /dev/null +++ b/tests/shader/gen/prefix_root.msl @@ -0,0 +1,112 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid +{ + uint element; +}; + +struct Monoid_1 +{ + uint element; +}; + +struct DataBuf +{ + Monoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(device DataBuf& _42 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].element = _42.data[ix].element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) + { + Monoid param = local[i - 1u]; + param_1.element = _42.data[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Monoid row = Monoid{ 0u }; + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_4 = row; + Monoid param_5 = local[i_2]; + Monoid m = combine_monoid(param_4, param_5); + _42.data[ix + i_2].element = m.element; + } +} + diff --git a/tests/shader/gen/prefix_root.spv b/tests/shader/gen/prefix_root.spv new file mode 100644 index 0000000000000000000000000000000000000000..3e0422409cf10798205ac282a424700613c33a0e GIT binary patch literal 4072 zcmZ{miE>m`5QcA<1i~T#0WB0TVV6x%77;;+Xh4I2D7c5oLK2sM)%RL4D zgC;*gnuy5nl~Yi%t5U8E_9mRrlyXc-U#V}XuUy5_l1zd3RED|-%Y9u#Y3$OG>hNe! zb!Bbm%F#7zHvQMa-U4?`k|H9WjPB0xZ1mtU?ak>O`b%9MrAl|Pw6lDq(o?JsR?0iK zr?HL6B6JC3DdUUL`I_afOEotpOX1s#)nfbTq4ZvIS1&fT^(+&x#5bY`Q{0%eqYoE{ zi$fW|<1$`IcEgMP{p@Q>_Jcb=OIAJ&HeaRlt?%K;JOh z_c)5q=RSTJFC=5|;mXnUoYUaiIfdj5IO;un*`CJaT#k2;!`AR--AmC+561^Iy!kb1;Jm9fxk?9XwAwi!;sNin!+C zaaL=#My4i7xwbayo0xI!iuDb=62u(WqwSA2Ym(c0 zZTeSa*u7E1O%W_Pg&SV!u7_0@_&nwSs?rql<|3d0%Bcb771% zxqR2(vG^NmjlRG0=h^awXn$*h>-TpdxbN6r$Ng={`~7VQ?tZ`Z;O_T(&vU=`JokIg z^RXQFJiqz8-|s%p{r2R-vZ9=!o^>AxARUmdk?1bfDhfc`P1UtjgYq@b7d0Xe`F2uk8<}@y9yc=wd_Qaax#@ct^z}~6j-ksq5 zz3+i*^EW8oRa^dDeS^ri7i_*gh-X^cy@+=A@6NdK&iMn#0VF@?4`%k*{}9}L?(z5O z5oAANoM+075&Io`mi0Y`_I_&T?jX8|xb%0U^{eClJz&p>x$6UK>qVS9c^7gJaj*Cn z?1#`wKN5Ev0N3Ab5S#YM_c+*mKWB6H1laFJ`w!Th{Sx9neZO(XGTLQ5`i2l~)+3Jm zhr#vvpTwp;@(+Xa`A6X9*B;+X73@BJ@dl${-#}ly(-E+C-=Tt*hwm6zo4%uHdH9|J z`@Z^)qvbKzPlKl*?$KwAvD$sZxVQTM-r*U1^?i62oA#*TIk4x({hkMVKjX|PH^!XK zkiAyBe?0mGVjtu8;)To>-;F+P`MEp@udmr$W7tn3@s6j!_4U7qP5bxRjJyQaHWzV5 zUO~H@5q&Qs+N@g~XTJ)rKl?Ro+T;CS2WxXKqCand?T^0b;hSLXaR<5gjl61or_t7_ z?=7@7sr^RZLBET*^uLGJuZ~{75B7}sMn3>+YeT%R{B6Xu+$)Z~AA;Q*HGc%Q-_~^& zE%%Pi=#P;_hN)z1IfK?7`9A~C&hgK|))RAc9&C*E$oU1> zbK>vFm*8bc3vwQbzZ+kHjdgj43+S&Amgj(C$aRXw(0tR9c#HpJA zNwlFCDzpgp8jyPFm3sK6DxP){NIgWnQm?JFwGOn@+79)o=k+?RGk0&JI8Hk^b2E20 zb8~ZV=6mmZzxTawzxTbj@3))1L2qCSQlyuWccNa)96KL&XzJ2&deN(}DO)bPVGZTK){tXu4{g?m%J{De} z3TP-86u!*hVRyQrSnsN#W(J?J=fQ$PU7-sHfCWO(4IN}}Se?r|f=dM!i0cjht4b1p z{SXVYGofcj!7-67u0=P5t;VU8WiKF&WP*^0j>VoU;$$lY`WM3VJyeP;gP_++Qr5Y0 zJTk^$biE3qQ}mNLQcsI2Xwv1|+!GC-@5L4v@qD-lJ~p{~oL{a;EAWLorg;_hgG+ev zcf5*txUHz}Hm`yl09L~}D4Y{zXZ1`j;SeJT*uwyj1hEuVj+oipgzZ!HG{Wk$#x9R+ zR_TD{}KD}7GSZK~AKqz!b$5qkri zz}xb>vQxLk^~Cv2;<5zw{QinU_>e>)MuNR7qd7k7d#`Ez=1a2tfyr%S*`}(No!ePwTB)DZ%xvQ zQRz-tgNU6PQMFfOWkeMoY1NEUJEK&wBHmyW|MEVw8m(w2E80GaU!z1dcPv>c^l-=W z8XWS9piRZN6#BXAA%rZCi1#StR%U~mjA@TEw8?N)0C37*BA7O$!pabf65LG3k%d#C z6UPH&9mU)`hV;!0Er3TRR5X)&xX>Zxd+bmuf@niH0NOeH`w#&oCAM7_9hvKKi`A>n zVFp2hMqnaK@(H2Kq)o`@m=crZ^B7Z7iTsVGU(k@S5>zpZZZT5DMG+7Br#3S;&& zU>G+uX#}P-msi?BvfP0x)x@L5PZv7WTkJq1pQV4yWo8N1Fq%uHG!M>g9>KZ-X~{t} zGU(6F$I6MG@Wxcck984gK#mU_O~Wob%s$Pm3UgUy6!vuo11@5SC5`+^@V@Q9tm~dUd)E#H)&`&6UNaCDtg6Eo!-zwkjQ>iL=Tow?o9M z(m(3>>)YSXF`jEYO#bQV=Wpa4cCUQ0L;lGX4CzbTU}o@QEndRnPx-ra^L&AMeLDH6 zfZUHn2Lp`AYHDh}c=%#MbMHuV{wW`ui6I6rugjgZrIIaEjoBVv7t};WA`24!ZT6 zsIFFIZLcD)frzgu=`Bv0O6p?=0x^H+Qf$!(LuL)~5p>0nVTBuAF;eT^ZV}ZJqPiTX z#Uk2iL2GKA7RDJp5-FYXNGCKAm70jprlr#+X(|QjRmt%eL2n%O$I z2tM?-J@Y*@vv1Hnq!jJ4pw*PqYC-w6uujX6bB9l~vptYC$wgE$(wipfM4yybKotXk z8s<(oDty*@Ysb%DbbeX9gvw~ZU%zjn#b# z!Qs7U@HsB*i2a3kuiZ6veh&eWR1c4AauKXb?~R@_fOYdz(pAo#!N97@kBJ_>`^xh! zfLBeoe0;)Iernej-~2jG#px75+^zWP8wdhB;<@uO!`O0k1vxU<^5wu8OKHG^Zdp0 z+>iNd4U2i{;ciMo*^WnotwxZukQ&|su6H0_mOtn`XYqH~zsgx^!hcjplagYq91k*T z;&L|o6TT|5k80&}K7EWRVi`h^hP{gqv;fv_!s0&~xO7ELJibH+I>dtUa=Onvrlme; zt4Q=o+LuTnUF^F0(UnW%SFghhS$l4DV3N*z_07PMHXC`Ce502$iG{)n;*8k-Z8?29 zf=nYf(pD^6Nwu|XTV;WuTO9qX=bofHpWJ_? z)$cPnzDRM-F|Jo%zGYm0E-gVj?3w5vYP)#${qxOj7xTnD%i-LT%I(#4rfocb6$rv- za`*rH|9|iPH!}bnf$v2WmYdA~fAH@XHUe7pO1w2YQdX)x-=@>awrRh_Wk#E9O+fb0 zves?dg!i-yaL+nH9JB@?4i^JEJKnB~ciCvm-QEFPyc>z$9*5&_jO|0R@w;j-Qi!KU z$~|tq`2F|&pSTbK$I#j3{uo<&2LZC@k%Z!wDEP)jXi6)y25g zm{y2SRa`eBGDo;eOLN~(k0+Hj#4OeV3TsH5eUl^YE9g9elaON{Yw*`b?}pW8o4-+5 zJ>%@`zr!s#xgJ%PfLz$rpqyTsEd99FPMi#I2Ld&;WmJDCTAK{#wfGyN=k-fk!8J;kb z^8glLPm7*tZ_>u#&zMBd_=Krb2tF7d$lwCWgWVWSK~fI=GChGOg!Irsgw@{xPGUG% literal 0 HcmV?d00001 diff --git a/tests/shader/gen/prefix_scan.hlsl b/tests/shader/gen/prefix_scan.hlsl new file mode 100644 index 0000000..d9e74ea --- /dev/null +++ b/tests/shader/gen/prefix_scan.hlsl @@ -0,0 +1,92 @@ +struct Monoid +{ + uint element; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Monoid _131 = { 0u }; + +RWByteAddressBuffer _42 : register(u0); +ByteAddressBuffer _141 : register(t1); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Monoid sh_scratch[512]; + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Monoid _46; + _46.element = _42.Load(ix * 4 + 0); + Monoid local[8]; + local[0].element = _46.element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) + { + Monoid param = local[i - 1u]; + Monoid _71; + _71.element = _42.Load((ix + i) * 4 + 0); + param_1.element = _71.element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Monoid row = _131; + if (gl_WorkGroupID.x > 0u) + { + Monoid _146; + _146.element = _141.Load((gl_WorkGroupID.x - 1u) * 4 + 0); + row.element = _146.element; + } + if (gl_LocalInvocationID.x > 0u) + { + Monoid param_4 = row; + Monoid param_5 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_4, param_5); + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_6 = row; + Monoid param_7 = local[i_2]; + Monoid m = combine_monoid(param_6, param_7); + _42.Store((ix + i_2) * 4 + 0, m.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix_scan.msl b/tests/shader/gen/prefix_scan.msl new file mode 100644 index 0000000..5be4e65 --- /dev/null +++ b/tests/shader/gen/prefix_scan.msl @@ -0,0 +1,123 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid +{ + uint element; +}; + +struct Monoid_1 +{ + uint element; +}; + +struct DataBuf +{ + Monoid_1 data[1]; +}; + +struct ParentBuf +{ + Monoid_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(device DataBuf& _42 [[buffer(0)]], const device ParentBuf& _141 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].element = _42.data[ix].element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) + { + Monoid param = local[i - 1u]; + param_1.element = _42.data[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Monoid row = Monoid{ 0u }; + if (gl_WorkGroupID.x > 0u) + { + row.element = _141.parent[gl_WorkGroupID.x - 1u].element; + } + if (gl_LocalInvocationID.x > 0u) + { + Monoid param_4 = row; + Monoid param_5 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_4, param_5); + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_6 = row; + Monoid param_7 = local[i_2]; + Monoid m = combine_monoid(param_6, param_7); + _42.data[ix + i_2].element = m.element; + } +} + diff --git a/tests/shader/gen/prefix_scan.spv b/tests/shader/gen/prefix_scan.spv new file mode 100644 index 0000000000000000000000000000000000000000..6d8fe0af951822060532ad7091e8a50b3fd80f6f GIT binary patch literal 4720 zcmZ{m_jgoP5XT?v5+k4i!7iGBAR-{x6&n$aXwV=kHVm6W5>2wW*@(S_*n6+ou+!{a zfAIK+_*dA@(c|ay_6@wO$IF>B-|x)aJ9FpGK2n%GZ@(m&luS(;lb@2>nwd<-k|fiT zhE%t;uWMgAQterK%BjnYI4~)sfyOkkZwk7I43!3kbsUAPN5+vEoS|B%`Pr?aPQjRI<9qb+I9j;<&N~XiR%R^lQ!@ZqDY3#z0YGt&$ zx}>&q$>{RsEB ztJ2tFG9NvNaWLZx(D|C>u6;EZlZEhArD~~lbW?gSxvK}8+IkiVSmKk=11T;ht?13A zN@*zL7f#@XZZ{hcG-l~T34zjhzrg!{Bxa@}>(TpRF5t}?h>?eDFi zeUB~ZeD2#P@Io>MuavhC+mOCHdDfRJH@8*FqnmA*H9ba5N2$W|dHdSCYIAY6&q`Dw zd7@?l7n5x@-`+dFIe{0F?eJR7g=7afdb4XnTrt^`<87SljQZ~Ci^&vkH4pAG<~Vc% z;%pidZ?!L`klOq;81K8yMO?vKYBc5c-<(Hg{_%_-k?}{-`|=gjeql-uL(Ct&%IDV> z`R!Xizuf$j5bOC7ZGLBb7Qep9=oWUnUwkkz=5Iz^N8|B#ShE%RMIx+Hsz#eSvS7ttK=D7;(ENZt7xxaI5(JN=nyvARj`SR!YVDsG~-zKp4 z2;K*7o0`(SDDE>-XCk+<9oN<9iq zG4i=cGtz>bOIPiowunCgtZgOH&h-++S@$=r&p6+FDWcEZ)_Njh|BYLlKjD9wP5pgAf9P$S0LKme|g4@ch0Xt z)+6~jzc#bS{_EiGbC2J_8xX&h#(AdP81V+QXIbBkXz!&9oArnz|7LJ~{#&qVkNg#IKK}^Z{MzI9QU$wDU%bI6*f-D@?{q6z zyYEm&%fojYSew2rXnFYV0QwuI~g-N8F>&7-O~jhH-E8|GmRq`0D#`H#Y52 z!#!Znjr-jT_I}2hQ*MkoogsUzc7FrMdBSF z0FNVckjVESSo_XwMjirdGsYQt1nqK0^gWDdvu<&m{Uo^l?5D74kN1BXtj)QIZ`m{8 zCy*vYzjZA@Ka1##Z`pI14VC7cfqowGj_&ooa^vhx)ToXcUjWy?-;3C^$NgUdYm588 z0(N;0yuUckeidAQ_G{R*$NgUiYxDjwFK?v&TK(I=a?iXT{TAZvxG&=02ET=P z-#5|nnDKYO&ba63iy6>vPoi$MzU^r5sqbC1cT)R}dmsG);?n;iTE9AG=OeIZ#5e9^ zu(lS&e#zfMJj=b}*t-+#-niFpuy1KyyU=p`(1`v7nUA<9;y=yo#_s{kqYs~f?Stp& zGiC=`d*uHde0Yw30k)p_-F*o*MtkJ^3hX&CyI+GBAcG6 zjkCA*+5V`bzVE^LdVYXwi|>FwZTYve&wulm_!)?QEAI!l53c=@)9Hi#oSMVaz~(*x vZLR*UW+KK|Yt(a4W{;RzaQEh8X2Z2d4-W>rPkRURU_FN*+W#S^*joPu?k=7B literal 0 HcmV?d00001 diff --git a/tests/shader/gen/prefix_vkmm.spv b/tests/shader/gen/prefix_vkmm.spv new file mode 100644 index 0000000000000000000000000000000000000000..cef3965f647503b83d6f6a63ff09657add1e08b0 GIT binary patch literal 10016 zcmZ{p2bfjW5yvlVLqP$>3gRNNF47gTP-H<|P*4;*>iSsT!piPG+$b@7>P8~VG}6?)pbi`}K(O>Nz!)y1x4 zXi{I9*Sd7Y(y4vr)l+94Gu?!HCpDG4{{L`d>d>`FccHUK$7o~(yxd5M!B$k28s=gq z+aPL_J(AX?mF3fQ``SzEi*0?y?m|zwv%QbFI-;_+-K#o#itNVN=Du=oe|vdKb>@`*88c?@&M>!zz9dNj5f4SLs^B4&eXZ$g zTUu%_bS>%GfGT&EdX}7oo7lQ!0=jQ)+xkLpxvg_!we}{sY^~akUA5LGM_{ig7s^Gn z-PCq;bQRX9>-nFYJuIwQ(_36KFxGyq?=5yz*Q`q>VpsAl=9NE*8ky`-nQuzS&VboQ*O6yy9wRSIuS zvP>W1IxEqvQ|M0ZiwD>>$y#joPi$RM0@Lfh_TECd9f!I4;Ud>f;8J;QWs~N<89ST% zmYjVLcJ$l)Td|89+q?SvIyZPOb#!qXx~o)L*S4zAzOHh@@^#6B@GRBYyOXO;9)Y92 zN=ny%40~7p`eet>IdO8;9y|@7t@D`yc3txP&UtpCI_Kp9F}2C718nXyhZ{XT2%OD7 zI%glAvzrFk4ascm&YrX{Z3AncGa$A$nK!_$OIomfKGLg7=W3F(;Cf4!gE@b2_RI#bL6wX z*?ijL$c3upy8&R&4?^~37w%&mvz;&Ak2PJd5pj&g(}1pWwUyp#E92PT9x1lxiu3nI zHzKY%DYb`#9gWx{(2?^1Y-=&UHRwMW?OgfUsqGz!e8!GJ9L67l)+axTT{NQQ4@Y}0 z^5)T=kaF&;DfNv_@0oFO$|3l@KSv;And_a>PWbo{9|0 z*wesu8N2c$VDql{qwtw8<{u3nLV~c5!)CFa_8d%mJBIa6MB8II>zSY0^6qT`+I`xG zg=lT#PC~n;ocq%DPRi+TO|f(1Y&*b%GIl4}^Nu?GhMB;+e3w|4oca7=;~B}jo^~na z^!r;yzjGqrRbcO3*tddTrGThs3%2!!{Wb9XjQw@6Ib;64VD~v0)^+Yfyk8UPO(XhY zB<_#jpX}U6u@|I%Yi*$p{r1>rH+t+ds@;fpBij1p%&YBlDmR&%;=T7dwMX6=WfnI@ zzquVgD=)x!f6U>z`+LIr!}j+?*z>_H8Qb_}8QW)fWzP0@Ld@5{He);A-vwF!O*#AK zoPA5icD}z2vh%lQZ2jAE_JcXwZ+~|Fj-35;&VD9m`_0eJ_j{kU{oZG7zxP@Dpq%YD zKkN6KpSAtwXKlavS=;Y?*7iG}wf&}NZNKYTyO6W}u7}@#`dts(^Yq)Ewf(+lZNKl@ zE!;BSv11Y6!#*2xxpVdCLlODgiD^WSL*$Hko?AK|?RbfJeTN}(-d(Y|eOFFFe1H1h zc$QVoJp+-ShD=9%4(1U*0j;kY@g4ahxgw`}J8Q`+zBA{N&3wlo&NY|5dDQ1KBOm)Y z7A$uo>p16lM9z0@^x*_>z7MnEr^=79L9P6OA%wu=la^liG3H?Uwe5P+VfFA$C}I0 zXCMy!XQK71qxKbGbKK2Zjp(xwIrs0`YoCso%ems1dp6j)VV{%Q*Qa|q7hBHyBIZ1> zy$bt$usNdF7l4hCk3L;k!AW91=Xtj;Lgeki%G8ejw_!*B&F38Ze+i;5dR<64)XrY7 z0$+?oueFUkp0jn`c4Qdhn#M(qt1CE3?1?pL8*AS^1AC{AeHX#meXqfm>pw|9Uw2 zSpN;+?D}uSc76GHZr%iTp1wGXH-kM3eZChP&~HKHrz3G5H>G@>>07boyc0L0wZr!| zu;;Ds7PNNU^S6V&=g!d=cS_!~jQy)$>pb3pFW--M!pTPs?*dyx?DO4V_i3DWPTQDf z#67yUcgo(Yqp$A)+t+xHycaAN&yzm6>^=2P#o4_NoUir$aPsku`~X<)R>XVpA+*DL zq3?r;oVAIgwhx16rncwy5v023hRXNiHtb1=wHm9AchpD0#yhNaGWui4ZOCxM9%&ml z5t)X>x*rF-Zq)w?u$<4U^?wTOuzr1?MC7br9B1@taJJshU`M^yumy>FZwDLih#1(u65x*P0pkNQ4`$hk+c_xc|6R3!TL zd2qIGU%-}&zI_SoFsHsRB68*wN6xsf;Zw&NUk1BI3s16V`V}O4s?T0U>{r3o_!MVt zTnBP5B5$u^KWf+BioOqt`QHG$)*Z}mM1K>Ji~ZaWcDNsX-$LZvk2vyt8*H9KKJLJe zz;ba1egbxwTi=fnIdh9+e?JAghd7g;f#Xc{c_vTt?yX1v9MRu_Jc{@nK7y8SM&j&a z9d-Ua8NY9TfzP?q(|PvqF|hpZbE?1Je~HhS*z>Qza<1#M{W#j;p7s42`3vHn#nI2- zfb;$QEu4Jh{vB8@`netKFt@(nBij*ki=&@UfTN$S;3tvjr@k$SJ!nEdh3KDwOhe** z@&~X!hc%Bt{}I`Ncpmmj+c?*4K>rERukY#94&SrbTT|aNXzlps{TVzAagILc8!PWU z-HvvTYVEk6_BHnWS8(=R{)R0#74cmDfp*wCeSb&f%qxy_`6oDkF8_j)k8}AqST4@x zIk3ar`u>B+nOht+z5vdi%Zu1?F8aLFaV{@`_2WRC)v?FH_}pW>t7@?2+?(_D$%(V?D&I@? z%QG?ev zJpL_5JMPXQVDFA|^u=28*|l6V_BR%s?eC%3a&dR`$who)8Xx@_hb`xs*^lvHhyBoZ z7$WB$#j%eG;Mk*ogVGM)5#V?}n!wtz$7Zm5bdEmvXsmqHI1%i8+}J3a_(DfZSgIT*N*pyTKgfi z@y5k_WEQdc_sFqu@=?oiV7YjYoB(!Mi@xI#IdhBSzRU&3eev&;+TlA9yp|mH-@jFA z$6cBa_AWU`pE3TuQ$FS|09SKY{~M-u+=;jY3*r2&7;{d-HjlrpTHzOgtKR)4e0d-v}}yi?X1`PAC?!<%3ITKDvHVqGtO$1lg0i)Z*u zu*16boq@=iR~&ygSOIQ9vd@U;DQ_(?M;+&PHaPCnSzztx@j2jpkI#jZzmxqplKVWc zT-162*kP^u&PU{|RUEl51m|;K1ScQg_KU%C{>E}%AG!^@gw_}Pz630v-S14g7tq?_TMaf|Uprbmo|_`r=f*kuj4_YAJ&nDo^>v`_g}yal_pSEdN1f#rztbYY~7BOL8iETacd*&*zG4iqI)nN0)@0)ADn-HHdnP4xFv$dh809&etcGJv;mSax#c-{vh^N$6K!2 z2UO+1c^=EU_H$1p@KCU8?}fJ35$KVKG1eOO?33~lvoE%DvoWKv<)erDft@G6ocmxs N`y=wNu%_5r{|}G$(MtdT literal 0 HcmV?d00001