From 47f8812e2fbfc2b449ef8226647888b3da2981de Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 23 Nov 2021 07:28:50 -0800 Subject: [PATCH 1/2] Start work on new element pipeline There's a bit of reorganizing as well. Shader stages are made available from piet-gpu to the test rig, config is now a proper structure (marshaled with bytemuck). This commit just has the transform stage, which is a simple monoid scan of affine transforms. Progress toward #119 --- Cargo.lock | 18 ++ piet-gpu/Cargo.toml | 1 + piet-gpu/shader/backdrop.spv | Bin 12232 -> 12328 bytes piet-gpu/shader/backdrop_lg.spv | Bin 12264 -> 12360 bytes piet-gpu/shader/binning.spv | Bin 16012 -> 16108 bytes piet-gpu/shader/build.ninja | 35 ++- piet-gpu/shader/coarse.spv | Bin 63796 -> 63892 bytes piet-gpu/shader/gen/transform_leaf.dxil | Bin 0 -> 5664 bytes piet-gpu/shader/gen/transform_leaf.hlsl | 219 +++++++++++++++++ piet-gpu/shader/gen/transform_leaf.msl | 272 ++++++++++++++++++++++ piet-gpu/shader/gen/transform_leaf.spv | Bin 0 -> 12216 bytes piet-gpu/shader/gen/transform_reduce.dxil | Bin 0 -> 4696 bytes piet-gpu/shader/gen/transform_reduce.hlsl | 125 ++++++++++ piet-gpu/shader/gen/transform_reduce.msl | 138 +++++++++++ piet-gpu/shader/gen/transform_reduce.spv | Bin 0 -> 7568 bytes piet-gpu/shader/gen/transform_root.dxil | Bin 0 -> 4824 bytes piet-gpu/shader/gen/transform_root.hlsl | 94 ++++++++ piet-gpu/shader/gen/transform_root.msl | 129 ++++++++++ piet-gpu/shader/gen/transform_root.spv | Bin 0 -> 5280 bytes piet-gpu/shader/kernel4.spv | Bin 38684 -> 38780 bytes piet-gpu/shader/path_coarse.spv | Bin 43040 -> 43136 bytes piet-gpu/shader/setup.h | 6 + piet-gpu/shader/tile_alloc.spv | Bin 14788 -> 14884 bytes piet-gpu/shader/transform_leaf.comp | 86 +++++++ piet-gpu/shader/transform_reduce.comp | 69 ++++++ piet-gpu/shader/transform_scan.comp | 89 +++++++ piet-gpu/src/lib.rs | 34 +-- piet-gpu/src/stages.rs | 209 +++++++++++++++++ tests/Cargo.toml | 9 + tests/src/clear.rs | 4 +- tests/src/config.rs | 3 + tests/src/linkedlist.rs | 4 +- tests/src/main.rs | 12 + tests/src/prefix.rs | 4 +- tests/src/prefix_tree.rs | 4 +- tests/src/transform.rs | 133 +++++++++++ 36 files changed, 1674 insertions(+), 23 deletions(-) create mode 100644 piet-gpu/shader/gen/transform_leaf.dxil create mode 100644 piet-gpu/shader/gen/transform_leaf.hlsl create mode 100644 piet-gpu/shader/gen/transform_leaf.msl create mode 100644 piet-gpu/shader/gen/transform_leaf.spv create mode 100644 piet-gpu/shader/gen/transform_reduce.dxil create mode 100644 piet-gpu/shader/gen/transform_reduce.hlsl create mode 100644 piet-gpu/shader/gen/transform_reduce.msl create mode 100644 piet-gpu/shader/gen/transform_reduce.spv create mode 100644 piet-gpu/shader/gen/transform_root.dxil create mode 100644 piet-gpu/shader/gen/transform_root.hlsl create mode 100644 piet-gpu/shader/gen/transform_root.msl create mode 100644 piet-gpu/shader/gen/transform_root.spv create mode 100644 piet-gpu/shader/transform_leaf.comp create mode 100644 piet-gpu/shader/transform_reduce.comp create mode 100644 piet-gpu/shader/transform_scan.comp create mode 100644 piet-gpu/src/stages.rs create mode 100644 tests/src/transform.rs diff --git a/Cargo.lock b/Cargo.lock index e65ac2f..737c033 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,6 +96,20 @@ name = "bytemuck" version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e215f8c2f9f79cb53c8335e687ffd07d5bfcb6fe5fc80723762d0be46e7cc54" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "byteorder" @@ -872,6 +886,7 @@ dependencies = [ name = "piet-gpu" version = "0.1.0" dependencies = [ + "bytemuck", "clap", "ndk", "ndk-glue", @@ -920,7 +935,10 @@ version = "0.1.0" dependencies = [ "bytemuck", "clap", + "kurbo", + "piet-gpu", "piet-gpu-hal", + "rand", ] [[package]] diff --git a/piet-gpu/Cargo.toml b/piet-gpu/Cargo.toml index f8f5c0a..faaffbd 100644 --- a/piet-gpu/Cargo.toml +++ b/piet-gpu/Cargo.toml @@ -33,6 +33,7 @@ roxmltree = "0.13" winit = "0.25" clap = "2.33" swash = "0.1.4" +bytemuck = { version = "1.7.2", features = ["derive"] } [target.'cfg(target_os = "android")'.dependencies] ndk = "0.3" diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv index 450c91681a11baa22490f4e841aaac7a1de8ba29..870abe46716f3ff7fd830656bd0edcdd8713d629 100644 GIT binary patch delta 118 zcmX>Rzan9S0lPjM11rOJ1_lN$AkK>~DN4*MW?*Ar2a9n7#lRx*`DtmzsU<+}W_@;L r7G;DMWuTY_5PLw40cin=>1_7pj$~%!o4lM)mz51Dz_$4!-**K7htn1- delta 28 kcmZ3Ha3X$#0sH0zb|se0tGGj$8F?oE=hNM6!v9490GPW8`Tzg` diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv index 859e3e7b2daea79ac7851017389d2f9ae42e91b1..a8b1fd9ee71a1ce02aa2b9ba68112df551511a47 100644 GIT binary patch delta 118 zcmaD6e~DN4*MW?*Ar2a9n7#lRx*`DtmzsU<+}W_@;L r7G;DMWuTY_5PLw40cin=>1_7pj$~%!o4lM)mz51Dz_$4!pSU6bqf`}) delta 28 kcmX?+@FISL0sH0zb|se0tGGj$8F?oE=hNM6!Y`r-0G+%ETL1t6 diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index 516cc1d1428f71e86ae8529f9f726ccc040cf99a..669585af39609a604db0e9144ba0a37db4d1c4b9 100644 GIT binary patch delta 117 zcmeCFeN(%DQ=g52mEjiy0|OTj=f#&4CFT_~uraWM#khfDV3GLzw6x;X5+HZ;e~$Ak q$_OpWKrsy<_JA4#(gG6G*?f|BCo?18rGq5qRgT=UkVqlT@{Is;<)Dj?f w^L=%FK4pX!WuTY_5PLw40cin=>1^gSv*lvso9ynO%f<#2U}M-^m813Li5;0%rc diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil new file mode 100644 index 0000000000000000000000000000000000000000..bc4f9410cb194343136a58f29bf2f62a21114b42 GIT binary patch literal 5664 zcmeHLdsI_by5HxKlM_h735gmA!U+f>q4tC~f=phZD7H~iu{w8xh(!w;kuo5zxhEkc zDaINQEovXZN5{6#h_6x9ItgG!MT<&jxNS$F;-g(^xs2DVz23QZpFmuW_pWtkt$Y8t zf6QKMpZ)#5@7v$moA2BEcY>ivr!W8f)y~WF-Zn=+s9hXZ)4db{0MHWw;KMN+_5#?K z!(I-11T?0Q02pCkn3vDM;d#qnI7=PC-1z!S*tl>pa{%g@=QI9V_z&24FRD-gxV3N{ zhh08C!T~^&zXx*wHqOqqfd9*S4BEIJ&~tlxu?6&jIGk-^p|+5MjwZkZ-7v7BD8GPv z0R?SO2mo|??+v98fYVSHY^D9{kwP2a64RvZ!Se|!DYF@|31fL=SR}rA1uM%%bVxgOtw=N?PEY?UvE{Jm-E8mZTe=Ep|6WcN?|yEv}*Z+l~0NJYqiFG`BW|Ymoc% z@HOZGk$r>{F)z4Alsw@?#Lce3w9hyZ%myGI&f)edwVHHv0cjT?2zZAE;1#G#ik>Z? zS2p0=qW9V0EekDZbOqb?6;L(FKtLjd#HvIjUb>TUffkQ@XAG!MqHXcp+i1gE z?}E1KFU?P-kE!M;^m02DZ{C1w)sbsRZDYFVy`L3_D4(j7zv&*1Ik6?kM(Osq-cuTm z#q(xD`9RZTGNWKu^qkgbqTOVLJX+VB_G!G6EUmn`4w*Q`hZXq5Z0Uc zoI@Y+{R?i@8vaHTzp9V8L-DJ}JWXO4Va)RmGYr(#eHw3aKJGfD{$wbQbVe|BzQv>c38D-_uRS- z*v|DGh_)>_Yj<5!!it2TT?s$!>af(R=WYltnfp%^F2!v-Y5FozOsA2!wzzz)mCaDt zF=ZV2#`z)D`BhdnTZ-dfyQzyhMz04`&aPZ*PMcCcN_F*7;GIJ|ckdRUp%t^`vC;e5 zl&q*8nc`q<(ReY)j2%eIeK>B#uuFfXS+x5UAz-VlDY8( zuT4+AY&b<1&6U-3-dPLo(>iV}`d+WBT zO6I?JCola3l7*$r#!`#0v@$Hc1`~+g8SeT_PyLMc`YdmKw&oG|r1Ohaf4tM2w-~y| zlApSVItI31xV-=Kj(GxS+{&8i)y3u2rIibx!je-?V^fdi&kz zC29Y_c=PJM>0%cW!awKe<=t?Pw%K*$41 z{D>7N3w4p5zR6eGB33axzgxYc7n|9mpXtJ8_SU$n%=|i%Upw1jHuE={vFaL!nRbNr zO2qfw;vptvEfaEUL_A^?CsW;%Zzv-Ml#@G)BUdr+WSyzc>eFW)!?MrRXWOu>-Vs@~ ziCOaIZp-A0#l#=n`ph1EW~={svMZxSpW(!^TDo3^FMVds>c_I$ zyM*0J{w6cFG2T(lI>N5%#ZLx_VOo55ka$dqN4?@9cgT;%kVj0tr3Z=nR>Qrhvv@Km zKxzQ%X#mu4hVC(JTpX4I^ipQ6*D6;y?%j;?tr0|q!`e{|(YbR8LLiG2U zKV?N->UyLOC*RlBFZwk5{)fA7j;2xas&mV$m$=?vyIY&Q{Oq5fAXF|X2}4u;GQ2Yj zXLtu=PE-iKGC#d?i(fdOl4NB{hM2iPY4OsI_0FyI>81Z=al3upT;W+*=+*Go>-(oaus`*J1C7K?YmqwZYpB!p3JuDnlZsm^1czcN`1=MqT71oB1ePD5(^3g;M;Jz@q#tgs1 z`acahVCvB$2;8o}dEMtxlpP0bA_dEj<4@U({265PNPupR3HQhwJ&6}RR1jschkCo(XCGm2FwQrgfH|(P6>voI6F2dU? z{$xilv6nvk+t>u%`#-Ts`wx(bS~{*?xUKzsf5%nGB-~Gfzw^Z3dE&q2iDDXnAK}TC z;`0(ISSKN{_ru>4K$#30qeF3podyGhyM0UD53v3Q%FC!gN_Qv-bYRKfE7u74^ z2RKRePj*`>xBJR;pwOBxKnfXR#+X4Cwa+3wz!~~Ad}iD^s$DBP#_3vo#JH{{{tPm~ z>kn`wpevl#15XfI$1)$W#WK8-k$@y{)9p8q+w1Mf#1*cMq>A;c_Y#LVczNM z0Jg;3a9d;=2|Cn%1`(Q&i96C+ZDM!lD3R>uEiJlXB91b_Z`!iTHTa*v`;v$4h@>1d z=bh{#3~rG(Y)Gu2e5qi};j3|n0*skU2@64lQS8(3yvSV+lz}rk#)x>3kszpK39kyr zCV+5o2pZ`<$U(iylF;L^rm;aL?JOY$a34E|zigm>aAKYm_1Mj<1jB)r@eC*@J)`>? zuG&C+%p_AcKs1!_z%K_IjO51R6eURT@#Ja^pm1zX4WCl5~ft^VVEM#Zm1IcXat!-N;f3&HW#GVdFh|$ASgu^9s zWSRqJx}5}dGv_-P0?`4m*G96g268JO*x zpe1?Yf>jc4xVo092Tfak&%xA9%G(@dw_N35?p|eSIC8MKb<1@QF8a0d zI0sK|`3ge&A`>08ty7azZLw@}1z}H%QixHt17`F(%qX4Y-YOwCxio_%ILhUiyw?xS>Q#BjO$k&qPm~d5?ODR_~u$=38IV*0Z z*%g0@p6NC1Kr*eU=np4@im>szZVb5@zhy8T*0m``OXf+5;uMSgUGSJRZRH+seIgn^ z6>EOhcPrKUh&vTmab*TFAZ0&@)es5b2eYEYgL=d#)Er>Pz$p&g@o>N-KCQMZ`jrOw zdTqY-4aW!@^i1DqfMrF;#D*GyE5Tq$$c4}pGmV3D3Jh~N=!toSgKG*6jT{_|QFE|y zg`qJIE$X7;QaSkP8biV?2#s->5DGu@YI0V=Jl&UIPvukK$(sZ7v;*cT66Q&}@}-k^ z*a=Twr~+EkptT%Y2gXj`9+Cg#Ek#Q`s5i8V+h(Gczw#wu9<#(j%MMf60Zo5?rlYhL zYaGajS7Po%L(vU3x+W=cou(;Uxg-6g^F!Gs#qyP}3zJwa!Snzk%u8ePm7vR&pHp2h zs@~hYb@KL2iJ!o$A~tI7^rx+r7Xmi~YT=5k_bI$5rbgA}Je5DXaNZf>za&thf2re_ zTs3dex3tOQe7_}6A5op| XRHgC1N9%{|^b%MkmZfOm|DgW~%X?MG literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl new file mode 100644 index 0000000..80b5434 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -0,0 +1,219 @@ +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + uint n_trans; + uint trans_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _71 : register(u0); +ByteAddressBuffer _96 : register(t2); +ByteAddressBuffer _278 : register(t1); +ByteAddressBuffer _377 : register(t3); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[512]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _96.Load((ix + 0u) * 4 + 0); + uint raw1 = _96.Load((ix + 1u) * 4 + 0); + uint raw2 = _96.Load((ix + 2u) * 4 + 0); + uint raw3 = _96.Load((ix + 3u) * 4 + 0); + uint raw4 = _96.Load((ix + 4u) * 4 + 0); + uint raw5 = _96.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _85 = { ref.offset + (index * 24u) }; + return _85; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _224; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _71.Store(offset * 4 + 8, val); +} + +void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.mat.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.mat.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.mat.z); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.mat.w); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.translate.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.translate.y); + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _285 = { _278.Load(44) + (ix * 24u) }; + TransformRef ref = _285; + TransformRef param = ref; + Transform agg = Transform_read(param); + Transform local[8]; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + Transform _383; + _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); + _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); + row.mat = _383.mat; + row.translate = _383.translate; + } + if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg _423 = { m.mat, m.translate }; + TransformSeg transform = _423; + TransformSegRef _433 = { _278.Load(36) + ((ix + i_2) * 24u) }; + TransformSegRef trans_ref = _433; + Alloc _437; + _437.offset = _278.Load(36); + param_12.offset = _437.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl new file mode 100644 index 0000000..6229b25 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -0,0 +1,272 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + uint n_trans; + uint trans_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct ParentBuf +{ + Transform_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_96.scene[ix + 0u]; + uint raw1 = v_96.scene[ix + 1u]; + uint raw2 = v_96.scene[ix + 2u]; + uint raw3 = v_96.scene[ix + 3u]; + uint raw4 = v_96.scene[ix + 4u]; + uint raw5 = v_96.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_71) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_71.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref, thread const TransformSeg& s, device Memory& v_71) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.mat.x); + write_mem(param, param_1, param_2, v_71); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.mat.y); + write_mem(param_3, param_4, param_5, v_71); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.mat.z); + write_mem(param_6, param_7, param_8, v_71); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.mat.w); + write_mem(param_9, param_10, param_11, v_71); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.translate.x); + write_mem(param_12, param_13, param_14, v_71); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.translate.y); + write_mem(param_15, param_16, param_17, v_71); +} + +kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_96); + spvUnsafeArray local; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_96); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _380 = gl_WorkGroupID.x - 1u; + row.mat = _377.parent[_380].mat; + row.translate = _377.parent[_380].translate; + } + if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg transform = TransformSeg{ m.mat, m.translate }; + TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) }; + param_12.offset = _278.conf.trans_alloc.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14, v_71); + } +} + diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv new file mode 100644 index 0000000000000000000000000000000000000000..ec47a9f94f1f0e8034426f89715ba91137d43fb6 GIT binary patch literal 12216 zcmZ{p37A|}mB(MwOPYNN1Z7F82}^(kNC<&UWJv^@XdofM5fQDfbXU?P-CeP|Itu~@ z46YavH9CR_C=wJ|f(x6=pfanYvpPB(-{NcY@zu&8Ox^KUcC+EBW|NoqO?zzjm z^*VOVUN|SqW@QIwec9J$W%+YdHX9|&4#|4+c3|D6b!ShshtFPq{<#Jmo^|DM#vDPv z9@=hftkxLUaR`oAVomHwa_Bos$81OHSQj~<+4Bh9j6SfgI($Z5c)dE@Xw`?>jqP>g`jWWD_|WL&a2n7my;LgqX>QpvQEz7` z2Q!~6Yd0r{Myg}=u|*T@*5puoN#1G6owoa@?lV2--t2Vv)HuZIw@!7R zs8jy`>pVVNf}ZxT=9?I`P9KEgWTc_7l)u2WWc!xo*fLL#>(P~+Nn0!MS+t$Gc4dpf z6S+Po^937gjyD^_)y8mryxnN;T7*RZx!DTZq2}0TUyDvm)Hqer^q}R6ragHr%gGhB zoDbfdkI|i3?6z8~HdgS9O8j#8mG!Y^YnN->o5_86bE>f?p}W@V^;WCd0=s|4``)AH zIb4?kxR3Z++Qv?CcXmCvRoju@IiAHg!8Z-n$Lnh*r``cIp9ytZa||l*n8QGFEDb)| z;md+s7yk2tGql|0$@|~Z;Vx00dk<}Aue!3k!BOjj;B?&{D)Id#{#c1WQQ`+m{OJ;Z zw#1(+@voQoH%t7568~0-f4juLQ{vw(@$Z%R_h;~~><926%>C)(V06~4FaNFBI$9kV zZEmiOt{var9ICY&&GEID6|qMV+j&pAv!lWJ+lbGzB=?DP=YG#cdu4NcOJi&2e9C>s ztM$?P7%vo?=lN9EHD29TYmZFimqstY4(guI9gX4k2=`~T-59OseLB7_d?WS7){*wK zGxGV{V|cZ=*hB9@=cNsKYW!aGZSA4aaK!j?)H_v$YpPElzS{VBGaNC-LFjGo-@@Vg zIG2ObnS>s4bmnl#RBWuA`_hvgLwslLIxm4$p;oOuG~#%D`8Nn>Yn>&O zj#fU8we*R4t^+sQBX#cs=dyW5{_bpe2Jgzo;H~BkVte!N9L9NFvvtEjt2w#N!|QrZ z5;Ju(lh@xQHJ|uRk}KsqH1*rrlkGsov*tU%>H6=T!Mn1% z;GOw)Wgi2_9e>Y^xbE!U8N4TZ0G`+9_Xp!UxBa2$9-aOlfq>wKUFm)HvUV0nVX)eK*bP@5#xSPw@T@&G{R_)8lsMzbm?5 zhFM4arbHd)SBw0E;55J7{MO`ne*gN8orsw)?y1P97WYzFE6+9VBjcQF^tBc>zez)L zebkOjH18fYzfB{~xvRx|9m6|EEoxSCJhciIeZ6PYPDwQH7_~)-=KR(Cwz+~eiF+i* z3~r2%Ypm9l;9N!Qprryn0pzQ+E2;TR_r+`l#Ce!p64yl0U=H2bT$M()*i1lOa2 zS!cXsQD^YTW1VVIr|(}{r@WtB;r}Y!y8OnT%L(y}^yAepK9(_Kd}uZwk9*P4{EMfv zH)<7PUH|YOLk8ErpZPk5*mz_5$>1CvBQ$mM`*%a+KMrg@HS;)cvAXf|h(C$8f*r^B z^J(RtHTi;qJI;J~C2zA_?}zup)y(JEr_#pQr-28VhnsvwrF}29E8SE2$ z1RQlaj(?n|@%)4^%5?mVVD~lp?*{i^j=z`Ib#?#sJMTL&^XuP7tKah=zrWy~72|FH z6w9*RmjwT_oO?&i$@Bg>+!%FDyqF5i`4@$zu2!La5wnjz&*DL|I(z4wtl?ld|K+-C zCiiYu(|>4zV{dGaK7JK5F z9tZbbJ`w-P;0oB9?7xuKHCB&3x&&+j<1YmpuV(yeTK7}k@#Jd?E!M+(l>aj02MV7# zBL4<7?}6YO;eNkIfA3xXV~slBHl7dn%d_m=>OJf_cxTETXD6+5Rx^*>ds9vS%>}lv zL_*y ztJyW*)x$8~-_2+jv1h(#hZp)DV&q4{)qH0zX4lZ-w*Xz zqw~S8k$SAl1z^|3c;6qn*muWx^H=Dz61;-e7=6arPd)1UC9w5H?p5G)&0hg`&5g6a z+!)uQUpXz)|A>_zG+d zW}IWljS+i)n9K2A4Ypoe&&4|0D={DaR~3HSn9q8!IpSPg4OZKLc`oFy!OUfD@r&qw zcW4{oF}F?N^4zXLQ;+dp3wFG36z|(FgFV;kj}~Y1bzuAH`w??|J*|)R=(`qEvmSAb z{|0b*{Oi%wWBfOQ)A8Q~cYO8Rikhlm`{^^cdoYN(2l`^4YK87TTnCniZwRcWZ!@ht ze08wy6;JSYC<+zh@2+l6^o>vvry zY2SwFyPCTEZrWXFzak|;1*^Gcapv9!cMi_^ z4qCZ62Wa1qt-<$Oe2iAzoSwH&g4H~4=JZ^C3a)O9=j*{D#`9&2=kU{L z>KlkT9sUs5e(H|%0Ik|;zUcf8`wVy<<~R=*t+~7}J`2|${1I?p!aoOgjp9Ce6l{!o ze6KwQR!_gz9*3J-pWkbzGS(Adb!*&DEBC#9lJ@ghTBG$GjrN7Y=R0VaW1e=Xx0{k2Sa$ zY<_+Allwk6uS+oJW*>d~$)|oPrqADoHMBmyx0lkc#?-7)>{^M{V$4^A9W(fAz|JfF z2DuVEfUU)hwKg^5(zTxl_nX+qe;;za{JYYBON_Pi-#Z=uL|WfZ*JnLujAt)u*ih)f zuZG(<-j_{awe)?t25vun@%PPZ!Rqd_IQF>$_S@X2AB$Q~23y}LwDEWMwcvC=uP^Z} zB|ci>trFi^;u+ zvCqTcLCkUVN8dVFpLw0TJbYWhY2F50ExsqbJKu~AV&>H!eQyBkGp}nX58oI#&D(^l zjbl;oHt-;3Uj5PcEnt1-buZ-Mn*gVIC*f-T+hXM14j#nJt3Ueg0P8cadnym#E^wOn zt#Gv)vB-N9cn~wM{^)x%Sf6=!(#pg4HgKBvR=C=3Eb`t49>mP6Kl;8Mtk1ljF?slI z2fLRyaZj8?`wmPk{O<(2C*gk=SS|eT2D`T5e-BtK{O<+3M&Z8$thRwSX+G`yFvm2n zef7t0`uo9qFy~|Ka^oZS2f(gVZXp1ztqE81sHPn>Oy3yTNk`Zr=wJ zeh~Pf65pS2;~z`7@lTZafrK0Xbi$2)w#1)Hxba_4xbfdC@fQ+q{I?Qr{I^T|I|(=b zy9qb`dnNw;g2#P$58O5Ne8irA9IPJq;k{tBwV3myjysvfm|uJA>zkHXcW*2lq7tLI2=t)3yr5qo~@7qvbKwpQbf(;u~d z9-P+t1-RNc7PWp69JQ)PtzQD`i&~!ot3|EPfTLE=u-y8drF9&!XV-pF>zBdSYP@m! zqt>s0(^|g@S8HQY>({_ht9sP>t6+Um>+@i>sP!A*sMYsJZhgKxjw617)_zgzuYs-A zc;oa(t-lUVYyAzlTKv9v27VK>R`0O%y#7|Q)!qnIEMJ)g8LR0sd)oJoJ^85{&1uWLo%uPsz8L#| z!RAqSt(=pSuqJ)hvrh@ldYp^r(74%X>d~(YY+v>0*9~@!?dKZHjWb6Dt*7w&{)l~l F{ttZo+gktt literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil new file mode 100644 index 0000000000000000000000000000000000000000..65ff9449e2f060c1c5caf5569899c4fc0656f15c GIT binary patch literal 4696 zcmeHKZB!Fi8orZECIf^p3BeBE6Q~q~>JXI=0h=!{V6;%9YhBLiK-6Ld8^y9Bp54iZ zBx=y0SW((8wASOcZcA5Mi*;=Ru~LePl+4}k9O^L?*y}T?f%$vcK4k9 z(Rku?8AK}7KAqUrnBXl|m6)BQ)-QIVDFs*wd^v7z(asTW8H7sS(a`!y&gHwo z0A{?!xrmE>im$rExj+X%AsoZKQ@DjuQ+m8U2tmLO3V??prby5QQLDG3`w|Y@V0FdT z=8{SEJDnl6!+K)NTo8mKn8YGSByOsUc7mo>*TFxjP?YT>!q0^2CQ-c`U0*Ul%6A=NqjH&XYtEUv5OAKci9sMGI0*j0de#M0Y>o#ysXEymZm$i;pC&wXoW7wBqA-VnAE0S++c)UEZES zToizCn`Q`9lK7HOh|1`C`4wHq_Ln=C5DV5=SY0@}dYHC(=t{~FCh{_x##)ep*}8nb zuBp@G=f{Rmd#A>G$C7aT%tQL6v(v8cx>otx*Si`jcf(m4a=ozQK(Ei?cz=6mL)-py z7inW%$V)9P6g6Ylyc5|cJA-(e8g}uZlI48IP~8E=){?*`(<5p2^aQ zUf76OFT-4ZJ@)h zOo}Iq#d8U7>}7e(kUX}lEN(5mBj=Pgr(cuZ!OuNq&9(7!`X*&HjIgdkm|0=p1Z&oA za@kFcuqIhpb4viz{))9ff!jBd_D#+9+GcawwMp@{5%ITf@dV8;?`eziwZ-)L>q})Z zz492ZIkvY9`^crq?$Km7`|G#6vzjzn4t`Ek_Z;}pJC>Y5eolM0$SW6aGx4`3+iO^R znbMkHIG@|lqMBQJ`Y*Q-A|IDT^5*+}c;6WoxJMFCR0aLXbo=}j zp`nD3ioa@t&+(JypLn`@;_<@WK`^-1JF-EA+ti z+`nzsqO`+0#5QX3Aef}2!2o;-SahmoI{&2#{d}|ctyG^Za_ZC5?Q8yc?0Zh%N`t5M z?f?|hd-YBW@qpg#hTOlUcjUC*dHW!w!!y)1>#T$4=Lp3Uaj4%O9v@Y{B~>kd&-zg=mb zyNHloQAMHVI4}iK)gGDYXX5g|Bz)@>Sn^y-+BY-!Bv@;m^A8 zvo8EgT_~gA&r+}n zowfR|TNs@MQI#0zKK^6=2jB$9RP7Z65 z2e78tCUbZ*hZ&>+%n;YAk{#wSS#J4Ye#Im*8xX7AX)B0QIl~+YK(O zGNK@y*Gd}LyD7JtBd$^v{*+(J0x3;f1i3iFr4;jS#xQ%3n=&R0xhZ1K;r!b}5^O5n zDDoN4l09elrn~l}7^(Qhy~>40n#|jr$`K>Vi%%qOk!JVV=Q3P-@~Wu#x?$x)>`lm4 z@Ncc8-DiU{**yYu+eGLjf^~q8yN zC^lhIbv2&1(&Lu?K!~CVWIlLi!aDc}+HM)0sZAg=DBgNd^o=mZg);)HzDY}0D$yZe z&`2M9NaBvd4k>pqLoEIZl05$4;S~w&d{O2bQcpV~S;?mt(;@?Sk?PQ-7b6Z!zxa$& zKn1IFWc&y=8UMN%l3S+9A+w)UCqZ(0w(aBIN1WP=T2n+NeTg`QM)oO){9?6U_o)`7 zlD)i#c)@H*!4lm!TsUCyhebCT=TruRJ^rExo}0Jd0Srs+!Q0+lpeS#KQ2CwE}YS6-lpFK*()3y z>NDD+-DfSx!WE~GZE4iV+{N6xksfikQou?kxK;S4KN&w5D9X#L9KP`Y_dmTN@1 zdXDNQSH1O7`RoneR_+w14G--)ffv0*Ve;8aRX1g;*OLvg=@r~r{&U71^=#fCi4H0o NI-6%IxogE={tk-I{6YW# literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl new file mode 100644 index 0000000..09504f6 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -0,0 +1,125 @@ +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + uint n_trans; + uint trans_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +ByteAddressBuffer _49 : register(t2); +ByteAddressBuffer _161 : register(t1); +RWByteAddressBuffer _251 : register(u3); +RWByteAddressBuffer _267 : register(u0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[512]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _49.Load((ix + 0u) * 4 + 0); + uint raw1 = _49.Load((ix + 1u) * 4 + 0); + uint raw2 = _49.Load((ix + 2u) * 4 + 0); + uint raw3 = _49.Load((ix + 3u) * 4 + 0); + uint raw4 = _49.Load((ix + 4u) * 4 + 0); + uint raw5 = _49.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _37 = { ref.offset + (index * 24u) }; + return _37; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _168 = { _161.Load(44) + (ix * 24u) }; + TransformRef ref = _168; + TransformRef param = ref; + Transform agg = Transform_read(param); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _251.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat)); + _251.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate)); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl new file mode 100644 index 0000000..71e9935 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -0,0 +1,138 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + uint n_trans; + uint trans_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct OutBuf +{ + Transform_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_49.scene[ix + 0u]; + uint raw1 = v_49.scene[ix + 1u]; + uint raw2 = v_49.scene[ix + 2u]; + uint raw3 = v_49.scene[ix + 3u]; + uint raw4 = v_49.scene[ix + 4u]; + uint raw5 = v_49.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _251 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_49); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_49); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _251.outbuf[gl_WorkGroupID.x].mat = agg.mat; + _251.outbuf[gl_WorkGroupID.x].translate = agg.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv new file mode 100644 index 0000000000000000000000000000000000000000..d6f84a21400b2026e7c3fe77e601295dfd814514 GIT binary patch literal 7568 zcmZ{m33Oa#6~|wirRfGOrIgYV6Tow-v`81Ai&V;%g29%8;66+ylV&uTiIYhIMWqPt zqUWHZh)Zz=H{4M~5Rtm!F7ASBt^3Lj`un|kH+>!CwfEfr{oniD?|%1NChgM>pPnRb z$;_lXnbVe}-@IfROp?q>x>DUgaMr+46OEyxRv&Y;0sAHGX`C_pGp-Zefs7R^RUL0Z zb|DWTbIGA^9zxnQpX1v3hnd`p(ZTHf1BL$0oBIb&DU`;?3lohCN6UrLO0`@Vs?^J+ zMrC{1xb9qBrCJ)D9LfiD5|OW?let-3a(2B~ofxjw$F`P-F=0DXUR)ahSPvKU;djSbqqv0AlOdGy%j z+lN;pD_5*euah@@BDZ^Kw zn{{`m=XBd>p?|bCSR6gAy1iB^HY&C1Y0l-itI@Hy?*MNc9j%$i^P}yz_xyAu@5DZ- zRvoTvGo~xGjj0yOqvf%3wLw9_yXV!yc(E}uQ6?OIdK&MYIKMK~7~%XC8kNy5aK!xHm#nIAO^xZsSFBcR;fOhA zz^CUobJ!nyF%w}Cp6sdhd0(2b(K&nFnY;_W*}LYUF^8OAiJq$6lbr%S@wMpUwr${! z(|Ozrkr;y;>;`Ood~WyJ%$IiFlF5@Tm%+z~m>yJy_e9OwGwmZey?l>%!rNo}V~=;Cox|TBYl|})`GPZE&Yn2#KD6uaMU06v8sn@(F6!{S z=IhY5j_~_;hjsXSW&Um^G=J=0oZZOpoN;dD@^SfnwhrTB9?x&S4(&e1`#JAyZ0lHr z%;zMUtB=u+7cU?t){U9;;Bn8JoPX^!&wyMnvF>yD`>5GH);3PxL5$xY-HR+F-u`O4 z9&P^y({}D2Z2uN9M!pYwK3dLv&bJV4emUp!oQUNW>pv9Ti?ks}W_AzQPusQFBgcmS zNU${=O1*LKUAy;L+qsXU5? z^y$lB1D7!FHlSxj&b(hi+f#Y-YTur5vF=^iZHV#T$b9CA{67GDzQeu~?49-u-wTd2 z?0NDnk#l{%8QSK30PR}joLAd-LoVv@-H18_`po6{_RMxJ?XJvrznwSUn4at{i9X94 zKQoK>{ct{Q-wWqh#?E;Td@t;q=T9;wpJ(Sjm`=1g?8y~~=T6(RadpmK2);IF z>-X=Zhw)e?!jJe`C%z|4l9S%{klg{@oM#jK8bJ-rZvF$?T1+@=EYq z5$}a}{dm@BU*CqvUrLPj+cWMe^ai?QPvkb@^<8)mvJ>%Lx(10n?*;1%`+Z=?M&9>> z*Z-AR_PF_Fvna`aXon>9hCRYsuyL{xGrtk=K81#);h*&%pic z_eat0yZRpTUXT7b;-~);X#MI~%O}C+xSqMZtDi>X?9l_{(Eb==F2{R1&|XWQUF+wOIf!HQ8RL18k2QY*?3(4xX&=6bxOU?`PugP7lJVv@?iOS* zVvIgx9Cr(1AMA;1QAfRB0$XqF->qP|LB#R;9iz_}^T@lev2OKa?aiO@?LzYB?l0K# zQNv%s=8g6J4ea`ib53nzoYQkzu75wY^~pUBF~<60zk0#)VfTR@8^2eFgXIoGVo#3%_age@ z_v-Or`A3*j9JL+*UV!+`N21mP!Pa*WI{xlh3U+N_FK@BewAdS3?2}vU(_8E^H z?Nj6)0?UQJ40gTY9|kwM=6kmd+cEOaIfAYr&N+xUr?zqC^sRV4qThE!oPS3aWBVP9 z_>L?@FUsLVz`iL<(7qx1$B^krthowyjB9pIdsIW@H}gtrk7t~`Ut3e9I`X^_&iV3r z>e%v;X96sz#5WC~Tt+{ftARM5v5|Wc+*;#yIQhtZK3GmMUwckRo(tewYrGInKJvT> zET_0m#av3%_+qg08M`B^`y#Nl>x=nb0=D0Ap0te%|4YHG>v|cS{APA(9{OTLPVuuI z@81seC5Y=8L{=g0pLgozh`!6I%^a5^7bEg9_hsPLxnBV%A9KGFET=?IWA0bM#oX(_ zG54#%`eN?afXyRsudL^_h&Ac69`C{H5bJR*<}&W}hnTgeBD4%1hhARfCg#4#311{R`lUhR})Z&wm`$j@IY|A ze33+n5)}I)SRd4nr9S7q*!I0jwNCg@BVt90eQ2o#OL=w%EkkXWI=-1XH_D8wv(~Ix zYu+ET*34P!oU?y>?{n_HXYZWfjiyMYzI5@CXxo!-uK)gMg6tgn^dbTP7!3!24d-mw zb+B)Sy$tpd*h`_o4*R;?JPL<+uH~&;Et|vi{OV7ze%%Jj0QJ<|f}hO)3LEo99RmP5 zmbos5G9A;4E%Z6SUQn(pR2I5f02lxh9D$29MR_`U69W3wK)p)sza!-W&<=H>RuaI+ z03qybOMI&`faS4>)aCmSJ2!!eOJcC*VzX!!L$xndb(BaIW#Qudkkr>)T91fS=emvo zNvdj6%lEX#_3M>uE$#{1H@mULxvaG?SUR>y_c;ApYz5;4&oM)bs2>coN*>W7;xWI8 zDxcCK&;pPL*PwJPR)dPv;SLUhfL$a2i=ZwwPRk*;HevhYj@seywY|$D3fJv+huV+o z2vZv1;0Q}#l_CN^*-N=WTc_t>Jg}vb_QV5Sq~@)?psV(}>9O!fVqYxdnvX~{Rbxt7 z%$vA!cP8)GuNcFmkCUZ;*FPEm(Vh@Hp*rThC)IqI$Xp5K6RpvBRzZ8*8t>D2cYM2s z9MZU#-Z-+`>e8DNuM5oZ@Pwi&zq7{R%qVr%m!eSS{a(*6AL8$TBM93;v8y`S)g-&z z!>^nZ>gI$>1DkdjVtxC9TdjhI)3HfplV z4Cu0>n;}qwAc~*}fFc49)>CSXS0oYWJZ{dy)t4~!ZKleEne`ku5UDqcR7jj?T+t&U zO$teyYGU3prDdK{Ipjsim8il9D-nQtxO+XE7oa*%E<>nUCN2rBmZN8q_jlwsRHFwg zyAfqqsJx@0HK{l$q&?|Ld$*-tmR}uOl7EZ9gxJ=DBss`3k{LLrG?rCZ&FPU2RJssf zaCsth(cf6jSwc*E%SQ~T2=udbo=3iA_n~rrC5P#fi zHD?Pk>^8Nv&u1Z*>b}rxB0#yjcXG0K_QL3$OTD8P`e*MAOfAH-)Q1-i^-(F{t_t55F92~=tE|=>{Q}>>(1J?=& z&u)k0i%sG$q54eG(e7(EXkD|WJ(AY+xP}f)>^$>HQ>&~ix$mFbk2LI0?jT7?^T+9# zA9r&Y@0I&ke23a7!X^V~Y9gAJg{EuKWkqO4DVpiO1Ne6p{8PR&xBk$5wDmyujRKA} zX=~lm+HFeVGO>4<(AqkuiF8T^1^j6be}W3D zpu)bM;m_##X+(eY9ck34G`iOqvz^){KW~!{sh59RbFSk*OKyJhW}uk|AU`DO`*ksuBgebsKMYk zW{et;M)g~w2aK%GJ?iBF^>S};JlmJmrp|Jq^0vO0;X|KVe|fj%8{-o(5Bs#U zCux05?e%MK*GJ;qj}1%DCFlO^qvUZe7820*J^ zrxgVy0C-35Y`lX+wM+6pNlW$pwua^>9V82yuReqK>5F`o4)t? zkb8mZIA96P^ykI@N)FQ-9%yL}dtNqh`s;HT{f+Wd4hyk&3V>>j0qYbGyhnr(7Ka23 zZwHB97zxc+a=7My%2$vw`@<7dcW61|^Rp-SgS0CvJ$JPgCHct;sP7U7~JC13DsoC?-Q5ok_yBAy5 zI=R=k@t3$y>wfp`p40B~HRd*HUv+D8N5gOTM<#c4glGRPTsts!Z*sQt&hXg1spHsu z`t0d<{{MUX|K7cAB;kJ^So1q)8PGUzmHmY60-Bv0uHq*NmM%C#F z)^lTXOGK-53o z_aTCG)OS{T&L%G!^GqNyhi0DXOuf$+t@Xz+r-MVD|1dV4qNMZ)?$5%|d1mCH3W7v` zG7ZqvG|>7N(}3xQz~aX|p%&&=U=5ib&*c&T>oME~yBhTf7mOlRZoI&5^U_ z`KK=Oel-;O`CZ2=Gfh`JbvY5`v$E3tEvw#bEPFCpQYa2wK2CXB)|Qa5YW$H{<-bfY zava2r>=bvhJwIk%IIW^xtZ7`V12+<5pJ;qqlJPc;h$5j+0NPh@JP8)oj12~MSQ(}H!ulk2rMp2JPxJ1B>Bzn@o zfmZuSKIU^67a8JU^wv_aTh2Lo5I68_uMTP`nElVdU^o|@2NvM>_1r>GN`9!8>5(DJ z5dWfrsbH(*B4~_$W~>PsEknDY(Pcr*q&TIxH9Tc)wqVTEC&9K-SM`Ll%RD{OQ3qc=`GFh{-B$&7xFs)Iko|XcbR$D-klS|IX7?u_R-so10ml(&P z+jqRlk}o0ym#HjI3kPe|k>e%3%vfRKqrq_c-ITsQSg!;or%z#Z5SV@n;#n;C^Yx86 zUbkQJAw41e03Pp2itG0w#*khb93g0cgj}M*PCeIrOIhYOoRtzh%Datn82bG{L2%z0 z1AA_WEABCG0oz8_Bt9_zM@Ev1ck(uFVA|?Na}@}6>I84`mo%)uN5iBmypy+RxZ_dK z`Yms9(7D02nzjz}PR_z)svEsDJT=Q(`~wYlo}l4&j>i(B;PKy=v64V)R^XEO1UTPU z=C*_Y`3Mb;s>>lTFCV?+P0+HEh*{;E(v4$OV(+*q{rUZu3X-ipL2Zk>#1ND-|Z*`+p)E=wZ{UU1KgK3M;H=$EiRd1suwc!<99BD|fr3zMbdvheuy;x~ulDRBk<3yiy6()N-9Ub?XT E8=u8t*#H0l literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl new file mode 100644 index 0000000..42bbd38 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -0,0 +1,94 @@ +struct Transform +{ + float4 mat; + float2 translate; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _89 : register(u0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[512]; + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _23; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Transform _93; + _93.mat = asfloat(_89.Load4(ix * 32 + 0)); + _93.translate = asfloat(_89.Load2(ix * 32 + 16)); + Transform local[8]; + local[0].mat = _93.mat; + local[0].translate = _93.translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + Transform param = local[i - 1u]; + Transform _119; + _119.mat = asfloat(_89.Load4((ix + i) * 32 + 0)); + _119.translate = asfloat(_89.Load2((ix + i) * 32 + 16)); + param_1.mat = _119.mat; + param_1.translate = _119.translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + Transform m = combine_monoid(param_4, param_5); + uint _209 = ix + i_2; + _89.Store4(_209 * 32 + 0, asuint(m.mat)); + _89.Store2(_209 * 32 + 16, asuint(m.translate)); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_root.msl b/piet-gpu/shader/gen/transform_root.msl new file mode 100644 index 0000000..2c58c06 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.msl @@ -0,0 +1,129 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct DataBuf +{ + Transform_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup Transform sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].mat = _89.data[ix].mat; + local[0].translate = _89.data[ix].translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _113 = ix + i; + Transform param = local[i - 1u]; + param_1.mat = _89.data[_113].mat; + param_1.translate = _89.data[_113].translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + Transform m = combine_monoid(param_4, param_5); + uint _209 = ix + i_2; + _89.data[_209].mat = m.mat; + _89.data[_209].translate = m.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_root.spv b/piet-gpu/shader/gen/transform_root.spv new file mode 100644 index 0000000000000000000000000000000000000000..7824d0914594eb18dfe4cd7dac5988c2b784e2d9 GIT binary patch literal 5280 zcmZ{mhm%xQ7Kb173=E(sqEfnmgBVZ&6)_+NR4g1tF<=g3Gc!QDGt)ZLVAjS2Cd@fu zM$BT)88OFQcUPB z83>ak!;(R%o;CZJ*^~Qgos*~TzKapVlTsRJ%tp!Jq#50W^pq>TI%Xrwkd4WgAAwLZ ztz}M$f0)U;7|lfFbDLp?;G5f5?(Lsn?dt(sMb6bzu3-;KZEMwx>n_*2z|HJsm@(Df zYNfNi(%IEptJKb&0EKU4x{r=(&%8=+S9>F7LVvAqQAcgkx^9ye&!37+*>(Egd#F|7 z+(}YS>y|iYlFZBSmgtTQx1lTN*t;d2$%5|oS>4rnoE(c@w=1wSkaNWhwEsGTrsR0|ndMsfz(w=PYff$RI^opr|3m>x?0zdLZc6&l zXO{cQ&fU9Nw4Pl`>e%H43-C23OTqmM+xt8E%C(M#jdObMoO8BEDI7EHwH$x+S^=)s z7IxXwUMtc0-mBKLOUY{NzUtX&owvY^I;CU{IQF}Cy_}}xlbk(^9==WA$Y|scD#c94 zW4nTLu5#Y3dB>p#A+3lp&e#=svzVh?2{GRJZjHFY9@j8a+c~+9e`Nk2GJ6a3NWKog zk8xD9o_KRm^dCiiKHAjw+tF^xY-7h_4?!CvZ=IoNxmJ7|qV;Ky<1E87EU)ZDjJ-x8 zNmAcG>`hZUNhXqQ{$_|V@``vO_3XKM#>vasYZPLRzRl?2*;|nTh&{Bu7nl1SgSL+n z{>fnD&7Xp9MO^x|cT25C{#3BJo+bR#!RA|k23o)S)jlM%t>a7&MT@-y{fD8=m7kv3 zz9%{ThiBNF=yxR8`lIn5jdl)l#vg;Wzr1(mTa5jTp{s9BZY+i_fzW=_0y{KT3>p<-Iv?b<^A};+mq4lfd z{5ONG5pVTYu-q+(Z&mwx#9HQxe}eb7SMfJD&UOd5c(yy?MQB7;K)txPwQ) z-hsZjr$@o^-opyCcK99x%jsK*)(+p};5wpj6tH$GNzCgFu=COvGkp^*A7{|^{Lxpf?=7_Z)b}>pJ*oX>-$lQNxb(k|)~}A) ze*m^de6t^d<&H!=ul75LwagVq?nhvAW6vLhowxg1i`Mpx8=^l&{B1KQ;y=rH<39mw z$J_lJ?Ax`DK4aFP<)i->;I^FoCD{GMJNgQ2jC}O`8f=~TTl5Y1L}WB#eEjYC7Hq7` zGkk~s7h=4z?nm3W_}lYuusH(|bG}Fa2Qeq!vbJ&Fhj-&l)p4f(g7atiAGTcN>yyjh zw|Db9UH3x#f2ZGo@82~PvBw7JKhUGl`8a3n8jkGAnY^owa(H8~`J15qf2#g1k->;( z9D^Q4`8n)HyA_dBT+YZ9v$D=O#D4iYTW5UK8ILWe#2N@tE@waX+6HMw zjEy~R3oh<)J2?5My**e?v0f>sqs|1l;vRQ^laD$(g5?y?r&vpgJ?;dypRr4_eNP0t zcYV=s64-gio6|Nf{5yk-=h_8Meh$-ULGOylDK7Wp`=5&59q~NAuiX&uZxDJ9MBj4u zW{o`&>&Qp%y}-rZ)8OQz_ugPRC1x7E_koMv{vJl}eZl&o_kLjO$U7_dvp?dV^tqoI rsc!7YvslZx12aDI4#GCK!8h_|Vmssb#rGq5qRgT=UkVqlT@{Is;<)Dj?f wbH9i(w=zPDGEhtdh&`ajfV6<{9 delta 30 mcmeyfj%m(1rVSHBHXjgC;@-@q5W~aBGdW#Hck>*b2U!5S-wSI1 diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index d86ff9bde3010d811c803dd9f2ec5c2d8fbbd7c8..54814b7315f5608784dc15dfe2a1feb724b9332d 100644 GIT binary patch delta 121 zcmZ2*fvMpl(}pKP`fLoW3l>7+54eKP{~|wFJoB w{8&hyOBta>87QU!#2!#%Kw3azI-CE>ykukKoBUi^myHc5z{apyNQJKj0L{4;EdT%j delta 30 mcmZp;$h6=B(}pKPn?;1>xHd=0K4D|znQX73ySYS#rv(7E8VZd7 diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index cb41a4b..52ea6e4 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -38,6 +38,12 @@ struct Config { Alloc pathseg_alloc; Alloc anno_alloc; Alloc trans_alloc; + // new element pipeline stuff follows + + // Number of transforms in scene + uint n_trans; + // Offset (in bytes) of transform stream in scene buffer + uint trans_offset; }; // Fill modes. diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv index 2d7363d729fb4f83c0d7040a9acc673ecdfe5662..b123f1890fa44c5595ef70e8cd5ffb1d2900ced1 100644 GIT binary patch delta 119 zcmX?7yrg7<4VOL}11p0bBLf2$0|P@|d`VGaUNHk313OrZ8z=@AiO)|HsjM69fPN delta 28 kcmZ2da-?{J4cF!zu5-+rllWv<7= (1u << i)) { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)]; + agg = combine_monoid(other, agg); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + + barrier(); + Monoid row = monoid_identity(); + if (gl_WorkGroupID.x > 0) { + row = parent[gl_WorkGroupID.x - 1]; + } + if (gl_LocalInvocationID.x > 0) { + row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]); + } + for (uint i = 0; i < N_ROWS; i++) { + Monoid m = combine_monoid(row, local[i]); + TransformSeg transform = TransformSeg(m.mat, m.translate); + TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (ix + i) * TransformSeg_size); + TransformSeg_write(conf.trans_alloc, trans_ref, transform); + } +} diff --git a/piet-gpu/shader/transform_reduce.comp b/piet-gpu/shader/transform_reduce.comp new file mode 100644 index 0000000..4b72b11 --- /dev/null +++ b/piet-gpu/shader/transform_reduce.comp @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The reduction phase for transform scan implemented as a tree reduction. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define N_ROWS 8 +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(binding = 2) readonly buffer SceneBuf { + uint[] scene; +}; + +#include "scene.h" + +#define Monoid Transform + +layout(set = 0, binding = 3) buffer OutBuf { + Monoid[] outbuf; +}; + +Monoid monoid_identity() { + return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0)); +} + +Monoid combine_monoid(Monoid a, Monoid b) { + Monoid c; + c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww; + c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate; + return c; +} + +shared Monoid sh_scratch[WG_SIZE]; + +void main() { + uint ix = gl_GlobalInvocationID.x * N_ROWS; + TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size); + + Monoid agg = Transform_read(ref); + for (uint i = 1; i < N_ROWS; i++) { + agg = combine_monoid(agg, Transform_read(Transform_index(ref, i))); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + // We could make this predicate tighter, but would it help? + if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) { + Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)]; + agg = combine_monoid(agg, other); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0) { + outbuf[gl_WorkGroupID.x] = agg; + } +} diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp new file mode 100644 index 0000000..e8e0019 --- /dev/null +++ b/piet-gpu/shader/transform_scan.comp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// A scan for a tree reduction prefix scan (either root or not, by ifdef). + +#version 450 + +#define N_ROWS 8 +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +// This is copy-pasted from scene.h. It might be better for DRY +// to include it, but that pulls in more stuff we don't need. +struct Transform { + vec4 mat; + vec2 translate; +}; + +#define Monoid Transform + +layout(binding = 0) buffer DataBuf { + Monoid[] data; +}; + +#ifndef ROOT +layout(binding = 1) readonly buffer ParentBuf { + Monoid[] parent; +}; +#endif + +Monoid monoid_identity() { + return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0)); +} + +Monoid combine_monoid(Monoid a, Monoid b) { + Monoid c; + c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww; + c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate; + return c; +} + +shared Monoid sh_scratch[WG_SIZE]; + +void main() { + Monoid local[N_ROWS]; + + uint ix = gl_GlobalInvocationID.x * N_ROWS; + + // TODO: gate buffer read + local[0] = data[ix]; + for (uint i = 1; i < N_ROWS; i++) { + local[i] = combine_monoid(local[i - 1], data[ix + i]); + } + Monoid agg = local[N_ROWS - 1]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1u << i)) { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)]; + agg = combine_monoid(other, agg); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + + barrier(); + // This could be a semigroup instead of a monoid if we reworked the + // conditional logic, but that might impact performance. + Monoid row = monoid_identity(); +#ifdef ROOT + if (gl_LocalInvocationID.x > 0) { + row = sh_scratch[gl_LocalInvocationID.x - 1]; + } +#else + if (gl_WorkGroupID.x > 0) { + row = parent[gl_WorkGroupID.x - 1]; + } + if (gl_LocalInvocationID.x > 0) { + row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]); + } +#endif + for (uint i = 0; i < N_ROWS; i++) { + Monoid m = combine_monoid(row, local[i]); + // TODO: gate buffer write + data[ix + i] = m; + } +} diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index bee07aa..8d21fe6 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -1,6 +1,7 @@ mod gradient; mod pico_svg; mod render_ctx; +pub mod stages; pub mod test_scenes; mod text; @@ -20,6 +21,8 @@ use piet_gpu_hal::{ use pico_svg::PicoSvg; +use crate::stages::Config; + const TILE_W: usize = 16; const TILE_H: usize = 16; @@ -123,7 +126,7 @@ impl Renderer { let image_dev = session.create_image2d(width as u32, height as u32)?; // Note: this must be updated when the config struct size changes. - const CONFIG_BUFFER_SIZE: u64 = 40; + const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::() as u64; let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap(); // TODO: separate staging buffer (if needed) let config_bufs = (0..n_bufs) @@ -295,25 +298,28 @@ impl Renderer { alloc += (n_paths * ANNO_SIZE + 3) & !3; let trans_base = alloc; alloc += (n_trans * TRANS_SIZE + 3) & !3; - let config = &[ - n_paths as u32, - n_pathseg as u32, - width_in_tiles as u32, - height_in_tiles as u32, - tile_base as u32, - bin_base as u32, - ptcl_base as u32, - pathseg_base as u32, - anno_base as u32, - trans_base as u32, - ]; + let trans_offset = 0; // For new element pipeline, not yet used + let config = Config { + n_elements: n_paths as u32, + n_pathseg: n_pathseg as u32, + width_in_tiles: width_in_tiles as u32, + height_in_tiles: height_in_tiles as u32, + tile_alloc: tile_base as u32, + bin_alloc: bin_base as u32, + ptcl_alloc: ptcl_base as u32, + pathseg_alloc: pathseg_base as u32, + anno_alloc: anno_base as u32, + trans_alloc: trans_base as u32, + n_trans: n_trans as u32, + trans_offset: trans_offset as u32, + }; unsafe { let scene = render_ctx.get_scene_buf(); self.n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); // TODO: reallocate scene buffer if size is inadequate assert!(self.scene_bufs[buf_ix].size() as usize >= scene.len()); self.scene_bufs[buf_ix].write(scene)?; - self.config_bufs[buf_ix].write(config)?; + self.config_bufs[buf_ix].write(&[config])?; self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?; // Upload gradient data. diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs new file mode 100644 index 0000000..0613585 --- /dev/null +++ b/piet-gpu/src/stages.rs @@ -0,0 +1,209 @@ +// Copyright 2021 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! Stages for new element pipeline, exposed for testing. + +use bytemuck::{Pod, Zeroable}; + +use piet::kurbo::Affine; +use piet_gpu_hal::{ + include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, +}; + +/// The configuration block passed to piet-gpu shaders. +/// +/// Note: this should be kept in sync with the version in setup.h. +#[repr(C)] +#[derive(Clone, Copy, Default, Zeroable, Pod)] +pub struct Config { + pub n_elements: u32, // paths + pub n_pathseg: u32, + pub width_in_tiles: u32, + pub height_in_tiles: u32, + pub tile_alloc: u32, + pub bin_alloc: u32, + pub ptcl_alloc: u32, + pub pathseg_alloc: u32, + pub anno_alloc: u32, + pub trans_alloc: u32, + pub n_trans: u32, + pub trans_offset: u32, +} + +// The individual stages will probably be separate files but for now, all in one. + +// This is equivalent to the version in piet-gpu-types, but the bytemuck +// representation will likely be faster. +#[repr(C)] +#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] +pub struct Transform { + pub mat: [f32; 4], + pub translate: [f32; 2], +} + +const TRANSFORM_WG: u64 = 512; +const TRANSFORM_N_ROWS: u64 = 8; +const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; + +pub struct TransformCode { + reduce_pipeline: Pipeline, + root_pipeline: Pipeline, + leaf_pipeline: Pipeline, +} + +pub struct TransformStage { + // Right now we're limited to partition^2 (~16M) elements. This can be + // expanded but is tedious. + root_buf: Buffer, + root_ds: DescriptorSet, +} + +pub struct TransformBinding { + reduce_ds: DescriptorSet, + leaf_ds: DescriptorSet, +} + +impl TransformCode { + pub unsafe fn new(session: &Session) -> TransformCode { + let reduce_code = include_shader!(session, "../shader/gen/transform_reduce"); + let reduce_pipeline = session + .create_compute_pipeline( + reduce_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Buffer, + ], + ) + .unwrap(); + let root_code = include_shader!(session, "../shader/gen/transform_root"); + let root_pipeline = session + .create_compute_pipeline(root_code, &[BindType::Buffer]) + .unwrap(); + let leaf_code = include_shader!(session, "../shader/gen/transform_leaf"); + let leaf_pipeline = session + .create_compute_pipeline( + leaf_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + ], + ) + .unwrap(); + TransformCode { + reduce_pipeline, + root_pipeline, + leaf_pipeline, + } + } +} + +impl TransformStage { + pub unsafe fn new(session: &Session, code: &TransformCode) -> TransformStage { + // We're limited to TRANSFORM_PART_SIZE^2 + // Also note: size here allows padding + let root_buf_size = TRANSFORM_PART_SIZE * 32; + let root_buf = session + .create_buffer(root_buf_size, BufferUsage::STORAGE) + .unwrap(); + let root_ds = session + .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf]) + .unwrap(); + TransformStage { root_buf, root_ds } + } + + pub unsafe fn bind( + &self, + session: &Session, + code: &TransformCode, + config_buf: &Buffer, + scene_buf: &Buffer, + memory_buf: &Buffer, + ) -> TransformBinding { + let reduce_ds = session + .create_simple_descriptor_set( + &code.reduce_pipeline, + &[memory_buf, config_buf, scene_buf, &self.root_buf], + ) + .unwrap(); + let leaf_ds = session + .create_simple_descriptor_set( + &code.leaf_pipeline, + &[memory_buf, config_buf, scene_buf, &self.root_buf], + ) + .unwrap(); + TransformBinding { reduce_ds, leaf_ds } + } + + pub unsafe fn record( + &self, + cmd_buf: &mut CmdBuf, + code: &TransformCode, + binding: &TransformBinding, + size: u64, + ) { + if size > TRANSFORM_PART_SIZE.pow(2) { + panic!("very large scan not yet implemented"); + } + let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE; + if n_workgroups > 1 { + cmd_buf.dispatch( + &code.reduce_pipeline, + &binding.reduce_ds, + (n_workgroups as u32, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &code.root_pipeline, + &self.root_ds, + (1, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + cmd_buf.memory_barrier(); + } + cmd_buf.dispatch( + &code.leaf_pipeline, + &binding.leaf_ds, + (n_workgroups as u32, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + } +} + +impl Transform { + pub fn from_kurbo(a: Affine) -> Transform { + let c = a.as_coeffs(); + Transform { + mat: [c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32], + translate: [c[4] as f32, c[5] as f32], + } + } + + pub fn to_kurbo(self) -> Affine { + Affine::new([ + self.mat[0] as f64, + self.mat[1] as f64, + self.mat[2] as f64, + self.mat[3] as f64, + self.translate[0] as f64, + self.translate[1] as f64, + ]) + } +} diff --git a/tests/Cargo.toml b/tests/Cargo.toml index a987c9e..1f0760a 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -6,9 +6,18 @@ description = "Tests for piet-gpu shaders and generic GPU capabilities." license = "MIT/Apache-2.0" edition = "2021" +[features] +default = ["piet-gpu"] + [dependencies] clap = "2.33" bytemuck = "1.7.2" +kurbo = "0.7.1" +rand = "0.7.3" [dependencies.piet-gpu-hal] path = "../piet-gpu-hal" + +[dependencies.piet-gpu] +path = "../piet-gpu" +optional = true diff --git a/tests/src/clear.rs b/tests/src/clear.rs index 7d8bee0..009360b 100644 --- a/tests/src/clear.rs +++ b/tests/src/clear.rs @@ -55,12 +55,12 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); out_buf.read(&mut dst); if let Some(failure) = verify(&dst) { diff --git a/tests/src/config.rs b/tests/src/config.rs index edc1140..2593ed9 100644 --- a/tests/src/config.rs +++ b/tests/src/config.rs @@ -22,6 +22,7 @@ pub struct Config { pub groups: Groups, pub size: Size, pub n_iter: u64, + pub verify_all: bool, } pub struct Groups(String); @@ -40,10 +41,12 @@ impl Config { .value_of("n_iter") .and_then(|s| s.parse().ok()) .unwrap_or(1000); + let verify_all = matches.is_present("verify_all"); Config { groups, size, n_iter, + verify_all, } } } diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs index b3d03ed..3102a73 100644 --- a/tests/src/linkedlist.rs +++ b/tests/src/linkedlist.rs @@ -48,12 +48,12 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR commands.write_timestamp(0); stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&mem_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); mem_buf.read(&mut dst); if !verify(&dst) { diff --git a/tests/src/main.rs b/tests/src/main.rs index dd6f4bd..0ab9340 100644 --- a/tests/src/main.rs +++ b/tests/src/main.rs @@ -25,6 +25,9 @@ mod prefix_tree; mod runner; mod test_result; +#[cfg(feature = "piet-gpu")] +mod transform; + use clap::{App, Arg}; use piet_gpu_hal::InstanceFlags; @@ -62,6 +65,11 @@ fn main() { .help("Number of iterations") .takes_value(true), ) + .arg( + Arg::with_name("verify_all") + .long("verify_all") + .help("Verify all iterations"), + ) .arg( Arg::with_name("dx12") .long("dx12") @@ -123,5 +131,9 @@ fn main() { } report(&linkedlist::run_linkedlist_test(&mut runner, &config)); } + #[cfg(feature = "piet-gpu")] + if config.groups.matches("piet") { + report(&transform::transform_test(&mut runner, &config)); + } } } diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs index 71be865..00a69ca 100644 --- a/tests/src/prefix.rs +++ b/tests/src/prefix.rs @@ -85,12 +85,12 @@ pub unsafe fn run_prefix_test( commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); out_buf.read(&mut dst); if let Some(failure) = verify(&dst) { diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs index 9603385..5957e88 100644 --- a/tests/src/prefix_tree.rs +++ b/tests/src/prefix_tree.rs @@ -66,12 +66,12 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); out_buf.read(&mut dst); if let Some(failure) = verify(&dst) { diff --git a/tests/src/transform.rs b/tests/src/transform.rs new file mode 100644 index 0000000..8cc485b --- /dev/null +++ b/tests/src/transform.rs @@ -0,0 +1,133 @@ +// Copyright 2021 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! Tests for piet-gpu shaders. + +use crate::{Config, Runner, TestResult}; + +use kurbo::Affine; +use piet_gpu::stages::{self, Transform, TransformCode, TransformStage}; +use piet_gpu_hal::BufferUsage; +use rand::Rng; + +struct AffineTestData { + input_data: Vec, + expected: Vec, +} + +pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult { + let mut result = TestResult::new("transform"); + let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24); + // TODO: would be nice to validate with real transform. + let data = AffineTestData::new(n_elements as usize); + let data_buf = runner + .session + .create_buffer_init(&data.input_data, BufferUsage::STORAGE) + .unwrap(); + let memory = runner.buf_down(data_buf.size() + 24, BufferUsage::empty()); + let stage_config = stages::Config { + n_trans: n_elements as u32, + // This is a hack to get elements aligned. + trans_alloc: 16, + ..Default::default() + }; + let config_buf = runner + .session + .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE) + .unwrap(); + + let code = TransformCode::new(&runner.session); + let stage = TransformStage::new(&runner.session, &code); + let binding = stage.bind( + &runner.session, + &code, + &config_buf, + &data_buf, + &memory.dev_buf, + ); + let mut total_elapsed = 0.0; + let n_iter = config.n_iter; + for i in 0..n_iter { + let mut commands = runner.commands(); + commands.write_timestamp(0); + stage.record(&mut commands.cmd_buf, &code, &binding, n_elements); + commands.write_timestamp(1); + if i == 0 || config.verify_all { + commands.cmd_buf.memory_barrier(); + commands.download(&memory); + } + total_elapsed += runner.submit(commands); + if i == 0 || config.verify_all { + let mut dst: Vec = Default::default(); + memory.read(&mut dst); + if let Some(failure) = data.verify(&dst[1..]) { + result.fail(failure); + } + } + } + result.timing(total_elapsed, n_elements * n_iter); + result +} + +impl AffineTestData { + fn new(n: usize) -> AffineTestData { + let mut rng = rand::thread_rng(); + let mut a = Affine::default(); + let mut b; + let mut input_data = Vec::with_capacity(n); + let mut expected = Vec::with_capacity(n); + for _ in 0..n { + loop { + b = Affine::new([ + rng.gen_range(-10.0, 10.0), + rng.gen_range(-10.0, 10.0), + rng.gen_range(-10.0, 10.0), + rng.gen_range(-10.0, 10.0), + rng.gen_range(-10.0, 10.0), + rng.gen_range(-10.0, 10.0), + ]); + if b.determinant() >= 1.0 { + break; + } + } + expected.push(b); + let c = a.inverse() * b; + input_data.push(Transform::from_kurbo(c)); + a = b; + } + AffineTestData { + input_data, + expected, + } + } + + fn verify(&self, actual: &[Transform]) -> Option { + for (i, (actual, expected)) in actual.iter().zip(&self.expected).enumerate() { + let error: f64 = actual + .to_kurbo() + .as_coeffs() + .iter() + .zip(expected.as_coeffs()) + .map(|(actual, expected)| (actual - expected).powi(2)) + .sum(); + let tolerance = 1e-6 * (i + 1) as f64; + if error > tolerance { + return Some(format!("{}: {} {}", i, error, tolerance)); + } + } + None + } +} From 8f7ed161a660f860ad8988388394b39039e58464 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 24 Nov 2021 09:10:30 -0800 Subject: [PATCH 2/2] Tune transform test parameters Previous threshold was seeing occasional failures, and also fairly wide variance in the error. This seems to be reliable, but hasn't been validated extremely rigorously. --- tests/src/transform.rs | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/src/transform.rs b/tests/src/transform.rs index 8cc485b..d696b10 100644 --- a/tests/src/transform.rs +++ b/tests/src/transform.rs @@ -31,7 +31,7 @@ struct AffineTestData { pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("transform"); let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24); - // TODO: would be nice to validate with real transform. + // Validate with real transform data. let data = AffineTestData::new(n_elements as usize); let data_buf = runner .session @@ -86,27 +86,26 @@ impl AffineTestData { fn new(n: usize) -> AffineTestData { let mut rng = rand::thread_rng(); let mut a = Affine::default(); - let mut b; let mut input_data = Vec::with_capacity(n); let mut expected = Vec::with_capacity(n); for _ in 0..n { loop { - b = Affine::new([ - rng.gen_range(-10.0, 10.0), - rng.gen_range(-10.0, 10.0), - rng.gen_range(-10.0, 10.0), - rng.gen_range(-10.0, 10.0), - rng.gen_range(-10.0, 10.0), - rng.gen_range(-10.0, 10.0), + let b = Affine::new([ + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), ]); - if b.determinant() >= 1.0 { + if b.determinant().abs() >= 1.0 { + expected.push(b); + let c = a.inverse() * b; + input_data.push(Transform::from_kurbo(c)); + a = b; break; } } - expected.push(b); - let c = a.inverse() * b; - input_data.push(Transform::from_kurbo(c)); - a = b; } AffineTestData { input_data, @@ -123,7 +122,9 @@ impl AffineTestData { .zip(expected.as_coeffs()) .map(|(actual, expected)| (actual - expected).powi(2)) .sum(); - let tolerance = 1e-6 * (i + 1) as f64; + // Hopefully this is right; most of the time the error is much + // smaller, but occasionally we see outliers. + let tolerance = 1e-9 * (i + 1) as f64; if error > tolerance { return Some(format!("{}: {} {}", i, error, tolerance)); }