From d948126c1662ad71448e8cca14c6df21328c48cb Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 8 Dec 2021 10:42:35 -0800 Subject: [PATCH] Adjust workgroup sizes Make max workgroup size 256 and respect LG_WG_FACTOR. Because the monoid scans only support a height of 2, this will reduce the maximum scene complexity we can render. But it also increases compatibility. Supporting larger scans is a TODO. --- piet-gpu/shader/binning.comp | 2 +- piet-gpu/shader/build.ninja | 6 +-- piet-gpu/shader/draw_leaf.comp | 2 +- piet-gpu/shader/draw_reduce.comp | 2 +- piet-gpu/shader/draw_scan.comp | 3 +- piet-gpu/shader/gen/binning.hlsl | 38 +++++++-------- piet-gpu/shader/gen/binning.msl | 16 +++---- piet-gpu/shader/gen/binning.spv | Bin 16416 -> 16400 bytes piet-gpu/shader/gen/draw_leaf.dxil | Bin 6880 -> 6880 bytes piet-gpu/shader/gen/draw_leaf.hlsl | 54 +++++++++++----------- piet-gpu/shader/gen/draw_leaf.msl | 32 ++++++------- piet-gpu/shader/gen/draw_leaf.spv | Bin 38560 -> 38560 bytes piet-gpu/shader/gen/draw_reduce.dxil | Bin 3940 -> 3940 bytes piet-gpu/shader/gen/draw_reduce.hlsl | 20 ++++---- piet-gpu/shader/gen/draw_reduce.msl | 14 +++--- piet-gpu/shader/gen/draw_reduce.spv | Bin 6872 -> 6856 bytes piet-gpu/shader/gen/draw_root.dxil | Bin 3944 -> 3944 bytes piet-gpu/shader/gen/draw_root.hlsl | 14 +++--- piet-gpu/shader/gen/draw_root.msl | 12 ++--- piet-gpu/shader/gen/draw_root.spv | Bin 4684 -> 4668 bytes piet-gpu/shader/gen/pathseg.dxil | Bin 9596 -> 9592 bytes piet-gpu/shader/gen/pathseg.hlsl | 48 +++++++++---------- piet-gpu/shader/gen/pathseg.msl | 38 +++++++-------- piet-gpu/shader/gen/pathseg.spv | Bin 34748 -> 34732 bytes piet-gpu/shader/gen/pathtag_reduce.dxil | Bin 4700 -> 4644 bytes piet-gpu/shader/gen/pathtag_reduce.hlsl | 18 ++++---- piet-gpu/shader/gen/pathtag_reduce.msl | 16 +++---- piet-gpu/shader/gen/pathtag_reduce.spv | Bin 7836 -> 7820 bytes piet-gpu/shader/gen/pathtag_root.dxil | Bin 4716 -> 4716 bytes piet-gpu/shader/gen/pathtag_root.hlsl | 20 ++++---- piet-gpu/shader/gen/pathtag_root.msl | 18 ++++---- piet-gpu/shader/gen/pathtag_root.spv | Bin 5852 -> 5836 bytes piet-gpu/shader/gen/tile_alloc.hlsl | 32 ++++++------- piet-gpu/shader/gen/tile_alloc.msl | 22 ++++----- piet-gpu/shader/gen/tile_alloc.spv | Bin 15176 -> 15160 bytes piet-gpu/shader/gen/transform_leaf.dxil | Bin 5664 -> 5664 bytes piet-gpu/shader/gen/transform_leaf.hlsl | 34 +++++++------- piet-gpu/shader/gen/transform_leaf.msl | 14 +++--- piet-gpu/shader/gen/transform_leaf.spv | Bin 12524 -> 12508 bytes piet-gpu/shader/gen/transform_reduce.dxil | Bin 4696 -> 4696 bytes piet-gpu/shader/gen/transform_reduce.hlsl | 18 ++++---- piet-gpu/shader/gen/transform_reduce.msl | 14 +++--- piet-gpu/shader/gen/transform_reduce.spv | Bin 7876 -> 7860 bytes piet-gpu/shader/gen/transform_root.dxil | Bin 4824 -> 4824 bytes piet-gpu/shader/gen/transform_root.hlsl | 14 +++--- piet-gpu/shader/gen/transform_root.msl | 12 ++--- piet-gpu/shader/gen/transform_root.spv | Bin 5280 -> 5336 bytes piet-gpu/shader/pathseg.comp | 2 +- piet-gpu/shader/pathtag_reduce.comp | 4 +- piet-gpu/shader/pathtag_scan.comp | 3 +- piet-gpu/shader/setup.h | 2 + piet-gpu/shader/tile_alloc.comp | 2 +- piet-gpu/shader/transform_leaf.comp | 2 +- piet-gpu/shader/transform_reduce.comp | 2 +- piet-gpu/shader/transform_scan.comp | 5 +- piet-gpu/src/encoder.rs | 22 ++++----- piet-gpu/src/render_ctx.rs | 21 +-------- piet-gpu/src/stages.rs | 8 ++-- piet-gpu/src/stages/draw.rs | 4 +- piet-gpu/src/stages/path.rs | 11 +++-- piet-gpu/src/stages/transform.rs | 4 +- piet-gpu/src/text.rs | 3 +- tests/src/draw.rs | 3 +- tests/src/path.rs | 3 +- tests/src/transform.rs | 3 +- 65 files changed, 313 insertions(+), 324 deletions(-) diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index 313310e..c2b81fd 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -84,7 +84,7 @@ void main() { if (x0 == x1) y1 = y0; int x = x0, y = y0; uint my_slice = gl_LocalInvocationID.x / 32; - uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + uint my_mask = 1u << (gl_LocalInvocationID.x & 31); while (y < y1) { atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask); x++; diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 6ed2140..6a59f59 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -65,7 +65,7 @@ build gen/transform_reduce.hlsl: hlsl gen/transform_reduce.spv build gen/transform_reduce.dxil: dxil gen/transform_reduce.hlsl build gen/transform_reduce.msl: msl gen/transform_reduce.spv -build gen/transform_root.spv: glsl transform_scan.comp +build gen/transform_root.spv: glsl transform_scan.comp | setup.h flags = -DROOT build gen/transform_root.hlsl: hlsl gen/transform_root.spv build gen/transform_root.dxil: dxil gen/transform_root.hlsl @@ -81,7 +81,7 @@ build gen/pathtag_reduce.hlsl: hlsl gen/pathtag_reduce.spv build gen/pathtag_reduce.dxil: dxil gen/pathtag_reduce.hlsl build gen/pathtag_reduce.msl: msl gen/pathtag_reduce.spv -build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h +build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h setup.h flags = -DROOT build gen/pathtag_root.hlsl: hlsl gen/pathtag_root.spv build gen/pathtag_root.dxil: dxil gen/pathtag_root.hlsl @@ -102,7 +102,7 @@ build gen/draw_reduce.hlsl: hlsl gen/draw_reduce.spv build gen/draw_reduce.dxil: dxil gen/draw_reduce.hlsl build gen/draw_reduce.msl: msl gen/draw_reduce.spv -build gen/draw_root.spv: glsl draw_scan.comp | drawtag.h +build gen/draw_root.spv: glsl draw_scan.comp | drawtag.h setup.h flags = -DROOT build gen/draw_root.hlsl: hlsl gen/draw_root.spv build gen/draw_root.dxil: dxil gen/draw_root.hlsl diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp index 85d9528..5de2652 100644 --- a/piet-gpu/shader/draw_leaf.comp +++ b/piet-gpu/shader/draw_leaf.comp @@ -11,7 +11,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/draw_reduce.comp b/piet-gpu/shader/draw_reduce.comp index fe9ab2c..68d43e9 100644 --- a/piet-gpu/shader/draw_reduce.comp +++ b/piet-gpu/shader/draw_reduce.comp @@ -9,7 +9,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/draw_scan.comp b/piet-gpu/shader/draw_scan.comp index d883671..2afc9ba 100644 --- a/piet-gpu/shader/draw_scan.comp +++ b/piet-gpu/shader/draw_scan.comp @@ -5,10 +5,11 @@ #version 450 #extension GL_GOOGLE_include_directive : enable +#include "setup.h" #include "drawtag.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl index 2b0901e..b13db37 100644 --- a/piet-gpu/shader/gen/binning.hlsl +++ b/piet-gpu/shader/gen/binning.hlsl @@ -248,11 +248,11 @@ void comp_main() int x = x0; int y = y0; uint my_slice = gl_LocalInvocationID.x / 32u; - uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _438; - InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _438); + uint _437; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _437); x++; if (x == x1) { @@ -274,8 +274,8 @@ void comp_main() if (element_count != 0u) { uint param_7 = element_count * 4u; - MallocResult _488 = malloc(param_7); - MallocResult chunk = _488; + MallocResult _487 = malloc(param_7); + MallocResult chunk = _487; chunk_alloc = chunk.alloc; sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; if (chunk.failed) @@ -284,31 +284,31 @@ void comp_main() } } uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); - Alloc _517; - _517.offset = _253.Load(20); + Alloc _516; + _516.offset = _253.Load(20); Alloc param_8; - param_8.offset = _517.offset; + param_8.offset = _516.offset; uint param_9 = out_ix; uint param_10 = element_count; write_mem(param_8, param_9, param_10); - Alloc _529; - _529.offset = _253.Load(20); + Alloc _528; + _528.offset = _253.Load(20); Alloc param_11; - param_11.offset = _529.offset; + param_11.offset = _528.offset; uint param_12 = out_ix + 1u; uint param_13 = chunk_alloc.offset; write_mem(param_11, param_12, param_13); GroupMemoryBarrierWithGroupSync(); - bool _544; + bool _543; if (!sh_alloc_failed) { - _544 = _84.Load(4) != 0u; + _543 = _84.Load(4) != 0u; } else { - _544 = sh_alloc_failed; + _543 = sh_alloc_failed; } - if (_544) + if (_543) { return; } @@ -327,11 +327,11 @@ void comp_main() } Alloc out_alloc = sh_chunk_alloc[bin_ix]; uint out_offset = out_alloc.offset + (idx * 4u); - BinInstanceRef _606 = { out_offset }; - BinInstance _608 = { element_ix }; + BinInstanceRef _605 = { out_offset }; + BinInstance _607 = { element_ix }; Alloc param_14 = out_alloc; - BinInstanceRef param_15 = _606; - BinInstance param_16 = _608; + BinInstanceRef param_15 = _605; + BinInstance param_16 = _607; BinInstance_write(param_14, param_15, param_16); } x++; diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl index f6e0505..42a11ee 100644 --- a/piet-gpu/shader/gen/binning.msl +++ b/piet-gpu/shader/gen/binning.msl @@ -260,10 +260,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M int x = x0; int y = y0; uint my_slice = gl_LocalInvocationID.x / 32u; - uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _438 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + uint _437 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); x++; if (x == x1) { @@ -285,8 +285,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if (element_count != 0u) { uint param_7 = element_count * 4u; - MallocResult _488 = malloc(param_7, v_84, v_84BufferSize); - MallocResult chunk = _488; + MallocResult _487 = malloc(param_7, v_84, v_84BufferSize); + MallocResult chunk = _487; chunk_alloc = chunk.alloc; sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; if (chunk.failed) @@ -306,16 +306,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint param_13 = chunk_alloc.offset; write_mem(param_11, param_12, param_13, v_84, v_84BufferSize); threadgroup_barrier(mem_flags::mem_threadgroup); - bool _544; + bool _543; if (!bool(sh_alloc_failed)) { - _544 = v_84.mem_error != 0u; + _543 = v_84.mem_error != 0u; } else { - _544 = bool(sh_alloc_failed); + _543 = bool(sh_alloc_failed); } - if (_544) + if (_543) { return; } diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv index 5ec7aec1bbb810f495c2e8ea6b4b0f14dd863c98..17043bc0bc5b911798d9a4c360ac6de0b53c913c 100644 GIT binary patch delta 4554 zcmZvfXK+?U6o&Upp#&nt1W=;_B3Qs)Q9)v%S+Qcl7%)f#5=UXiA3B#&|MYYZQcBgSGChvoR>D>3b?|y$b$VYuZ*lwl z=9UEuI_3~xl|Dc_v*oOog((%kA2534=Mj_V_-ehM_&#Z6v1#D28LLn`7B6aUJI7J0 zQbk2?mJ;u~0q?&7-)I9~U%Y(a@U|ULXScQIS*z0?a7S6#a`t?tL*~jf39aq4bHLT< zsNCPP8vWJjxLoU)JNN8XisI}}Ma$P%;00cX+^(YV*TZk1`RKnf_iHEMpN4)D_|h_dGk6-Ulg;Xf{}!+@=vn%cP72r2 zOzvP}lR3Ft!I$OSpgAG^G2lI5EA|V;cfe&}W3A{3H2pE~y^vGh9c zqeX}N!D_*mgQwDN$xHbF*ckQI^nVEJV^#G%NK><_;>hzbcrvXm22}{>BVbD$vwRe+ z7L(Mc=Ke==t=zNWVA}HYbA9pR(Zez?P;26-UYcT~p2&M?J!MZ}pQicfeefEN=r1ot2i{J*D(fVTO zm%#d7Vi>paWic(z?R?&)D32o$f7wkwewfDekvDf<4=%4j&lO;8|-?7-v56kdJ;Pa@< zcwPx_^^f6$u*Msw-~L|D`eHJlfL+k7`IdePR&$KiDXUD~5Fhtf=VvrES0#=_Uw};% zm%x``wQl-4@K<29@eEXl_BFTxtFGTKf||#}tMwc3=i~S@9pWIa>RY&uScyE&(+{N$ z`w`^_TJ-n{?3#`G8T$)uCCxaq$c@q8pAmiqI~6y>9A2Kk!PR}+|G@fK4)xz@YL)|> zm6DD^a5Fps;nd2z^I&Zgo5g1#tpTsYx>P@~v1H%5wZ_w)V1n{L!D<(QqxOHn6Yav( z*c0?OL~$@J>a(s?pMOx+(czmOt{VY{O6n&hUpU*yUzwoZxD#1p?W(GL@;TruokZ7MySo8jP+lUsb9DtNFyDHir8%Bo-yFo5>$% zT90+W2p$6Vn3nJSP&9Sp8?f?N*)Z@326F&&H58YRZOk?SdG>Ci_#WI%;c6b-My!tq zSA8>@ng>_xOgxL5gO9<^qD76jKvR!PVoR`Ed`t2?St;Qk6yL#z6X4+8Ovnxn;zUMb zeVm9fBWP+)M7%B)N?6Sac-xNxPbQD0Sc;X0Z)@XBntuq%!nyTR4s)b0+ppL%9_{E4d=gX{^uj0C~=f*TWGu)VI#SapS@I z%<0c$dHD7Ls|DW|e5IG)iu{t=55XWEacCxh)qB1ym1%#lk7d+1k*4OBi6eOvxO}h! z;0%_|&&N6tt~QhLdna%Zf*p0nPlkiR>QTzcU^Q1b2`i65r+}^6Z4BaJo(fls$xjFS zIC*{3Xwx(PID1JXIs|N@nC+o(HE)@SISgLDC5OZPQDLmUh&uwTFZhvQCmv@Zn*mWY zFjl4JX?Mm>P8$i12KNly^Km=|?#EHxm>MQ|ELh!qM`7i0YaI_(%Wti$bgE|}=!ij1 z06WV#eGYUSRxNUz3|2dmNlm2REU;SmPXUi9u>S^3BA^*TF;>w6 zcBUpj6)P9d#+pn#2W!8$V^0H5CYSNX>9>+`NVVqof@%dX%_yBKQkaXN=rw9`-(j`r zaJn7Pg3p6bg~xvxZE$1My*1}!eLQgb&Y-EerDF4B4!i(TepSk2kcD6`q?lwoTrDQ4 gPt8?Ea;AOROp(r>wC5T0T%onmjD0& delta 4567 zcmZvfXK}L;@(NU;!Hz?5GIWf?&rAD#i#wBaooLj6ZZfM*ZVIXD;J7 z<2Yki>==9Rz4u-ayQru%74>;;zJpiBZ)S7e_dRFN?w;Lmzne=I6s}xQXsoT-YJ-#t zsUh`BXD@ENyw8D;7gB1U)jqqub6)GTuKAtwQcA^Co1Q@LsNlNvCiq`pF?}dMbAIQ{ z*0x!*x~7%?=rerOCunop&S{&S_gNY-Isf`U{`$16T_rtEu>l z>BL;?nlWQ;J4JDJZRJsoLyDs5-KEC4HHA6dcQ>^b$5@{_`fbYJ@4*e=e%N8~A9A1i zzByl(YxQXzxYpuVbBmwz`ecWu^7Bl^>Q{$Z{c3LYtGSgAZWgN_ zTx;cn>u;;#W8u@PcqyeCHe<@hk2Yiz@mPP6kh*>5t0Xn?W4dfn3df;CpFqho#r279H*fs|9}m zJb`u-{jKDKU}Mx*(f?trk5$$85KYahinB=c2xL4h29<~JQLrVBNj?Tvi%IHJbN?f` zR_@twFm3huace(Ib>$0(H)r0aR>rZtLB+;C3AV4+v-ULh8Jds&XLG;SxmxY#z~;Ck zFUa#?wGC+&LH-oYT;>K3Uf1~rL_3%0T!($JeASlyvzJ5_wP)jB2Di|R))zy+0@n93 z!?>5Pg4NZR_5oNec3htt{j+u3WJyi_5jM8-<0|Y;dp>m;&5Pk> z|0%o)YrJv#?eB+DUrgpRunW35-_p;)YL2lgWwq%Gh>!cLa|un&Rf!|fmtYgcRqz#9 zZ3%rH_-nA*Xa;IV`v%;CRoA~1tLE|Wj{O$AbR>Tcg*b?-`VOumRw9q{^kZeienR21!LelDx!>(adx8nd{{X994366W2~V_((vY5@wGhRA zw5ZQtmHPaR@)s=*A2UiutH;(A(9{=WBCZDR(E6;KUurd9g#EyE;OZqfwKeoFqU)Q~ zGyN>pLlk|Sn%~t8;C|{|x5?$k#rk@K9Z;V?0p#K91NPkM^XGuvmjILc@@E*<-1ckW zg<99K5y7Of!VTeSeoA6h{owz%s%CV3`Kr=JU^SmuRDXCAEfyu;Xfl7CX$#f?BX}U# zV_LoQgV5BC_ZN*kR<<#C7=t;0xdxOM?Jy+U1mxMfk>Y!B2gB7oxSL{qJh*GX>*@C9# zM8s=3$znAp;BCJ(cszM5#X_t+e8a(qRD9f4mjV=g(?(!Tr1cB8Ep|JakN)j*zc!NX z05(VPk?@Ideoa*twIkdZbr&GthGy>78A&D4d1tW6;v2OKTur~P@+fd8{&??pg&U(D zId%iP;>fW(TrE!R9$@>aXO_p`x{5K#Uf|0}5PWaAG4TZ(4Q|12oW6+L2dvMW{$7@c zZ(p!l@cqD7c=-=Va_ zGX6MwNhCTPY@(R$5pXqcnTRG1=@VA0T6xe?Q#u0EDf?}-Vbg(luc^g(P zo`yA&Tr#U#7nYB5QDYOXSp bYvtQv9ZV}9k@Ib_UfFx6Lib*~zTfx{9KRi3 diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil index 86b37e940da57e098366edaadf2139a3bc786b49..17bfd04821a365b1d94408450ff3b37ff214016d 100644 GIT binary patch delta 125 zcmV-@0D}MEHQ+TAL|8&YR7Ec4m9uUObm<;+6)hUxkrasm0kNKS0Ro@^vlRl@0SAzv z00RmTq?7XlloS}62?7vpNCxF0prIiE7GE(iGdQ!G1mOb^PRHZaEQZ301N{Zz-U>rs flZ*)W0-yl1g$d&U0+4{SY73qP0)Q5?3K7-`0z)kk delta 125 zcmV-@0D}MEHQ+TAL|8&Y4eJ2Z1yB{%;uDeu`44KTkrasm0Bls3K7-`uM;b9 diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl index e3cb387..d0bef52 100644 --- a/piet-gpu/shader/gen/draw_leaf.hlsl +++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -151,7 +151,7 @@ struct Config uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const DrawMonoid _418 = { 0u, 0u }; static const DrawMonoid _442 = { 1u, 0u }; @@ -159,8 +159,8 @@ static const DrawMonoid _444 = { 1u, 1u }; RWByteAddressBuffer _201 : register(u0, space0); ByteAddressBuffer _225 : register(t2, space0); -ByteAddressBuffer _1005 : register(t3, space0); -ByteAddressBuffer _1039 : register(t1, space0); +ByteAddressBuffer _1004 : register(t3, space0); +ByteAddressBuffer _1038 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -172,7 +172,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared DrawMonoid sh_scratch[512]; +groupshared DrawMonoid sh_scratch[256]; ElementTag Element_tag(ElementRef ref) { @@ -558,7 +558,7 @@ void comp_main() local[i] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -575,11 +575,11 @@ void comp_main() DrawMonoid row = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - DrawMonoid _1011; - _1011.path_ix = _1005.Load((gl_WorkGroupID.x - 1u) * 8 + 0); - _1011.clip_ix = _1005.Load((gl_WorkGroupID.x - 1u) * 8 + 4); - row.path_ix = _1011.path_ix; - row.clip_ix = _1011.clip_ix; + DrawMonoid _1010; + _1010.path_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 0); + _1010.clip_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 4); + row.path_ix = _1010.path_ix; + row.clip_ix = _1010.clip_ix; } if (gl_LocalInvocationID.x > 0u) { @@ -588,9 +588,9 @@ void comp_main() row = combine_tag_monoid(param_10, param_11); } uint out_ix = gl_GlobalInvocationID.x * 8u; - uint out_base = (_1039.Load(44) >> uint(2)) + (out_ix * 2u); - AnnotatedRef _1055 = { _1039.Load(32) + (out_ix * 40u) }; - AnnotatedRef out_ref = _1055; + uint out_base = (_1038.Load(44) >> uint(2)) + (out_ix * 2u); + AnnotatedRef _1054 = { _1038.Load(32) + (out_ix * 40u) }; + AnnotatedRef out_ref = _1054; float4 mat; float2 translate; AnnoColor anno_fill; @@ -617,7 +617,7 @@ void comp_main() tag_word = Element_tag(param_16).tag; if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) { - uint bbox_offset = (_1039.Load(40) >> uint(2)) + (6u * (m.path_ix - 1u)); + uint bbox_offset = (_1038.Load(40) >> uint(2)) + (6u * (m.path_ix - 1u)); float bbox_l = float(_201.Load(bbox_offset * 4 + 8)) - 32768.0f; float bbox_t = float(_201.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f; float bbox_r = float(_201.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f; @@ -628,7 +628,7 @@ void comp_main() if ((linewidth >= 0.0f) || (tag_word == 5u)) { uint trans_ix = _201.Load((bbox_offset + 5u) * 4 + 8); - uint t = (_1039.Load(36) >> uint(2)) + (6u * trans_ix); + uint t = (_1038.Load(36) >> uint(2)) + (6u * trans_ix); mat = asfloat(uint4(_201.Load(t * 4 + 8), _201.Load((t + 1u) * 4 + 8), _201.Load((t + 2u) * 4 + 8), _201.Load((t + 3u) * 4 + 8))); if (tag_word == 5u) { @@ -649,9 +649,9 @@ void comp_main() anno_fill.bbox = bbox; anno_fill.linewidth = linewidth; anno_fill.rgba_color = fill.rgba_color; - Alloc _1258; - _1258.offset = _1039.Load(32); - param_18.offset = _1258.offset; + Alloc _1257; + _1257.offset = _1038.Load(32); + param_18.offset = _1257.offset; AnnotatedRef param_19 = out_ref; uint param_20 = fill_mode; AnnoColor param_21 = anno_fill; @@ -674,9 +674,9 @@ void comp_main() anno_lin.line_x = line_x; anno_lin.line_y = line_y; anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y)); - Alloc _1354; - _1354.offset = _1039.Load(32); - param_23.offset = _1354.offset; + Alloc _1353; + _1353.offset = _1038.Load(32); + param_23.offset = _1353.offset; AnnotatedRef param_24 = out_ref; uint param_25 = fill_mode; AnnoLinGradient param_26 = anno_lin; @@ -691,9 +691,9 @@ void comp_main() anno_img.linewidth = linewidth; anno_img.index = fill_img.index; anno_img.offset = fill_img.offset; - Alloc _1382; - _1382.offset = _1039.Load(32); - param_28.offset = _1382.offset; + Alloc _1381; + _1381.offset = _1038.Load(32); + param_28.offset = _1381.offset; AnnotatedRef param_29 = out_ref; uint param_30 = fill_mode; AnnoImage param_31 = anno_img; @@ -711,7 +711,7 @@ void comp_main() anno_begin_clip.bbox = begin_clip.bbox; anno_begin_clip.linewidth = 0.0f; Alloc _1410; - _1410.offset = _1039.Load(32); + _1410.offset = _1038.Load(32); param_33.offset = _1410.offset; AnnotatedRef param_34 = out_ref; uint param_35 = 0u; @@ -726,7 +726,7 @@ void comp_main() Clip end_clip = Element_EndClip_read(param_37); anno_end_clip.bbox = end_clip.bbox; Alloc _1435; - _1435.offset = _1039.Load(32); + _1435.offset = _1038.Load(32); param_38.offset = _1435.offset; AnnotatedRef param_39 = out_ref; AnnoEndClip param_40 = anno_end_clip; @@ -738,7 +738,7 @@ void comp_main() } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl index e20fcb2..06a4e23 100644 --- a/piet-gpu/shader/gen/draw_leaf.msl +++ b/piet-gpu/shader/gen/draw_leaf.msl @@ -230,7 +230,7 @@ struct ConfigBuf Config conf; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_225) @@ -606,9 +606,9 @@ void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& r AnnoEndClip_write(param_3, param_4, param_5, v_201); } -kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1039 [[buffer(1)]], const device SceneBuf& v_225 [[buffer(2)]], const device ParentBuf& _1005 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1038 [[buffer(1)]], const device SceneBuf& v_225 [[buffer(2)]], const device ParentBuf& _1004 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup DrawMonoid sh_scratch[512]; + threadgroup DrawMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; ElementRef ref = ElementRef{ ix * 36u }; ElementRef param = ref; @@ -630,7 +630,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 local[i] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -647,9 +647,9 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 DrawMonoid row = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - uint _1008 = gl_WorkGroupID.x - 1u; - row.path_ix = _1005.parent[_1008].path_ix; - row.clip_ix = _1005.parent[_1008].clip_ix; + uint _1007 = gl_WorkGroupID.x - 1u; + row.path_ix = _1004.parent[_1007].path_ix; + row.clip_ix = _1004.parent[_1007].clip_ix; } if (gl_LocalInvocationID.x > 0u) { @@ -658,8 +658,8 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 row = combine_tag_monoid(param_10, param_11); } uint out_ix = gl_GlobalInvocationID.x * 8u; - uint out_base = (_1039.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u); - AnnotatedRef out_ref = AnnotatedRef{ _1039.conf.anno_alloc.offset + (out_ix * 40u) }; + uint out_base = (_1038.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u); + AnnotatedRef out_ref = AnnotatedRef{ _1038.conf.anno_alloc.offset + (out_ix * 40u) }; float4 mat; float2 translate; AnnoColor anno_fill; @@ -686,7 +686,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 tag_word = Element_tag(param_16, v_225).tag; if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) { - uint bbox_offset = (_1039.conf.bbox_alloc.offset >> uint(2)) + (6u * (m.path_ix - 1u)); + uint bbox_offset = (_1038.conf.bbox_alloc.offset >> uint(2)) + (6u * (m.path_ix - 1u)); float bbox_l = float(v_201.memory[bbox_offset]) - 32768.0; float bbox_t = float(v_201.memory[bbox_offset + 1u]) - 32768.0; float bbox_r = float(v_201.memory[bbox_offset + 2u]) - 32768.0; @@ -697,7 +697,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 if ((linewidth >= 0.0) || (tag_word == 5u)) { uint trans_ix = v_201.memory[bbox_offset + 5u]; - uint t = (_1039.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); + uint t = (_1038.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); mat = as_type(uint4(v_201.memory[t], v_201.memory[t + 1u], v_201.memory[t + 2u], v_201.memory[t + 3u])); if (tag_word == 5u) { @@ -718,7 +718,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 anno_fill.bbox = bbox; anno_fill.linewidth = linewidth; anno_fill.rgba_color = fill.rgba_color; - param_18.offset = _1039.conf.anno_alloc.offset; + param_18.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_19 = out_ref; uint param_20 = fill_mode; AnnoColor param_21 = anno_fill; @@ -741,7 +741,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 anno_lin.line_x = line_x; anno_lin.line_y = line_y; anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y)); - param_23.offset = _1039.conf.anno_alloc.offset; + param_23.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_24 = out_ref; uint param_25 = fill_mode; AnnoLinGradient param_26 = anno_lin; @@ -756,7 +756,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 anno_img.linewidth = linewidth; anno_img.index = fill_img.index; anno_img.offset = fill_img.offset; - param_28.offset = _1039.conf.anno_alloc.offset; + param_28.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_29 = out_ref; uint param_30 = fill_mode; AnnoImage param_31 = anno_img; @@ -773,7 +773,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 Clip begin_clip = Element_BeginClip_read(param_32, v_225); anno_begin_clip.bbox = begin_clip.bbox; anno_begin_clip.linewidth = 0.0; - param_33.offset = _1039.conf.anno_alloc.offset; + param_33.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_34 = out_ref; uint param_35 = 0u; AnnoBeginClip param_36 = anno_begin_clip; @@ -786,7 +786,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 ElementRef param_37 = this_ref; Clip end_clip = Element_EndClip_read(param_37, v_225); anno_end_clip.bbox = end_clip.bbox; - param_38.offset = _1039.conf.anno_alloc.offset; + param_38.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_39 = out_ref; AnnoEndClip param_40 = anno_end_clip; Annotated_EndClip_write(param_38, param_39, param_40, v_201); diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv index 77ed9cf929bbf122674f3cbc92726f53b8616491..7e92cbb34da93d54a2f20df565bbe42b9642d93d 100644 GIT binary patch delta 10930 zcmZ{q3w&N>eaBCl_bt8AQUWa{P191!{niRdEo})DONBy;KufvAG)>x+T-qe8MIEO_ zV6r)6dljc!H<*Hg;ACaGjo~&05fl(nCl{euxu}5XxU^iv@AtgtpPppw{q)K6`~Cj^ z=l_3h=Xp-v^oH9<+;IDd>n_~;7qv;US28jwB}p=olGNPM^Z5Rq4$JOMaC#6FQ6~#Yu&iCx2Lyr zb%G~a<&OHgR7(w!+qyc}w{&jGynl_>+}hXPGjQ^y9oY0%^FvS2jar+$m}~a(U2w6N zHH@mr%kbVy2TYOW7+vMlT2{68w;R|XV-TCVx_aB-CLL30omzM3SkzN`dpbJTsE$nJ zetKHkyV|?yO@FRewoaYgpmhY+jh(9p*0yx^vtIfzR~j0sQTD*PzI9-2fBTwY9?Kf4?MLG=(AV12pWCzERkk!tCdcO+8nd2{ z!-r1jm5s@~duE!k?YaGWron!_)6mGzhH2ut)9TZ4_*O^0dm+^Pv@s*rF2wj;a}Fng zo#?*yj?C5w@67!U2FEr!6kJ@O!-w#4vY?by9-THZT{uK9B~8Qlq9MGToDE;Ks&`XM zZ%0Re`v83?B^QR)RbA({;MqnhCF_E-=k?zZd{wd_6E@{Sb-?B1a(Gu~PkXk=+2G2_ zm1rHEU0p5Ry{p|(?)&R1H#XKS{VWzT!E2MP;OVvHYNk^1 z1bp{2m6N9`HPh=ZdTxkXN?shsUmnI^8NzFmSK;)ZK9rKzhS*P6zk5wRW{=`R1?+bpJZA-~w`0lZkk~4j1i|(6xrBO9Cl?RSoncj=nC~l>B zOj!zcA9zVkgL`2)U6uyIx$S^0wJT@%J*iFQm`yMDm{vO+el+z2xN8)AGTarGo3$S9dWQYOaMxLW z5Oph@&*GeSCRbO5!3NjEy`v`6;C8S2$c719jFS+p4;w2YcyO&(>``DRYa7FF~SLEx4aWA;Auc5v7 zTh7zOfL?InU_vjqBEN1J_lgU<0lnsePl0>Y1%IA+UUR`6px0cLr(C}XPYnvCY)^XQ zMFS>3yYYhCL2ta^w)e&hK7kH+;{`XNw_fm%!FL8v*-iGCGjq^E?j;xv2heP%?F|mlA5i--kGAhq)OK<-h$G`eW=tPO zd597je+VvSd<1Sr^(*pT{0MA2ZAQC``ccX^DB6s59&LV{>kH_W2aP;zKLw+uyd2hM zr;b7xe+IsmqRkUXK9>nF=wslEsXeK*>7%duE`}IGwcbug-o@8aob1Q3eUq{ysT?+M z%yCZul_LFqq)b@1cmU)fYbvQSl zmuFB{&?P;#gXOU*JHTF*JFwdySN)fA%5C|IKMT$@Y_9n8a6hiNw&y5nuDF=kX|~B; zf(#G-GMaiU?r*_rE_xH`y<>kzd6Duknsz(po%?%=wq^P7|2Nm{j05tKcol7XZA-PW zBUaEc8oib`V#)>^y`DANJ&$tzT;Ld)bm6Z?Uq z6TU+|oE8#HLC_%{i2cFEgj3;azHCLp2C(h4HM1_)6K@(=-46IWwp{E>hcoCbPY1gp zQo-vqJb-#8#g7gL<_^};p;=&qe4SbHOg;##7U#Wu2E|wg7f0Jez_yJ&au|4foQ%Hi zD7fCf4jlph4C`&9XmBK0-I4e@B#%vX6xdB=9BumeN~InhI2!D71V0AcSYrNTJ!c~* z7T0qQ+=klhoR9kZzy_~wpz7W@7Ow75;$}Mzd_T4O#~5}okCeizoz*n&p z+7^Q45%6TN9oE)FEq6(5cM7@7dTbP9p4e??9Pe)MPh11P>!!zJE))v8+f{Ve=gsatZ z;6%H#;I`G~S-gzek7u#AX37TMo_spQO=*5QodX$9aZ`ObuOsofaP4t8od;HPbH%1y z4j;ZLSD>lK(SANy&3zJkEM1A<2sFlqx&UkvZBFyq)N0Y8i@<8JkuCDbKt4THvkk3;JRD}b}52?6zuE-{}|W~ z2md(O7|m>Ezmxe>urX|>uXginqSj_RZ(RnT4jJWUjLwM9Ym9OWJVvQK8eaj{_VK*a zp8%_w=yGa#bnq&$+PCt@`_I5?SG)eV&=-62NdygUt{oHuT?to@K-Ykaf&Lt>c6IJ! zoWD>j973Ohs};|ezl7WNQ;yby;?vZAJSa3?OHuQn5XY$h3S8V5e+^eNc+6M@9y6vr z+I|MC&4FD5^Ew*-4P1L1K%WPz72l4&05@nHEPo63Tv3n5_wT^A z)n+F?>-sB3qQ3_t4vyp>z%deS;dcXATkspfwu?mn2v#ee6JLZ!EbY~{tpArF8Y9?E z;9{_w;cCU_{>$(Prajtz1*|RjE#P9TTj6Shk1_d^uOdV+9is7VU~N8Dw^M(O;>hi= ziL|@>edPWLtleqdN-d8#Uk4XAM+XnID=F|Ak6g4-WI0F3(xHy_`z}1S6 z>o?&s812#aU%}df{~MTCX%cJp?+A*;oqQKOf@zP&{{hz4%=3IV^|vS{P(0oXV7V{0F)-_0NjhTiG4pXVfhXX9fs4=if5X*$&YiLEQ2TMlw0)bR=8TEGmi?;d z94KR);zz+NvYzO?RS)0p|2g+==8;R{F=M^jNw|IG1mn;30 zAB>-`R{%Ep8MW)D!9c#nJ_%l6ACeHZQmbo!oLbGcK9Wy^H&I*)zv$c! zev0A=tH1o|lpitLXTXj&*6tTz8^qTva(%RqCddx3NsGUT{1UDnKF@-U;Zbc;xjx<% z&r_!_@MAp1Ad{)>%oyrlQMARY_pfsex541QL3@r8Ba-W941W=Q2|S5n;uopq+MMm* zg0JKcdztZNh=RuT7=H)0*EW2GT0WOAceziJ--EYOo7kYX)rUP)Nzd3feI2~OLfhA< z##os<}$Tp#`2k8go5r^JfA?S{5NEEz)P6wC1r z+@!@N`vY7(eBK2cBbH3APjs?|-ygQZ&*Gu*pc?^Kvv9$8G>=bffu^w=#iOT$uI*ap z!=`1h&DA6PNN_Ry9&j~_^Uq_I?H-S}QD~8;a0;c#Xbc)7(HO9S)VJix$AayEHj~?- zao{`Yc=+rI*H7IAqiLcRot*$Kc6KlLgfe~;Dy_|R-4mhqfV!`|rHqI-2M!{=cV+N+gkn(NZZAX!A-4{S^I7}ON78MVd0_XigTUJut#-GTcz1-0;R z0ILO`24-^7?EH2n8xeHUU;zIJq2?}g=?(y^6{lq;Ts?dad=H;lXzFpM9|X3YdQ*~C zgC7hrcwtW@hrrb%_@VFNa~PU>1V0>XJ9THzXYL5FNws}4*Nz1H7r-OPpN)1pwPGYY z8a#Zo$DpY@g$AAtwp}Ej1J*~~8T8LIY7z5T@bE#V$02HnpvQxYLEjJ8N8O+(c2|LTjge5!eLU{EM2}sbKX;_Cc_Nx-B1+ zT%WkbP6KC`82N|XVoNY+j9ctOU^PDm_ZmGNzL*k+yj(vA;Vrfl9JiQ%dXsA#iR~;f zw^+6`XM#^v(D+e|%fR;9b!!I8;}$y`{3U!$Y)~8P0vojJU_-`exLWK#ZECik4z~RP)bH@?zzLM& Mlk&E<3+B)JBTmls{r~^~ delta 10916 zcmZ{q3!GPFna9sC|34z2fTNCFh8aY>;ssNakjtQkWGJY}MR6Kn1}5f$!?;Mbd%$AN z-PLmct#0k=>Xw$KZZ2!>uB*FSnrRuC*~LprR7!wa(rtUNJce9g$p?g{l{^(*7I z5|`HgoJ6lQG*qMPhjo2x-`WjrYX*5NYN)o~ACJD?*6t0tJ?mX%OG7<5KHJck^?W2g zbV9FeOy)fx(~ND;?bnV5`?a^Bk$)Q|il#aNLd|CtGh*#RjLJ3VupaD0 z_qMfXwkCK-?sqgew$ZWR;(|>bz{|<(Qc`(x;+S;K0KJsV9mLNbz{|;n@Rciju59UP zZ{N_?M;}VbiqJZ%>%2O6wvkH7y5Q`2{d~6Um6NS# z?H!$+EnPjU-BIrQ>npc5)-AXJ3z^`x$<5%dR=ZVB?o)SRI$QgyElSBVd6a6VQt~W( z-!zqz=PNZ$bt_&Rpq7%|gZL|h_^ShWZSoqN{?mt2^7;T8@P<{boo(6JO38_EXQd^3$4bd5!8h5ql$-svT?_xFR#e;m7!CSk#ds+%Us|Wa$l8!-q-5}mo-gecbmutp&oQ)vQ zILbNX-&NT%dEb#W8I`$9=JD^Z@Ei&rmU))?kE_EnzVG<5jn6!`f=3$vW$iu64$mz6 z_$0V?GkY@mOwJCM%C%GK%gQ4w(~hW{8AlGryaQNhU}H6pfXo}sIBKIqt8B?5*nKV0 zWg~b5sZF3n1c#{B6k6qpJi_qYa&%aT;PmM|wV>G_HMUnZYGu-t>L?D(EvJWt2%~cC zjDlwGjW8#)%H>n)n&W7S5d_!M$4OVK&2T9@wu+NJYTLt8O2cY~R31BidHN7up0CF- zWlqgE;lsG8X24Uhd&7%sBHT+%A4>z_-3@9J!LG8r3BTV^dpD>VNAAk2X@51xw4IwZ zcG(R&F*!I}(MS@t_W!q3o;sm1JHlN_!yia(D(ArTa*u8`hj}9PXt-+@{4lsH9@8)Z z?)rxPBDm`=KazSSoM&FnmnBzMh5iON!kyQ88r%bRD7wfW%z4bmOK`D6j0A`Ct|rOm z+1s8y@r;_(j}Y|vET zcE~F)>~gQYBA;5>HEn!ak7Z85Vk9rR@YL=_7hJm+UGPWPiC%I=?j={`>jrTzx!j&6 zYY5;aS8U)V7u6+enfJb68x6l^qM=FzM)OCGJ zM45??FZQG02PhBWC4UUOC)e*m^W^w8n(ef`!GZZWwU2qUeTSmH$zLw%-KZEUSl$}ZC*cro5 zeHN$`>7N4|GRF2(uw&EDbn@pa9W%yHc^>3!ZO&)r1+ZEjdrMDkFI4WB(U_WkdOj~d z123gZdU&A9V^w|*_G;XT-Tt`hzksXXm9O|O!D>6Px#BN@eOz&EFH+Q8aWSzgjWf&X z%izJWUqMrkwf!wv%{6Z(yI1P(D7z`!(X`tyujk)Wv@Oa<|3A5AUmTAw#A|4dk+#dT z0W0Vjja~;Eh#Bi|^ak8UF^zKlT;CX&bxhiuZsqxVP~N1(G`(dD%1$CXO@9FUI8EB# zrl>hh;)u34doliqvX>GQ^A1?eiHVI;!+B>vT}Xl^a--Z2tp&SjY#06|G{pDd>(`6*On*W%*g*(7d_50V2UdG5bE^6r3^!;bJOr%fTNf|Jp>W%3vlD&?s6{8n zf}<0DGHp)_3C1Dl5bwiya4}&6T+MH*NH`H}J8cV@(;JA_2v)ZPes{>lP1Meyvpfmx zhDZgk({M8N5fmRCrsNLR(V-*32KfrJ!S(j*=XmgESZ^CegA>5&j$|seJT}#dU^kUXi^#Qev4 zo{XSaT+jEwZK%!8`Kq4+Hh6UdRrkiJaCMIoH`{68N2%RE#;`+b`b7uc3m(q?>laQ3 zJ4Ml<>0q^Dhu#Obp|;oqGr;QAv#i?5GvId8qa_l~gsVpT{nOZ## z(R0A+KDN1rj;Te5=7ZJl%s1!<;A$fZvvBSJAJ+ea2#R}fx;<~W0B&P#5&S%GG5Gm# zwOV$6v|9+btv1i%3#fg3JG3pL^z!ZG+aYdF^V4ZDWE90sbx~eN;)~(h<8WF6R&#U3 zrd$diyeXHVsmIa29IWO(i9MEHg5U@=#)i5SY!Yow^M%xE(V@%0YO#@8z_E|Ch2Mw3 z+SW7rBT3o{R{J=!EMMjNE3TxNXazN5e}FcyiK1ieV70ilebv^$Bf0iSz80)4l6QdB zBKe2G9lrmOd^Lh%1n2}8lXt<@d>v!t-S9}RJ=*qwwMFvvV6{koxi-qmlz$xgI%>rT zumN05-UnAJvq>ZQ74S%|J=*>WSX(6D2v&>aSAx|Xcxv)qfT96hmydv5ir^mwJ3GNY z2DZb&{}gPDg=}VjCG%%sW7tk#?dI7;t<84cx|?19;wZOZM9kjT80A&)7^U`T{Bf|h zk0sSke*&y#B4gP_69wUc6?t#I`S^cUb_pudEx zU7hJ94A3?FWlm7!A!L&!?2f*4E@;*OE{ht&Qs5>J^ zF_{0#b!X&0u-p&Z7?|~*BpuTKn)$k!z!UJl!6_vUjIV>$e9xV+hp2s=F>T+Vs5xU| zuVsJJ^Dx-u@H~mw--N4YJ}FL*f!TCurWM{O)A&N zm+>X)U6fH2V;n|pXGT!}in7bQFFve)os0PO2mcM)iOW9G}?dWwm6Q_Hov zBEJQ1&26tx%eAe?_B*hho^=upUj-;k=dTCc7Rm3yH&L6|pf=XWJ@a&B+TyzO4cKgo zX41OEU{DY0mKT{IiSq9J5X zu^@kho3yxS@4(f=hrVVpV$tOKL??&9Z-$@GYv4gv3s{#G5j#Nn#C31J<9fv$JhR7k*9FVq{(m$8Y9sNuz}RK(awOvWPquY)kbR)HtviwZ*{4gNp;70M}35f%_)| zweX(^Rtw$;W^&T(e0Cn8PJIg<_nkW)Y*KBX%(WB1hf|Ixe>PhG zf}$A7P67`e?KCuXr_jJBgKZbd-vic1-5K<+FlrI=RPf+Irl%olh@kHU7lWP-*GJu; z{wYT-{HKFW6iMC(_CP7K<@A^96aF*6#>oBCplyv9zkb= z?L^GP9I!s>5vLh!oZ=GCEpugPh(Kq8?Og;q3p|&iz9nC~v%x0N<{!{piuZ%nBiTH# zgSsmplw6;<$L52xdyM=8F0v0`&=~jFxnMOPgL{#F5Pl9N4tcqL4#IovJaF7&{$Wk7 zZ78;d;JC-m2d4@eKZ5ZBu)TKO7J=n)k6j4<0zM`-sEzf>?lIcl6E+Erq<>2TePA8zuI^8nBvAbZ{+vH6><6uAiN8AAA@b`=A3X*XCe5!R~|X*Cp#9 z3L4unc7yG;`=EjkHj0mg`eFu-PfhZ}IOb>z#{V-#mBmZXt*Nivy7YcptXQ^jtY3$pdsolh%k-SSeBIdS zk9ADq=V?=4d0_cb>4buMyDB`UR(>1BQ5}KqsH|gDU&Ph~-^|`{dcOo$iw&qv&GwVP Xwx3MBm!Ahtr<|6Qx2?Kl=8XRZo*woC diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil index c001e896f2b6b3aad03bd517ce94c82d200bb760..9b1b0fd6893fc7bdecd4093fd10ad0e5e93d2257 100644 GIT binary patch delta 378 zcmaDN_e4&_CBn&hCGYK`+5hwdqBdIJZU`z;oG8-C$hdKyD>i%ptR_?wL{5=&u32gn5nRQ>6<)*{W~Mm<_R1d7#Te_J97Cm z3odeC_A*>h!FcIm&QGNQGIHp*iKW1M2~Wn??bQ z7nmpMDzmXMG$k}N9FpAO_|dUIw7}qwz#T@(Ge#QT8xBq2s#92Fu;h?Dk4l3hlZ2{9 zKoiqu7NC9yjR>HA0j_BfmoRSPDw@E4IEIZyh3CKq#*Zw_31>79BaCmk4eA`FCjvK4|d<8a64VXNG{W~Me<_R1d7#TY@J97Cm z3z{qtVeT?tP{ZtHzMz4VYo)^Eg}fS4jh}iYBp5iF8VlLDBzj;>FNu%^lOOYjGB!@O z<4`E@M2CFmdzKqdhw}oi@g7G=b@q>=TSvD?J;#^)+eb>TIqe>C0J{oXb|S z#mHX%m+hMs**hxPeUs%|U#Ne>K7&J;3tmn-*8RLTo;ckF0&)et|KE0!w{kS8z=d$uz*087PUu9(SH zDrN3KKMm{o$)WFM@o}qF$?b(gc}kLqReQCc+|rbFOSv?ipJ`>q<|}1#g6{GRG;CSS@_^Mza((Pf%;$shEZ_Nbp4AkG(dDX%Eg6&==@Hu--y7RN~b;2c)|655iQh|&OYI_aa*PTqWZ#F;&RGd&2IG2l! zx7Wg^79X{LkfhdtK<-s?+h61TnvL?0yA8Yi#K4gsj)^;t{@^w`&Ss+wbdTp^=aFRN zaF@1m+yHLWakds^NJO41oJW$riQd}A(GT2JjAe-d&HoBr@^xV%oTnn4o=5%?e2h2D`d2FbIcjUOc` z2}c{)YxU*cb>0~NnRz$R7fIGHu$Sw72xO!J?f&v917cbUDP4Lweg1Z_BP&>O&jp8jlC9%amkMbINCvX**=rbua;cl@T)$q)i!q)_o0R$O>9!r~zcR*j7~do$I|UP)Ex#PU=diR&Z7g4vOv-s&ay(bf zl_Gooe|FL=$lhMf?HjI?D}~)g3B}n=A}HbUgpp!lJX?4y%(f_d$7JoU$vuroY`)55 zZ5Le9#LbxoCEYhPIaK%7`Qp%l4*Zuz5w+-Iem5!e}EaHeWfLFXl`6a&02w zZ7JE+gEuDZE9|a~WeeqOtx(K|5iMrLPA4~dEA~{ftDCr_uuCDii#2`4T!^x0F`990 zY$88;NyLEPINz#>S~XXmi2U4n#b*32)hPIVVdrZVqb7(7??m0UkRvyp)7Pv|UuXS3 z{k;(Ctt}fjpI2Pt=HY6>^Kg74nh28JtlHt0C%px`Hx*px_Pa^GgfOJPUr72wO1Mua z7HwxL7-q|Hfe5yHt-$B(0qM@q#@7f}NB(n?WTXNa>(ut@wzDvZ{p~l z40k%tX3{~YdkyXe=h>>q0%o;n~WkN5@!GK_;ltjT^= zdP=|DUio=g#DE^9=TX7+N?(-3?pNHGlH`E9EwXuDaM(FXo|Do}g$K??PLh$>y{%r} zl{i-vo&rBb@%TuH%^RY?7DYcI$s2k~crXG@^52DAy=FuBR(xIiVsab=ToyLp-Z#Kt zd1kcnt0X1iXaifrZC~RJ9G1#Eg1$oX_$BuI+|oK5ADkBbif};qSE1N12rmkMD>wS@ z;txs!wy{k^8IgR=P6xr%Av9r7QQY8l;W=SJQP^(?i~xRr#JwqTjAhTHm^8*m0ygUb`#0y!|rNiJn>a{eJTnG070+Y-EZL`4$AOQhLv*HOz0|pEV zqCf*mlQ0c30nd|T4O9WRlei5<0cMl_4Ojy329sD0JtlfEY(PN401O)tk1zn{r~(Xz n8-hLngINY+2u2JH00hARfrUQcle-R50q~O!4>bW|lSB_B8pl9e delta 185 zcmV;q07n1l9_StvL|8&Y+9QzZvln3>ZTQ_AhWvThkrasm0TjK(ko`^8*mE2Q?7I>IEX9$8T<1ld#y6TnG071e44GZL`4$AOQi0v*HOz0|q3@ zpg;o(lQ0c30nU?S4O9WPlei5<0c4Z@4Ojvo2a{M1JtloHY(PN401O)tk1zn{r~(Xz n8v;K7gINY+2u2JH00hARfrUQ!le-R50pODk4>bW^lSB_B{C_`Z diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl index ec75d5c..56b513f 100644 --- a/piet-gpu/shader/gen/draw_root.hlsl +++ b/piet-gpu/shader/gen/draw_root.hlsl @@ -4,7 +4,7 @@ struct DrawMonoid uint clip_ix; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const DrawMonoid _18 = { 0u, 0u }; @@ -18,7 +18,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared DrawMonoid sh_scratch[512]; +groupshared DrawMonoid sh_scratch[256]; DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b) { @@ -55,7 +55,7 @@ void comp_main() } DrawMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -79,13 +79,13 @@ void comp_main() DrawMonoid param_4 = row; DrawMonoid param_5 = local[i_2]; DrawMonoid m = combine_tag_monoid(param_4, param_5); - uint _178 = ix + i_2; - _57.Store(_178 * 8 + 0, m.path_ix); - _57.Store(_178 * 8 + 4, m.clip_ix); + uint _177 = ix + i_2; + _57.Store(_177 * 8 + 0, m.path_ix); + _57.Store(_177 * 8 + 4, m.clip_ix); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_LocalInvocationID = stage_input.gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl index 2ed7ba2..0d22e4b 100644 --- a/piet-gpu/shader/gen/draw_root.msl +++ b/piet-gpu/shader/gen/draw_root.msl @@ -61,7 +61,7 @@ struct DataBuf DrawMonoid_1 data[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) @@ -80,7 +80,7 @@ DrawMonoid tag_monoid_identity() kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - threadgroup DrawMonoid sh_scratch[512]; + threadgroup DrawMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; spvUnsafeArray local; local[0].path_ix = _57.data[ix].path_ix; @@ -96,7 +96,7 @@ kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID } DrawMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -120,9 +120,9 @@ kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID DrawMonoid param_4 = row; DrawMonoid param_5 = local[i_2]; DrawMonoid m = combine_tag_monoid(param_4, param_5); - uint _178 = ix + i_2; - _57.data[_178].path_ix = m.path_ix; - _57.data[_178].clip_ix = m.clip_ix; + uint _177 = ix + i_2; + _57.data[_177].path_ix = m.path_ix; + _57.data[_177].clip_ix = m.clip_ix; } } diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv index acecee3679ae274a8ebf82ee416e177df63eb992..1c11414ca510052be760fd68802677d698754e93 100644 GIT binary patch delta 1698 zcmZ{i$xD_|6vprCmqUqYv?z!$$W26`&_<*{FbG<;jW!071>w+O^Jg|^lYPyTrj$8k zn%ShApk<4;?fMtmG_Csm^u6)*K`%U<=RD_}`<`>|{ZunoQ;{iJQyM}NmV~k}*H^J4 zNv8JfENbZvA++spZRx}ohehE$w!OKtd2i}p{KwCP9DZk87rB|x58pkXmkGnzSTh38 zKVvLcml;lGr@v%ACrOeUE#AJWi* zdO6ni@_pOO_iZowE8w;med~t{d|SEHlW@UvI||EnRh}k=;7Y>S#z+`@gHWOssGdSJ2dB z6IbD$8wb1w&pM%r{#3yvf4UFry0sqQ-Sq)i*`40y^;E0IT5n9v`gW}KzN5mX>(g!C z@KzU-V6m?o;qTqvgcnY93r&5T27EWSftnA{cXJo(=esd>2dLSIJieQI@P)J4X`Jpp zqJf@tx(9HzCJuE*MvV<%ZEBzj@||u6u|pu_I;vJ? zpFm8XOy_(GSE~iirvDgNY;U`@<_?xh&8U<`|} z@K&)Cpb~hHF>{8o>al?r@U?mVB;1DLtC@nEqaJ_y67D&Tyt4)${4`EISPm@MmR`y$ zxXFG_F@t>#%s1DD^v#PkZ{XG>AlAHvTT>0J(KpY_8QKDItzJl+>K#0Piud?x@c?6L N`L{bb{h_Ki`2!-N&W8X1 delta 1714 zcmZ{l%Wuq46vyw(Xs0DrVnJe2!bU_wqKl>>Y0{mQl|*bRX;Km?^?pv(TfJ($pF>fq zsx?$mJ4*|ecK!t$ua(b7f0v(0{E~0Z_k7Pe_nvd_&CJBY_Y3QC*#-F!GGS_{32*!A z)@8Dz8`o#s4~7srw{^64VXMNFa00uzt*dQQ;`jdJ=fVa2uFjq0=0XX+0mvU>W6dBu zy~a>+S#BURQT~+skjZ3=gH>yn>>#BUEN07U>@JW^`McpYnGog?A1ltUo|Ty>Hdgl+ zY?m=YGx{{F?WTR(P5ZVR{WYT@}c zW1oX2FdLY^nudLtFMzpz4l#y(3CuUwhV;#gHRJGdSm58t-=bG=iyDAM`s2lq)8{t6 YhLu2U_YGVvW*SpV-)?_-bVhIH7sOrAWB>pF diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil index 0ca0d185edaf3331ffbd83e41a1ba49692a9b052..3c8131532b027bf002a5b5c7a02d729315e7be6a 100644 GIT binary patch delta 5289 zcmai2dsGwWwx7%-LlP#C0D<9|0P+S+cm=H&C%gr;!A7384xuU{Vg#!A%nLC^5K!P? zt2I80iq{}$vB$OvpvF@`>cLj5)?ldzTUU$Lw%B^>`38bL>#lXz{U`Z;d(Zs#_t@Wl z%!~A?boZp(ghbsn>4)e4IHGizIxxO*^5u3K1VL3z?$tC}=Lhf5>A%9BBU7Tt4gdh@PKvO6LQ5XW8 zTtI69@`OawH@+AI{j~oQYv@1O_xr%_cSYFmrooNsu8UYcU4=6s$eF&KYfpzDKkzzz z0CG_fnq~#z+ma=9S!#AECKkJ-7s?Z^x4b^;vWA%NlFxGTOK5&ca9rfi(dIIOhBo{9 z2WI*vXkk8S5c(!)U<75mh`?C7#bpC=LY}rto;1dn74maKWYMq;j;`aWh0NX}5uy^M zKo4H38q7G2sTIDc=XG4jKXac6CgP8>AT6nENpSZ%!ev*96pfIOiT*95O$xJjHrmzHnm%*!JNQ8Y6NN&yxnN(&0)V9u3<;<-i9)diJYaVDS8I%=Qy!dtN&9t z4HRn1Sbq2mvjR1U-s3_idU{*91Y@(oA7uh!!zdiGP#RJfNGBMj2G`pfEVLN$7KDl z0;A}6Q*btyU83=}81zI$Fex*K8yGMaYGHH}YAK`L5QY<4?;(@nbk1nn8w`L0Gu#Un%St_vIjno3va8rNFgRDA!2ai?;J$xa3RFhYQA(1)7aB8sKX9<1pIQ^0W`ZZ9w$sX z9b?gCoW--8MLg$ExfY6ZzzIkAYFQ&&ETaYQI>iM*ZlFN%uG192T{;pliM}sSZ4Or< z<@6YbIkYhRcfSZNAKP)nwU=Gii-kqZc82@WuvyzeJ3uVdl<(aZ% zMi<>eFoa&EBqoQhn$fES!!QGerh(AAl(a1b^(Z^ySoXw~nzVE!$zaaNRhkr!@-fE< zvD>twm}IbQFVuK!@^z}%I4#duk;bCgtEfc71$$nFIIDv`Lm#y=hb(kHE*9z$0Q4ZsVz4o~&>GkA?G#k?+yR8Gs zjXpC>%Xit?m>voqZ*-Gxn_+U0RQ;}^+;fRbbE}Vy^p3vYZYBNc3~9j!+h*M>Xqmvk zyL}C~*9y_&aA~y7XKSt3iuKZ+)4}NZM%Rf{bS~b({L^BOdWs1Uk&uX#KNDkyp`;XN z*Z_0q+b7IH+_3C^{WwE67`d=X<>}&8ZSw_9tACK6C#`mxEV$hDcp62f=U)e8WfM9t znl29wW`{ba(Ni!k2w{Y?Gd@}`OrfZ?_W`n6tzc5CTK>eci{eZ0cdJ_w&Yj~1)bH3w z9pyisU0Q5V;Zq+?8?`iE9BezJQPu&KB~=y(PDz$}V0Ndsln4I8Gh7Ed-Tc(wb}Al+ z+gUs5>mtEX-)f${A-@K#KQGX{PjO1u&T_uXJ~)-i4WbTy*!VAO+ko*yY@5Zn23wbc zTdrZ^#s+tVP!M&Mh!5mWiZv>AgD<=t<5`P_+%4CwmOa%!8Cz0bE?gNPKu?R*-0;WU>c&LHyw&?lYg*n=YY1IxAQg-s1j8c5)e(Be_MVy-}k zMC%(^JN*RCeyMwcvVVEg}A)JaQTj(F?lW|k6J`%^u9_Am1A zQYn+(yvzQ1Fp|t6KT_I16&Dy8&vS*QH8XkMr!(Y$sZ#1TUUaF^{%I$nzA=-J7OvhO zsd>Ju$8rr$j1R<~iqrHB(uD1GYHJalemD;ytPErKr2Z_Btl zdDTecSq7op4SIWKjlZJkt(u=;VR-QSo^qw6U`!kW9+)F z*q6(`l}cph8xxlhV}OBEhexY`!_j_|l=jUtf6d(RbL@tw$ln}HuF_bMVwm#F6%#%6 zpV2q^_b<8IzvTKsFQ@>$YqBvPLnK1yfqIdoB1#feQ2~|uu0{M&4scsj%ig>COqnF% z-pG=B#|OTk8Bg?3)+l3}7gQ!jn(r|;S4cK@MXGY_Os>w7S!>_luD?#-;y=FRoAD)| zAM}O_h6ZM>ozY2xogdd5CEL0r^C~t&rP8WdYZvx@)1EmjNx1)P$^HJ!;OO?IcB zOapb%@A%ohH|p)2H%>%duJGIAReW-@?CmCGaB68S=E?29p~6f?n)t;41W@k{3h)4M z(N8#JbK^fAG_UU6oeYX20#`eg&});vM8kpwXALQ)fmEiH(RqyD&up$WIA$B;^JxvH za80a?HpNR>1xgWja%VHhx;v}gM+t^C6>s6lv7?&jSs-ic(TAy{#1)}On_!g+r0wFF zM^CDs04;5jcXE|$;BSa-tcKWWB3lbHw8;(|b&ICDEWTw=gH*2b`|JOlxZ$_EXY}y@ zWUJI5HJ#njDh@2lz77VY+gyQSa+7F?}p2EbC^*Cb!UThMWC={MvA7Zb{$pa zGQ~{PG-RzOMR&RFJbWNhPjl-ar~Kpc4=V(*`}Q4!T*cKly%>FHk`*I9x#78ZGz;2; zr(NHx1?7<%HH@e3J^naYos50h9FuAW-jasMet9Ex*=84$AG-X(F<_S2PK64OVrVt% zD%f)+BG%i*fHSyDU=O66{aFHM6-?NKP+BBcS0&Tg%JbZJ>%VMI3+WlFdC9Vntx}Ff z7|eB2FcKpUER$)27av&$CQ}yD0<&wfYC;xYpx~xc)37#l@g)lG3~8mbd&3r+DR}i1 z1*JSus3pL=j9JdYYWy#%{(CBvTN$VQx%M0^UE_Y!fN8(4TKofZQ;#ncBuTGEh88g-#iI*{lSfFdU330SM9o2yqTd%iXj={xoRI3*FLD|{@olPDSp9+WRK6}xtBS@aaBlSEt|%cRa&|b0yc(MR8a5)4?E{Cp;kes(9zF`y2c7l3 zod;99c|2OZTl|0jL(S9rnQm*<@vDmuf?NDXFy#qChHARukxh3$lhdsbb2;#fN%An| z7YChzUv!vp`b>*!Y?jY&`lqeEc{#FlOJ&B7(V$x4Pdc|gn%dvYHH9a78Z|$<2!3(u zAoU?l?DWZ?kbP{D*O%EMC(NRMa6?WtQ*g-%3Z{!gR3R-CUV33#ofQ(&O2OB!P%s$2 z8ZPz3-u^fHl*W|4Z1 z*Z2TorAN24Y{Kkkmw)FJ*@Q$qX*OCkty@cS$BB&iNfWo6gUVRY*Jp7 zz2GjljgMogE~!sixD(21jmt`wW+-yN>`8y;cuThOu`+Qun^e-cU6_(<*Q-pr$huGc ztYn$rg8IXyUcfNy6}$m#-!LC+-+;yoY~OSR_3T#eE@{k6rBlDMX*n+USIuZ%@bg}2 zIK?eM(?zck4U2{4Lc5IycG>?wyemStFwj2E^X@lDna1D-WgRMhq)Zg%gWi?2tK^P& zG=90^TOK`oA}5sp?ACD3j$1k63xnpv*iRh*UJw6yI2X{njeY67w2c)VlY3 z*a7xj=%ni~xFIOGA=a`z-Nm7C5o6s;e^7OADzVi!-+X*#PCk7j}mBKy9a2*r#9~AcCsx=Y93` zl<;Ep?b@@qL(j}am#Tq`3s+6r^V%D;rQxGr^53Sctvx+!?Z8*1^~=cu?fTKy^{1Af z?X5D0YrL^aD8bv>?`5S^!w%08TG)W*2egv9D{kPhC(PdTKHmCCv{&Nw1-e1FJ%Ik& z=W4qx+&UZw6^Ge><=Fr`*M9zXTX;XEns=|{j1?x+*Pc70{)p@+X&?i%|FJKkM|+OE ze081eBT3L_#rBTjrm!hniQBhwEIG%_FpZcCL<%ES9m!Q=t#6LzG`{sHz-H^`mEYd7 zbDer+izj~eJ!R^2;XGWNOSNq!{o9N7oZA{!b?!H(l<5yf){xF;P7M1r`*@qn8rrti zXDexZlJY(EiK-(Z2y6c1tAy delta 5314 zcmZ`-d0Z3MzMsi5Sx9hz1j3#WQC6J*8bmKn009AQ)PRUq2M|T27^zwpYBs=#pny@S zS{ua$t+r86v88FIKO;XR_4$-sio4lKGzB%=h;@=lsrZ zIb#{GGJI2VlalWa7uNKx-Vk=?!{*m#WY{M#1VQEXz7;U#+~Ezdn-IqIC?^(TF(oZQ z6n6>>Jz}O-H$xhF%M@BBecEx2tdW0x#@{M`&3)*;ogqxgot<^xAA$&o4!uAih{O=! zL;-Ct7*9$ze=p)d(CD7atiJzb-xEYR*b?ow9j2g~mW!M`n#z|BL0+_S)Qtu~V(>Yw z7vdG*8bbjt+B;9in>(AAr9pTZ`HG~gd*2%5t;DDC@>rfBN&UEN?^^svgh$}MF$EpG zknv%17-4&c@WL>{h{IIyY)E@Cj<*&+u1GIZqzv=r`TX2)!=LahF%*h-3Sg#II75ZR zLWMNp>p%Y~sPRw>nH@g}EO{Sa_@fq>0t`Y}kd{z>p!E$nDr9fQ3hE#s6Zw1iA(8i3%|^=GxEoENurWTaY{AV5(5R2CCxfMgHo1YEm--hM6^RV<;1e zfTlqARD_@z8iDk>d?TffDs->Xu+?03yZ}hq_Ce)1FzA>~ZW1M+>&R14+i=x1W)HD1 zAJrbEigK{_u0&d-V5v5^g3p=r1XCN}eQosrEx zZ8BD~R?tpC;lP|uHrH!sNH&9K9~;^6Ov%8nw()f z0daIJ0Y^&v143FjDQJyA05P?)8(#NNFNYVH9#Y(rbSz-Akcx?>Ip z$jfc-nnc4S2TFUcQWtNe3wmROY4ZOjJWX25=gMGTDAhqRC5$ihXwfxrxO7s1OV6Uz~%rq0B}y^$P@2BM!l6P(rW$~JxA@D z;cU+Ahdpsw+==X@s2^C`E#f0i7NlEuhjBm6Y& zHcmH-RL8gq$lDIoNKf!8DqnX%NDgvJw6E%?s^pS37Ad^~O|(m)JHjUN6W$H-dz$74_%yQu2?JVL{e-VJoNK*7d+t{tAD}} zs%HoSVjf2&pJ2{1SGh}R4`5_NeRsAn2pdu!+YVMcUgainaQkf?K^Ad7C%Jt%)14=K z>+`P`@xFA7Exuann({M`pfw))Zr}tI9bmX>w}yw za(mmc?j^KGPNuz^Tugtet_Qv6(dM9ftkhi!2JUH1k*)gCMOv>fADx8-J4hdV!Em}) zRmktKpXGhhz382No+(Zf&ZcjxsS=EkdJ zX!XDvi_}MrPmL)=7X*U{)+*Uif0w!3PF+{uLZsT6`|ZrKIC-osSvQW87_|`^v=lP> z@Ne*O+`p-?=O!zh!Wb7APo1Kh9`}p6t+N9`{b!WeIQ^UvtC6a6-|2_19&Ogd%47A( zV)vRQQ3Ry1huCKx|DC`v9@$c%R)%NV>?lGE2`DimdEOPvwG3 zxHu2q?V8&9kO{wf)LboAB|*G;v1^w;H0&s+t%w0Wu%VIVTsAy#*C4;-Q&(zJ?dd;+*v-K;} ziV976FRG%xXcQq&C54>(g%mj0i#a9;F?XYI;=@BgECW&7Q%^QgaMM0 zRlW06Scl$1rnb1@vWRM|KuKn#+O|XN~m{|*h%(mQtJ)WsoR`+Hm*xa z%`muubCjhZimi;NlDJXo3xVI5rkulljVDe(M^Lg33BmBgU7LZupcB8kU!V~w_n zPbI@|-7j1%GTjboMH_8EI9&;Zc*Il55ykyNB@o(h_3rC%a98(;dfdH2?dhKG zitPI6-7hpy@G~l$*h0I7cSGa6(mcL<^KW$X@|P>7SS2GJT?j9G&tyMrPnI8BmDYdd zEcQbbbt8SF+a+Oe3w?3jZaSqr)t={Vz6TpvSw8D!iTm6x3xnNgV~%`ed3S2KSK~7M z&hjYieqo0&SXVJ77~j>+WG1HKqaTEtkqG2|5_OHS0G||R`rqKu`Dh?9P8vGzJx$fP z^N#JS5XQTk4-VztCnhPI{C5WQhL&zD98e1lWJ;`BP9{m%7bN|Vc!W5pyWGQF%kf7o zLmc&QRc&e_K4j{{qOnO(2c_k*jJ~ObAGDmxAFJ6Vhyz{&cS~34Cc}3)C7n}Co=u(- zv;kzR+QZYK#Ywl#dKpy(98PJxIrWvNxfYTtsZ#N~zd|J*MN~1u1%7GX(RXJoQz+FA z_0$d-dauH!qAqTYgE=i>9W@5N%Uquu4)5RA}MoeJJol$G0P~`em_$CeQWO^YrY}UoVKQPNXrRC3J-UHeUQkS-$9@k!@Z4-~x{xCMDS5e`zV(6#QZ?4f)&w^FY zq$+W2=v@~VWBU7@G4M!P%W%-)r;F0#k3YQLr(73+76=;wYjB!Cus54lWl z^-93J>UQA?$C4AI3toCi<v|QOu8|@R?YqF}lp? z`@N)omV4aI=@ZZQXjYUnW_eGu_WpXT#{|n>^{ODSm7U*m%qCd$s+RehF>esRBJ^tvE&^G*nAm0jS^KEw#XK@=>k<2_1Y1B# zq2TSZGAl7qBpQAug_JrA zu~k)x0s4e+UQ`(~knia|t)h?3-5HRyy;5J~e&l8n-NDMyRp-m$nu6l=?fw)$if&kc zFYHTCO5iT6?p~zGx~K{p(I%9pN5*e#6Ag&4Hb}n+`PHjRjLjLd|70+KrwCWDzSOYs zQA25)2o-~r$E0G+nW-FiP|zC}H3=xX)m=8Se)pT&Ai|oRf>mPQHv;JaAXvcy#uwsd z+&xiY+fy49p!%doUHgePW`KPwJ<_&uK#ft!GpmmQ#}%AnY9R?EVI)9x1oW6=V2~O*S`d^ys`vd7F{Sm`2VIUGvilK~c z57Qs*qU3g|Gq&)6vuNU=?3t>K<15qPsDj+%iEN-r?FT}|Q4K=uBgi$EqPk6Tum=`y zmoTUw?1_>p(KDt9ngv5R=bnrU`hz&fD{S+4hsaT^AKyFBQ+wK=^)DU!NDUmJU(L}{ zM4(F;xDiedt!Yq10Mp?8ig1;Y5ELD1Fhl~Z*}o(lz~g5c`bh{MT28_T?G1`3QhKO_ zggm&3^7f74Dty10h%fl5`csN*yEy*Su?TeiJ`wktu=$hyt-FZr{#qqP=caBf!gR4{ zl~|Ys_VPf+m~$`tfCr(yc(`s@RRw)$Ccu|g&}YPx4cUalrB@g0JalB|R(zke|Kb#< zV8vl2>o37b`2N#V8bi}{nq9Drx-6bLU-;MK%r}oS-aJkd>aKl1J5sGXTYLBu%H|t> z+7CnJIu&s64C*EU>FEFWKO3J-%AL(u`szd1Rt>1*D+|cTKHuK{Fe~l7&`f({B(DT* z(6})c@Dj~5;3b`YH8QM#AS3I}*?VtJt93Lc$oyXnX_ueRRguMuY-aukeKTjX;NF;4 z*ned1lk1a#XMNyz%2=Ulh&Tm#ORR?3bRMUvroLf532lZ95*qe2)C3-ub9uldB_?}F3Bx1?z8xD6dcswRAfRxH0eN8rM_n@INfN{90KoT z+Q6tfpjB2n0tShbGXMKkK~ZJK(8FvH_~yUG+8I!M^<>p7_txhNx?&i4l#jLGY)d#1 zRWcAppwuZVkAuUpNMli^tm7?XY24EU=}-2eBT=0J?K1iHyjApWQ2&?G5g$nMHN;pa zi>OIIp65y78(~4*ihYD6X*z^xdleQtKA1|ZK8u# zb|cP^ot3S6n_gn#S6E8RtS4_4P}x=MU=y9uyn%1!JM-K>^0pqIuUW?3_F&;Q{(?IJ zuXbFxr=}3R`i0H}wWQ=lb|(EuXhlwOrT3id8+i$;A=K;*O1A-7AkEPukJ@f6dE0w{ z2g(?GbQ_GeHk`6ih}EoUfutCm5UTs7jGZSa{ho5S3-UE5oz>2d(e1aR2Esv+V=p+V zOOJ0@V4<6Nv^U?@V2P`;4*bmR`h38tqqwO1F)WUoqZ!X087`ta@yYtFq)ANrnj+!a$T32EHuh6CI`AeXzz%NvEG2#Y_CH%$0(!uZE zr$0Kr)OkxaLzueseL}O$rR#u4=BHnhx(@uV_0Z8lbTXztzBG325@k#~Ip1-66y&vF z@;sBe)fVe;%sJIx{Jog&11;FI#iF;6R~Pb#oTW!+E!Dg~xF5@`WBewn7xa3PhfR1N d4ja&%qgGppHW9dTX)GF`O3C|`J?Wv?zX3^+at8na diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl index a9cee25..f7c9e2d 100644 --- a/piet-gpu/shader/gen/pathseg.hlsl +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -72,7 +72,7 @@ struct Config uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u }; static const Monoid _567 = { 0.0f.xxxx, 0u }; @@ -92,8 +92,8 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared TagMonoid sh_tag[512]; -groupshared Monoid sh_scratch[512]; +groupshared TagMonoid sh_tag[256]; +groupshared Monoid sh_scratch[256]; TagMonoid reduce_tag(uint tag_word) { @@ -360,7 +360,7 @@ void comp_main() uint param = tag_word; TagMonoid local_tm = reduce_tag(param); sh_tag[gl_LocalInvocationID.x] = local_tm; - for (uint i = 0u; i < 9u; i++) + for (uint i = 0u; i < 8u; i++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i)) @@ -547,7 +547,7 @@ void comp_main() local[i_2] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_3 = 0u; i_3 < 9u; i_3++) + for (uint i_3 = 0u; i_3 < 8u; i_3++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_3)) @@ -575,16 +575,16 @@ void comp_main() Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; bool _1263 = i_4 == 3u; - bool _1270; + bool _1269; if (_1263) { - _1270 = gl_LocalInvocationID.x == 511u; + _1269 = gl_LocalInvocationID.x == 255u; } else { - _1270 = _1263; + _1269 = _1263; } - if (_1270) + if (_1269) { do_atomic = true; } @@ -612,37 +612,37 @@ void comp_main() } if (do_atomic) { - bool _1335 = m.bbox.z > m.bbox.x; - bool _1344; - if (!_1335) + bool _1334 = m.bbox.z > m.bbox.x; + bool _1343; + if (!_1334) { - _1344 = m.bbox.w > m.bbox.y; + _1343 = m.bbox.w > m.bbox.y; } else { - _1344 = _1335; + _1343 = _1334; } - if (_1344) + if (_1343) { float param_29 = m.bbox.x; - uint _1353; - _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353); + uint _1352; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1352); float param_30 = m.bbox.y; - uint _1361; - _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361); + uint _1360; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1360); float param_31 = m.bbox.z; - uint _1369; - _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369); + uint _1368; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1368); float param_32 = m.bbox.w; - uint _1377; - _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377); + uint _1376; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1376); } bbox_out_ix += 6u; } } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl index 0f60d4d..9708585 100644 --- a/piet-gpu/shader/gen/pathseg.msl +++ b/piet-gpu/shader/gen/pathseg.msl @@ -156,7 +156,7 @@ struct ParentBuf TagMonoid_1 parent[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) TagMonoid reduce_tag(thread const uint& tag_word) @@ -432,14 +432,14 @@ uint round_up(thread const float& x) kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _709 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup TagMonoid sh_tag[512]; - threadgroup Monoid sh_scratch[512]; + threadgroup TagMonoid sh_tag[256]; + threadgroup Monoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 4u; uint tag_word = v_574.scene[(_639.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))]; uint param = tag_word; TagMonoid local_tm = reduce_tag(param); sh_tag[gl_LocalInvocationID.x] = local_tm; - for (uint i = 0u; i < 9u; i++) + for (uint i = 0u; i < 8u; i++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i)) @@ -615,7 +615,7 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 local[i_2] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_3 = 0u; i_3 < 9u; i_3++) + for (uint i_3 = 0u; i_3 < 8u; i_3++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_3)) @@ -643,16 +643,16 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; bool _1263 = i_4 == 3u; - bool _1270; + bool _1269; if (_1263) { - _1270 = gl_LocalInvocationID.x == 511u; + _1269 = gl_LocalInvocationID.x == 255u; } else { - _1270 = _1263; + _1269 = _1263; } - if (_1270) + if (_1269) { do_atomic = true; } @@ -680,26 +680,26 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 } if (do_atomic) { - bool _1335 = m.bbox.z > m.bbox.x; - bool _1344; - if (!_1335) + bool _1334 = m.bbox.z > m.bbox.x; + bool _1343; + if (!_1334) { - _1344 = m.bbox.w > m.bbox.y; + _1343 = m.bbox.w > m.bbox.y; } else { - _1344 = _1335; + _1343 = _1334; } - if (_1344) + if (_1343) { float param_29 = m.bbox.x; - uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + uint _1352 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); float param_30 = m.bbox.y; - uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + uint _1360 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); float param_31 = m.bbox.z; - uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + uint _1368 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); float param_32 = m.bbox.w; - uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + uint _1376 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); } bbox_out_ix += 6u; } diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv index fc63eb5375b579bd2fcdb56a795572c245b9a9e6..37c98470f4f2f05b53b865e6e1eff5a7d53a8792 100644 GIT binary patch delta 2552 zcmZ9O%X3sk6vlfV2_zVBOl0sCK$IB47vh6gG$KTlh{_``iHQn3MU>LAisJeoP;pi+ z+*``Vnw_O97pz^YMOiKt5tXNk{(e()DqVA{PWAVFefsp*J>B=t*Qd*8PM7;C<#kI^ zDy81EG#y>&n=6&ReEX07qQ5Jpcjn)ne=DWbm8$9X4!*X7-`ByP?cjSm_<;`oYR1P> zwRvvflS=7G^TSmqx=W?zZ-cwbJsVSZdas<)k>>c?oz<2@oTqKLo21oY&UKh`9p+qz zIoDy%b(nJ<%17Cc_sgldV*Pie(n9m6p|K&F;eE%trv9p?^e6EPY1dLYT?Aikx0e{) zi+&010`VhZ{_n(Mw#UV{Z;TW+T(1Sic7by<-Eg(R%*+ya zm~qFJ!q;F%J?iv;)noEra7=E^5q7BeW`K#Y^~)Lyhx&_~P?iy~*?nNOZY+Gxg5B)p zaP`=1EBUzD_N^dlgqtmPx?frI9625%^X)SyEZ+iV>Afw_Zt_5*G+i%lMIIn5;T4u| z!G`BxC3uEbJyu!=$4V{fI5l%)s#V~8mv4isd7iA}?03L9hmCJ1YQ>#kxwX8M^6+pM zSP$yqVGtZ1tnD~8bHl@GaPDCZT+M9`4|l`$uz#{vtOLuf<%>Wb9_|6p;z2z;+zSp5 z)^?nlx#3|wIQK9FS6iNW7>4WNgXvmvKUi)pUq15iumP+G_3*F}93HIgI5l&_!vo;l z!-H_OlXx)SJ~i;s))_jFke?yXk~}WAt;Id357Fnj^M&vT*vB)f{xG5DITXjHZ)%KB z)@zTVY|7Nl9>%H1P&N}^weP{>U^CS}YtQ)vSlv0d(8iqVPZDa*DULa}HjYo%Yuiw^ zX6p9FHxu<@2g-IL<{SZ=sqVU?o^8C6p91T_b(!y6`WdASKju75sOd)>Kgu&;Hy(e3 zt(VtMg!m=x4CA|6cXT((F2di-kB~gh#TeN8p?;n(hWGY4uob;;rwDUm>OJ86JUkCq zGuK&tmA(KEH}o&QGR9jw_9ETmgm(7Q%EQh+uy)kL&P!lRh8=Ul&dcE3&MR=WxTf0K z4-Y%^FWNciQ7R76eUQ+OpI~{|nE=mHLj6K}cP7D>3_Iq8ohfi`XBw^+H�{@UTPw zqMgH@vOTkOA11WpKLB~yc}-3U^{_Jswq)2b$8$H@sLk{j^RV2|>u|OF`l>l1Zkw(A m94E|^XB!KAgFZiQ@il%EuIAqX9owho_#djNxq9}~;r{^uKp;2( delta 2573 zcmZ9O+iz4=6vp?=v;_)9#{xsWR1mRZ@rtNeTdkGW8=#22p|w>hPl^{}VnR6j7Z9B% z;~$_weK7N8eCL^GV~F8ZL{u!GQh&e6taN84S=rzB?RD1v&OU3OGvA&n|9Gn0Qz>s- zl~O5nr`74`XFcXm+HS9>MepfYGMu9SHb6r z&%wj3{kjHMKSPfD_YYX@I`|tjUHl7IKM8jA>tKJb?%aQbnyZVcT{Md=L(UMNw2Ct% zwM{&rFi=AK2~CqenKHaqq3IaS8APjq)lalccYxJKS>-tMo#11P?-Q;ul8Q=GG%*(H z0_SE{!PWXRGppfY#vNM&-+~$SsM8HrkHy!5V{vPautRHa1eh3Gzpno2WN*=fvW|$& zUJq95!oq2cyV*Cu)nl`(V1I75b5_2IaI?iO_bdB1PmXP5zH{b;(tDRrTW16F5e7Sv!ATv>bJt_!+K+? zTHFSfTgyW!4-frdJ*bC=+ri<%+Fqw-Zg|)N&OK~}tGUhLVH;czW7E~*PO#isz6j*u zVLNyZ59;CJE^v6Tw%4hd8y*J0xre*qYU?u(_rUdVbhcUyg5}oorvgqmxL zW6h!Z!fdVj1jp)_#w+%)Z>HUOu_A$!C&P!nJsE3`G!Ilg==7gPt z;M~qDaJ4w5+BpOdJIt@xnY5LPDW)e0?f40nhn;Eg93|8*wsvO*Y{{@=PS}|R=XMUm z)#Aiz=QViPVSdHVoLyGD!So!V9sdEy!_K^Olu!>l3t&rz9dqov=o`nd_a->E_ZD0& sJIn`@q6OmcIm*w`a5kc26nvXGzjg6(J_1+s-+?#a+>u7-+*dpP2hy1%2><{9 diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil index d585c96acc50a24c7d0b7674f55413ef78e681b2..245c4922dfd27d0d2d5f30b89554bd048b453099 100644 GIT binary patch delta 1033 zcmYL|UuaWT9LLZ7lbb)ix%t<$r`9e>o3vfjaTltwVj;ICv?;n}Hq_`gxM?@>&sJjR zm=7JfH!(H1I4ZFsvWIlqEw~Qiq8R&h@03wS$GS1ba3LgRNajT9i{=Cdx>4T@&k~+ulNM-rw6L8p;NwR4_Zgo zqC10vH{?IKtOkGw3NVEOP-B41^hhp;^1%aA$wmWkGXJq^6jLSvJ&GpjC_a~ePcuwv zA0aZ4Lpy7IQ(9*FqtnEQI|B&k{5e9izz)wzDl`IJlp4hN@}lhcOor22l*77ja&3Z~ z^KjsDr#msd+Pt?=>WE^n8a#5Z$vU2f~juOqacFo)vfR>p!@;q zhB}G`#4lTtBJJEb`%nTNJ)IgDj%PINs(~N?$FhI(+StBtJwr4NNLkv5?@>7oM8~Tv z(33=0bIF?Z>2fi5pb5e&50oKsdscD68t=sKW&}<#CA6s6EYGas7-cSqSI0E$xKNY5 zyrU{ZGRj`pM}fI49+Q-=tX0<#I0h3L%|Jt7a=tmX??ppkIWekoRvH2mt%q&dL`yCf ztP4yug|Ho2#pnMCEY<}w=DI*8b~R1~M%z6{BVhMkEc9IF6Ij&pb451>*SQW)IDuJy zwKXFUJG?b`6mFi;CI{PuFcBN3_}>ILi@`nC^AiBY5)_Wc_zYgw^12yOBVi>TW`Vt| ztpQ$#zD*zyY@V0b5~;M@dMWi4Z{`a&LFu~ou0=n?CuI9GE)-_f({KzP?6f4!RC@Ko zIEr7$={F7SQAjxf7m1_4v7}ptz*mRJCioF-Uft}qbIOm9;X^A#&Ykr6ny$pxV6CgR zf%S`6?o$WP7WGvEuDJ9ZTV$DC7?r0Jh7^QA6%)^xo|cW~Ts#H)5XyE_=j9I>%n$9H z(SDmTFOpk>K2z5`!}+a_H;fLMacmGeVH&T|mAYHkn)%iy&TK7Xb zNA51hsmn%PTr0YbTicavbX_zZUAhetbr76gBXuA{M5!w>=xCt~Ittsx#(!Sk&-eYl z&+~nFd9PSjErw=aW0TvUed?VrJo(>~_rBAHKlvB}034q-e2d_h;ILQqXLJ#ngz-0{ zU1%!E#qrZwy!#%)1{Ly)_ZpPudRn2dbR}(*-9J zd!tj@lD9JtkKfPEW)?Ol*D?z$SJA@;$APlsO!8(!a(X6Nojx`~-U-{)*KL{G4?eO- zawWRmso&)q)qR(vzlP4W^j)5s(rP1Ynd894>6;C%TaKiB&zMMJAHwyDBKCVYoqC+`d~lec1ZwPa7i9j$g7M1up3tbkz0=ml@kKhpQmf-N(nLy za)f!X7ah{^*IP9!cDhcE$u`I&4J_EIzQj%ED6rhxM9dBsSkY3<~ zZZy&!NW)lBL=Ox>@KsLj27@^@4rLW-2IFw10__-tyz44gfCnZ5KUy<5TaI4c9_6Xf zPOBU-P?C>9p74aFQBO9Xe?ZWS4b5ymKTy0*SPfV{f0L0j$2!X5u+LLKQR8;lI3&yB zsUYZ74hbso;^4tOxD~kjLjeg+4Z3ph>NWsmqi|GE@uVys+I|#ZR`wDf{HwTU?c#ng zXBCc#4$~+XhN>DHeSTg!Mzv8~z%s1ICrl#&S$5A#FlkNYmx$FEgGH+)CW=}G!CerZAQjr> z7gj> uint(2)) + ix; uint tag_word = _150.Load(scene_ix * 4 + 0); uint param = tag_word; TagMonoid agg = reduce_tag(param); - for (uint i = 1u; i < 4u; i++) + for (uint i = 1u; i < 2u; i++) { tag_word = _150.Load((scene_ix + i) * 4 + 0); uint param_1 = tag_word; @@ -111,11 +111,11 @@ void comp_main() } if (gl_LocalInvocationID.x == 0u) { - _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); - _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); - _238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); - _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); - _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + _237.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _237.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + _237.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _237.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _237.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); } } diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl index e82577c..6c0a64f 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.msl +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -103,15 +103,15 @@ TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& return c; } -kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _150 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _150 [[buffer(2)]], device OutBuf& _237 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { threadgroup TagMonoid sh_scratch[128]; - uint ix = gl_GlobalInvocationID.x * 4u; + uint ix = gl_GlobalInvocationID.x * 2u; uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; uint tag_word = _150.scene[scene_ix]; uint param = tag_word; TagMonoid agg = reduce_tag(param); - for (uint i = 1u; i < 4u; i++) + for (uint i = 1u; i < 2u; i++) { tag_word = _150.scene[scene_ix + i]; uint param_1 = tag_word; @@ -135,11 +135,11 @@ kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device Scene } if (gl_LocalInvocationID.x == 0u) { - _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; - _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; - _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; - _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; - _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + _237.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _237.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + _237.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _237.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _237.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; } } diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv index 6dc35b845e6222173a4d994a2e1bc950f1404231..9fc105fcca998a019fe17a6483ef5775a4eab006 100644 GIT binary patch delta 1744 zcmZ9MOGs2v7{||@ab|SV3Z}ux8d7l+ZG<)v3PxZpL|VBB5reUyWMmo_F6xfZ(v8e6 zf>s3uK`{{IA`DvevX{N>q1i*TOndpt?Ejm&r_2NY`~AP?`Mz`Ry(epWYD)aR%8)UR z37bOW?=5L`CNDPc3sm1X#w1b)<4MVYDKO6@J6e;i?VdjLpB^ydy(1B4JhK=m)w9GY z)3Z7VuXC_B2g}?Er8IV?c6PQI6I4-jR-!YtA6vlq&F8#HJZL7x4;*f3N+b`}Clj3= zdv@hpS(3dW-kwNyQ7357S?%%mmPA|Ik$CE#-SO=4WHOdu7B(iqmv`ifu(6YkhMIhI z9irdvqDXYy3#*hY6m~0Bo=~I;Pl14EtxzxRqhwH6ClEQ287Yc1Fr!}}g4rqrmetF} zt)F>U2^&2BZ%Hb$6BU7LYi(Q9LY?D3*1er#6(OQ6K4+e&C2~G$?;*^;pxnV4gHuTv#4k;{;MQDYEjqD z74xow83A_cE3qP%z(5qF5(7|Vt3r=T`Z5En2g`X&_4ekXxFX)t<@!+e>YA+IJuWl2 zF4Sq)Zpj-$pO6-6v4jkahk$zn|drx9=8Ge z4yzHa1}wjxSJL~0v|uNHEsotbF)U79PLBac1O_CI#?Xg_Ohne)*y_pR84Lp{hW8h@5QmD1>LY2lVowh_&x~OxL~-w;4k8Q8)?CgH6@PS z8_PA*Qu1>J4Ij@pfe$Py5WLrW^}dS}%VL8b$sYpo#4;8-aW?0tI63;9{^rbxlaoIR z*RP3;ALIN Wij!0=kc3VkZ<0OYv20gqW#|taI2En{ delta 1789 zcmZvcOGs2v7{||@r4*HR$BJZEX^LivikpK?n#p_{OA0?|MxiGch0@jQF)@W$nT4Vjd4tY z$uoh@qFQIPtA2N|;=VB^(YimL)EYDa^Gs_?W3sW?v-|$BgJ!sMAmR+CXM@H1Epkfq zTM>d+JLvVH5_d!?wQa3i+nS6CsVFv2qOEl=TG05-=bR)SGNa=A4mH#zk_T&&iMEy< z+j6a_S+g_VoJby_PKf5SPjkGvA<@)yINmy^<3f)olcfn}VPg_}IY+h#8~ZZRaGj5? z!}QypABhetgs_;9Cu~=$JmE+=o`C0BslvNQX=2xeO=@Q{IB zCRji7E)rG>_nH{aBI&V$fV*0i8J}Y*73X5H*C^|gRw|%v@oWZ1 z!#<{!!BsRH|5YnKYEjo7LaBEEbOF$*?>HKT1S(Kaf?*)E4GKM~)t|n!xVMZ~TCaLG ziu>f1o^SVMx|U@8?p4WvS*x{cyVmPMw~!JlwFw>E5YVlCQ@lsGq8$9Ugi``~%F}L( zlZWrTv@=@o2ajR^_^OQ?G?zu52p0vM6yu^f5pmD)) zd%bjjj`c+x-5bj_<1%t{g$y6hH-QfeK z32}1tIsMI<6elNl6s})gGu2$ulsHLzfOey4agt&J8(|Y?H<}SAhkq!W^HZFhbpkoq j#Mzu#@$rrN2?#dnmpDlk0!i2e@+R3M9?Bdjj)ngK3E2%Y diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil index 1f27f265c5be01104cb77bab185d0e73358e5fd2..77f12e6db0ec75e4d05be227c033a9793a2cc1a0 100644 GIT binary patch delta 251 zcmV<#lW-7l15pq3lj9I36W9kZ zm}M}AV8p-xKoAU&mbeFdlN=G~8G}#2h(QB@AQ((=0Twe6ki|h3WI+}*QG}Bn5>E_C BS~~y$ delta 235 zcmV|o5&7LaVf&--J{OD4-5eWLoa|slW-7l13?e>lj9I36Tk;Bm}M}AV8p-xKoAU&mavC> llN=G~8ADINh(QB@AQ((=0Twe6ki|h3WI+}*QACp+5>E_0QIY@v diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl index f1ec389..7ad806c 100644 --- a/piet-gpu/shader/gen/pathtag_root.hlsl +++ b/piet-gpu/shader/gen/pathtag_root.hlsl @@ -7,7 +7,7 @@ struct TagMonoid uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; @@ -21,7 +21,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared TagMonoid sh_scratch[512]; +groupshared TagMonoid sh_scratch[256]; TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) { @@ -73,7 +73,7 @@ void comp_main() } TagMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -97,16 +97,16 @@ void comp_main() TagMonoid param_4 = row; TagMonoid param_5 = local[i_2]; TagMonoid m = combine_tag_monoid(param_4, param_5); - uint _211 = ix + i_2; - _78.Store(_211 * 20 + 0, m.trans_ix); - _78.Store(_211 * 20 + 4, m.linewidth_ix); - _78.Store(_211 * 20 + 8, m.pathseg_ix); - _78.Store(_211 * 20 + 12, m.path_ix); - _78.Store(_211 * 20 + 16, m.pathseg_offset); + uint _210 = ix + i_2; + _78.Store(_210 * 20 + 0, m.trans_ix); + _78.Store(_210 * 20 + 4, m.linewidth_ix); + _78.Store(_210 * 20 + 8, m.pathseg_ix); + _78.Store(_210 * 20 + 12, m.path_ix); + _78.Store(_210 * 20 + 16, m.pathseg_offset); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_LocalInvocationID = stage_input.gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl index 923e77c..65e3741 100644 --- a/piet-gpu/shader/gen/pathtag_root.msl +++ b/piet-gpu/shader/gen/pathtag_root.msl @@ -67,7 +67,7 @@ struct DataBuf TagMonoid_1 data[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) @@ -89,7 +89,7 @@ TagMonoid tag_monoid_identity() kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - threadgroup TagMonoid sh_scratch[512]; + threadgroup TagMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; spvUnsafeArray local; local[0].trans_ix = _78.data[ix].trans_ix; @@ -111,7 +111,7 @@ kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID } TagMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -135,12 +135,12 @@ kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID TagMonoid param_4 = row; TagMonoid param_5 = local[i_2]; TagMonoid m = combine_tag_monoid(param_4, param_5); - uint _211 = ix + i_2; - _78.data[_211].trans_ix = m.trans_ix; - _78.data[_211].linewidth_ix = m.linewidth_ix; - _78.data[_211].pathseg_ix = m.pathseg_ix; - _78.data[_211].path_ix = m.path_ix; - _78.data[_211].pathseg_offset = m.pathseg_offset; + uint _210 = ix + i_2; + _78.data[_210].trans_ix = m.trans_ix; + _78.data[_210].linewidth_ix = m.linewidth_ix; + _78.data[_210].pathseg_ix = m.pathseg_ix; + _78.data[_210].path_ix = m.path_ix; + _78.data[_210].pathseg_offset = m.pathseg_offset; } } diff --git a/piet-gpu/shader/gen/pathtag_root.spv b/piet-gpu/shader/gen/pathtag_root.spv index 88e20b907c6f45e137a705f3462f2a62d9151c8c..3783b49cb1351c02c3e5aa12cbfd4f26c93e4936 100644 GIT binary patch delta 1869 zcmZ{kIcQW-6o&82B$=oLl@y8=3W^|_La-50iIBoZv=u>8L{tzZ2$m!16e8+~`x5uv zB#!$!F1RM{`x>>fwQy_n`^>vK4*1~b{Ac;^J?GxbdtKL6SCdJP$cB&#eM5ElxVmO` zD%H7QNTz8=2w~BZc}>mO%1{vwU>7$wH!e*4L;v`ha2&sR(NbzNp$MN3b&{5^}v=?Zr`O(#2O|_krcWoQrvs z`+@PvH0kx}$J{}_v}e#{+)w;pG#ZB4+$B4ft+O17kiw-_2R9pYO)p8DPvt>}I~GZ&Bb65MzkLf;dJ8*9xKu{LV%GS;Rp4}g53 z+pE}XAmm$Xhvu$Bil8Hz^9I~l18_F|D?pXY$m1Bd;KjtxV($RY))sCjzT^0Z=myh( znmEo~FU5oce;EXN-zxSV@Tt0xIcxmw7>^Cy{~iD9Kfo`7_-Y=)tuY=qeFS%%dfwR# z_79J7rh`F1!OY}Rp1>{ka|-{i&w%yT+K|3=QS%(GCIzDA1zgQAphn-i-x~H0FM*;o zh@w|;MWcWsee2>zuiG(>#b8S5G b9(WPNBYqF>0dcw_+*s+;yIOct`!oFuuloSY delta 1885 zcmZ{lNo>tg6vpp+uM;9oV?kJu5Q)$oi$)BUSl9?#k%$FBB9%sBB~Q9zA?k^ln#Zc) zndee-Ypi)z8(Rz0hTre?fAwkNC11~XzH`pK=iGbWYkT#Z>i(JZ@bVB+p>L=NpBwwn zPNk188l0K8D}+$DbiurOY-uP7hp_|5@RNA63eE7g_nC>fDXrE_1)X4P&Wr4Ja*mSxyYAYJ5dhF7FQ7)Jbg5;JcM*^{tc zx#i`%vo_CAVLVy=ur^=p+kCNa^U)s!cYx?y?-0elgB1H6iC@e3Q$ol$Rg5alZbV!U z9O)O0=&uDi@Pnb`AJ839I9jqm*aBY#s=x$t9C0g9kN!6J8qmh3#Jw?w>(h_1!*DfYUZsAF9f3C_F|Sp>hVv-TLRvs2u*jebn5_PjiR0wj z|6RpVV!cn|7@B%);yB!O>s|-Z5#Fm@MFSFtO?rK;4Qd|980FZ4Og2AoLK)la4l=)vGy+9+SqhE+!N2*nLiW? zPT3E8A4~%l#exTkZoWSx`h^#aeF!Rn>licV4pu!j@Tj2o_>b{vDEs#@JwY(ZK-{zw z?n1RZzZux4U^*BG%%7QD)ib!cejMUC_60ECTpQ9iFV?(-cd|eKRCdE}ENWsKd^d4?e35Z1>;1-Pq*66>_ f&8!+S?jy|GjbF@9aJ4vwF}32S*Ou=X@H_nnD5wHn diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl index 010e714..5231c1d 100644 --- a/piet-gpu/shader/gen/tile_alloc.hlsl +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -261,7 +261,7 @@ void comp_main() for (uint i = 0u; i < 8u; i++) { GroupMemoryBarrierWithGroupSync(); - if (th_ix >= uint(1 << int(i))) + if (th_ix >= (1u << i)) { total_tile_count += sh_tile_count[th_ix - (1u << i)]; } @@ -271,46 +271,46 @@ void comp_main() if (th_ix == 255u) { uint param_4 = total_tile_count * 8u; - MallocResult _477 = malloc(param_4); - sh_tile_alloc = _477; + MallocResult _476 = malloc(param_4); + sh_tile_alloc = _476; } GroupMemoryBarrierWithGroupSync(); MallocResult alloc_start = sh_tile_alloc; - bool _488; + bool _487; if (!alloc_start.failed) { - _488 = _92.Load(4) != 0u; + _487 = _92.Load(4) != 0u; } else { - _488 = alloc_start.failed; + _487 = alloc_start.failed; } - if (_488) + if (_487) { return; } if (element_ix < _305.Load(0)) { - uint _501; + uint _500; if (th_ix > 0u) { - _501 = sh_tile_count[th_ix - 1u]; + _500 = sh_tile_count[th_ix - 1u]; } else { - _501 = 0u; + _500 = 0u; } - uint tile_subix = _501; + uint tile_subix = _500; Alloc param_5 = alloc_start.alloc; uint param_6 = 8u * tile_subix; uint param_7 = 8u * tile_count; Alloc tiles_alloc = slice_mem(param_5, param_6, param_7); - TileRef _523 = { tiles_alloc.offset }; - path.tiles = _523; - Alloc _528; - _528.offset = _305.Load(16); + TileRef _522 = { tiles_alloc.offset }; + path.tiles = _522; + Alloc _527; + _527.offset = _305.Load(16); Alloc param_8; - param_8.offset = _528.offset; + param_8.offset = _527.offset; PathRef param_9 = path_ref; Path param_10 = path; Path_write(param_8, param_9, param_10); diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl index 3906536..49bd1c4 100644 --- a/piet-gpu/shader/gen/tile_alloc.msl +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -272,7 +272,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M for (uint i = 0u; i < 8u; i++) { threadgroup_barrier(mem_flags::mem_threadgroup); - if (th_ix >= uint(1 << int(i))) + if (th_ix >= (1u << i)) { total_tile_count += sh_tile_count[th_ix - (1u << i)]; } @@ -282,36 +282,36 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if (th_ix == 255u) { uint param_4 = total_tile_count * 8u; - MallocResult _477 = malloc(param_4, v_92, v_92BufferSize); - sh_tile_alloc = _477; + MallocResult _476 = malloc(param_4, v_92, v_92BufferSize); + sh_tile_alloc = _476; } threadgroup_barrier(mem_flags::mem_threadgroup); MallocResult alloc_start = sh_tile_alloc; - bool _488; + bool _487; if (!alloc_start.failed) { - _488 = v_92.mem_error != 0u; + _487 = v_92.mem_error != 0u; } else { - _488 = alloc_start.failed; + _487 = alloc_start.failed; } - if (_488) + if (_487) { return; } if (element_ix < _305.conf.n_elements) { - uint _501; + uint _500; if (th_ix > 0u) { - _501 = sh_tile_count[th_ix - 1u]; + _500 = sh_tile_count[th_ix - 1u]; } else { - _501 = 0u; + _500 = 0u; } - uint tile_subix = _501; + uint tile_subix = _500; Alloc param_5 = alloc_start.alloc; uint param_6 = 8u * tile_subix; uint param_7 = 8u * tile_count; diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv index d4a6e31710c5991852f7811fe336203f50b6c276..55d62adefde50e2cd91a9a6a32d822c8d0f4267e 100644 GIT binary patch delta 2680 zcmZ9NTWnQD6o%)V_LR1^Kq*jb6tY$O=Dsri4{=sR;?FA5yiVkyr5DpTJiffd+^Ym%vt~cuQhAdtUde8!JRMd ztgpzoFA5K`1RDDIdbrVvh)9;<8YoX6wz@oc`c^ZD}Gm7!32wr+J&zd&8KfG@v@7eW(!Vr-%~ zI=F4@p$E>dc{y5I!QjZlltvX5p%rZ;t9#u=IV!`o|MOCReZ$mn{ct|lJ2kuHp2EO~ zRDO8{i?V`s1OHq~bw{6Lt0xIEj6ev4^2ACVMN>BYzt{#dtJJ z{tj5pxLYV%pV}FUjKyk?QS3BDOo!4;+v47Lk;-eHfjc#p@*dbyj5Do#rnI@erSpA| zyNd=1rKj3kk`F+x@m$v5kHBi4V*7gt>tlcQeauj^zv8&%Pr=sqM7HJ6 zz-oILuITYq+lb$<;dAiv5LTrQ1NR&U|+>Y41Tk(Gjz86d0(vgmuw&IvW zdy>;^%nxAKbvMSF-x_|x`dEW8KQh#;K|CLFDOStcyDa<+>0Ev&%iyl`r^E^V0GWI|6D1jNEfzP7Q5gplFOdY<&yJGgJPn8 zz$P$}e~I$w`xmSh&*(YuUWR&%A#ZrPEG7pxM%@kg2UY?0&~0J;bYd;G5>bZ>``;=T z7qBiQt^%8IBPYZ;i@@Dj-Ya#C zT}+DKyxbU1&>iTD!5gxq_hpIK!Iz-x8^~VaQm~p&ET{p_HIbwhS+RnFR(C6bjo==V z>GS@O>$@G_3b2Lg^DdFw5lmRg&z)HBAm{1#UB^SKwTMYyg#Y zvE>eIC&Neom6=}~b6sUGO%jDJu%*Ti`)V*jLEVo~-p(+&i;3ghwP1^lyc=xpcp3D7 zjZwFk9_jUY&T@LDAx`voUPHiohCB4qmW#cxO(NcaEfO1F;C1j;xbep6ch`2nLbb85 c>%kWa>tfOk2#UMQ+NxIG){dzs*Y02XAK5Ka`2YX_ delta 2668 zcmZ9N+iz4=6vp>VJEg6yP)ey~6k6I!rA2HNNMkS|(V#vUiw{0%QznGO#%c{ML=(+X z<3B*=*>@v`*i1|`F_D`JR>k54uXw*kyrAW#h`-h!c`lrQ zm;ECXlZ73V#N@)M;u8(cE&t$g`b0Y4*;U1t8df9=vXwci!{zgM^E}@2Kb{L$z$eEi z3u7bO#&_s0tluqiNmO{+FY8cVkn|Y6eQfjT70-`qIq8dvZhX zQSU{f+wpt|t_C+TZcMoyQPbZ*HX0T-GkW1e*fuoh_l26UlGyd7$6D^n4B0S4O=A;n zSmfGM^Z0B9Uy9#0#&~&EvA;E+sMy9zk8OUD+0< zyg1$3Sm%!91ErbPRrQHI*iW(BvHP%(;;sQ70IToioz%h)f?r{%pW!XNiuE~9yuL#W zwX-2x#v9W-uc37?B7YtH0^<;QStkjHAzMkL>G@|41F=#bgD=G zCcMOWJk9eKSk1VbC|aM|If|SswhB<(-;rPUHgaX1N8yIY<=+9jym6+IA1w}c=DTM= z?kFyL46J5HjnsFn_(*3yc^Bjw&r+T}eIKmm8Mdd#u|D=x-v-Z z1g!Ql!xcTeYJ>Rw7Cr+n31NBJVSpcnqJ4-99542F<&)1*^%~`wm)n9m(tH6{b7Pj` z7QTe5Ka=vWz-n&i8?2ApQU98u=61y99mfAH_+Bh|i_=|o$#-zIJ@{SYd$3voY{fB$ zc9yej%nxAKbvMSF-x_|x`dEW8KQh#;L3|-(%2+LJ@8a+?q=&JI6Ka7ci^FZL$uB4; z8PBI>{R%cx-KBoV`nZ(7-xz8xC5~_J53r};xirt8V6_5s&2t9qV;+5{8GmQ^QcNU? ziOzye6u+8tV70AGj>q;q++(XR&if0j?mM2v%I&KQy9SG0@)gK6=S$|C2E{}dz$P$} ze~9wvy9ic`XOz6<^Qj(VGH6d%#AMSLbvNMOS2fr}w}th~6YH@#L>(^77hLAz8n6qA z{hu}AASc8*wcuW?@y6*l&VOosF6jSa9awD(I2Kxumgu;Xcd{ABWne`od8;%sb~7n{ z^KxT6LARqX1ox*&?@tr2fiFVWH$)!)b(_FyKCz(1a4w0YT#*$k7-)4j6SxH2M>2ih z9&&xR;#&%~Fn!)6ayx_EEjcCgy+bf%Y@T3l%**rRS%FF3iSc49rU+S6>g z8+$dwNB^qSuZ_8S?4?Pfa1Gc}gHSVmlYUnkdrA$hdJ{F(XF@1DjxzA|vN!Nv55Q z96}-w943k=L`=TTEIIizb3LQ<=3 zgA*E9Hv4fNVPu-nuvvsVnORU^sYF20)Bp+AAPv^w1&peJlRNlUF*-~(=2y47$Yzk# zz@WgvHbr9(qv(Ny8BPo#hhSnMnkNp5o&kx8JcfyhXkBR(y8#lr!puH-2LC$FgIXsR zG?_JUB~A7h(6DzFPdLKB=peweqWnQ?07q4#i6Vp6NmdC%2L=HaW-qzAMiHh0315ze zq%uY~9svfHgAJ?L3oTj`WZZOXqBlLoqQF{Gkbzyy> zB4(a6$sR=sLj#~u&8&7Q%9}HUe=*kgbX^f}U|qRLT`{_8^??pY<^~0ch)0jKDuj-7 z321K=(%w`gyvcO~SC51V14mP%tKNo1>Ic_wY&O!~?4<3k80`kv^h`;+M(9dcfcDlP z?QKP(n_73kG_~q&d8B@DEywmE?d?t4;)>Cy;F|1TC221ZdeSvPd*>wWT}6VMj!uB- nI;yurN$cP`j@_HIcOTL=V}Hri{IJa-{6NCA#z-a>1_%HEy{xqK delta 550 zcmZ3Wvp`40CBn&h$_Cpx2Oq8~`1i#6nb!~1I}=4Z8JRZDD`w>2U}O_)Qe+g|EXlN! zQC>;pfx|=*g#w3zA{h=7#YIva8ZXRXRIp`ss9|e3#54ILb0g#I&BZKdnFW~Bq9!H3 z$nPqapMA2x<9^8G0FG}QP7KW)2PZUeZ1&?k!pJ1hxLJfdnOU%lku|VLHITD4P@y%* zfK@7BatGfkMuW-5{OWcG*$k2z7!)|zrfBS86g_Y-!-*ke6HF{b^Ta{XGaxaM%P=t! ztt*XUH$Y-nKC?}p!M~2PQR~ElCbI^vgvtH_8urTK2}c+h9Rzq*ls{+<;HXM8QDo3M z$SPszz#zcF>?K#%D8f`A;mgsGRKn=SBf!9NuwfN@p+#$gjGHji0=dcC1P(AZPA(Tb z$kOW9;5ykx=mqmfsjZXMh4qDUn0eA9dlV%M4S+^9v)Uy+-kc%)i?P0=>xzH_>xxC{ ziqTH14|F&(Hz-I%Jo== (1u << i_1)) @@ -184,11 +184,11 @@ void comp_main() Transform row = monoid_identity(); if (gl_WorkGroupID.x > 0u) { - Transform _383; - _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); - _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); - row.mat = _383.mat; - row.translate = _383.translate; + Transform _382; + _382.mat = asfloat(_376.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); + _382.translate = asfloat(_376.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); + row.mat = _382.mat; + row.translate = _382.translate; } if (gl_LocalInvocationID.x > 0u) { @@ -202,20 +202,20 @@ void comp_main() Transform param_10 = row; Transform param_11 = local[i_2]; Transform m = combine_monoid(param_10, param_11); - TransformSeg _423 = { m.mat, m.translate }; - TransformSeg transform = _423; - TransformSegRef _433 = { _278.Load(36) + ((ix + i_2) * 24u) }; - TransformSegRef trans_ref = _433; - Alloc _437; - _437.offset = _278.Load(36); - param_12.offset = _437.offset; + TransformSeg _422 = { m.mat, m.translate }; + TransformSeg transform = _422; + TransformSegRef _432 = { _278.Load(36) + ((ix + i_2) * 24u) }; + TransformSegRef trans_ref = _432; + Alloc _436; + _436.offset = _278.Load(36); + param_12.offset = _436.offset; TransformSegRef param_13 = trans_ref; TransformSeg param_14 = transform; TransformSeg_write(param_12, param_13, param_14); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl index 9c7e6b7..6a55784 100644 --- a/piet-gpu/shader/gen/transform_leaf.msl +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -127,7 +127,7 @@ struct ParentBuf Transform_1 parent[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96) @@ -214,9 +214,9 @@ void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref write_mem(param_15, param_16, param_17, v_71); } -kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _376 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup Transform sh_scratch[512]; + threadgroup Transform sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) }; TransformRef param = ref; @@ -234,7 +234,7 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27 local[i] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -251,9 +251,9 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27 Transform row = monoid_identity(); if (gl_WorkGroupID.x > 0u) { - uint _380 = gl_WorkGroupID.x - 1u; - row.mat = _377.parent[_380].mat; - row.translate = _377.parent[_380].translate; + uint _379 = gl_WorkGroupID.x - 1u; + row.mat = _376.parent[_379].mat; + row.translate = _376.parent[_379].translate; } if (gl_LocalInvocationID.x > 0u) { diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv index e561e9dcc5b5efa2b6e077e953d952b180e52c0a..f418bbef9311d0422c800bbcce4d1f3d78e2a24b 100644 GIT binary patch delta 2823 zcmZ9NYj0Fl6o&V7+FFVTZKFXOp@^u7mlzXCh=Byu6cQsXifFymskRh*b(jh&b-1Z` zL+T)kh=@15AOaN;75w5KAbvA`@CW$K#7h$spXbck@svqd*89F|?X~w=d!5tY3%?d- zY$CTlSaSVf_qvhdKzngyY;@Hf)~E}mO0KbGX^E)6p+bq=`XE;-6$`@~ z2MeWMu-Q)`b`*-e!=)8teQ5P+mCnYN^5(RH=XwR4J=bf~cs@+PHzXDFVLv#Y|G>1o z+Hi0huM3Ca$%*<#iq$oB;Yd}xX?SCIvDYJcfD_Xe)`rs;@H3N5Gk>lr+mo3rxdL0s z!q|m{aDP!FEvU`G+9^MVP!DdyR>JLyzqy!eB9C?2Lq5^+#9eQ%uBvYPzp#K{OgJ|= ze#w^FT&^zx(UFeIw#(*FH?{&Z zX00~XJPKA{#;Lq%d5rad(aK5CYgdTD$HDV37dis+B{aaIz2L=o*BCR$T=jo=#Q3QC z-#ExHxmvc_eZ+3ZdP3!wxsA832P(zw`@xos&$a>Vvzh01^8U(rYwMyXKz3_u+L-~c z+8#DE)7U`eZ0r2wMO>2hWeD}=G{cFO$6FZz`;zt$_dMSEC|vz$dgD)m)y(n6H-i0n ze+ErGUhcDCH7|WR+xvEpW1F$N(TsaA-}dJ)W1Sq#FG)Go zfXM^x#&1O%%ym~6+Y<#7UF4@{Te=9BEZT?P4zV+_k8<<8;P_?XgN;`fxwgPiq z%zr)A&G%a+kAuAj>|k4mSoO_FntsuhW<>o12|VHDJ;u|4Dv> zb^`Ok?121Kj{aN3jlV^3LxZRBa$~mdJFqP?b_OqBLg#9TeGk47Z*kWdcNiT^=Sd$s c_X9ZFsk07@V*CUcJd0(geedL*3y#eG54^fawEzGB delta 2817 zcmZ9N+iz4=6vp>-+S(QoXrnxK@^la)s|LzfoUtE;2`1! zF9?jFB7))tP(YxHST9K8ldrz|;2)q*CSG{*_d9cTJY~|A^?l!3d+oi~S?BcE!taI2 z`Kmcrgpdmjp)UNmXY#VfpV99m;ovVh5>)VP$qZ^jqYmJ&PTFx~zEgU84Z=f(jZf%e&jT8%m z!~KPkZm`*#5ZepI?!l48qdjP~YUPfGrqbrLg6Dc2oITgB2|OS6!uyhn`EURn&wp@2 zUUfJ$f!Bm1@Z>~2L&eIPnsBV5-7z@aS?u;m9^mYRh1KEwWqf>W+N58qO7>(DOJ-n8 zSQxu72ktLwqy@F9SR3V^LZ}7L!j{18iofZYYa)+z+Co0j^2A+ludc6b`@b-oU`)6$ zw)Ltl)w!zjrN-5zcCzaU-pWdMT@J6q?uIwuy#;@v1?~-6z%#MyDCbElTIB@tS>!#9 zuOi2jT1W1w)QoRUF>7y0HeN#DIHCtJVJuB(#NWoT>hTxKe>DABa!8rxr0iXIhcwF_ z(kypKv)tjya)&2!cXxOqx4r|E;a$Ng$_U96MSe32oTA8YggZfz*TEg0$c;NZk-L6; z?Bdmj^V2$zmtj7hOLU~8yz$zZlYKa?jKG!n2g(*jXRbeL0N;v9Ulc}cU%j1NA>OLDLTQf zGiJ3m{A1W+%$T)iv8Di4U&N_=(()L49E?^aO6{cjv(kX$X>>~+NM!MZ~E=jjdedVxxD`#!KG0{S+ zILk|4MA?DGKD`81i+y?p?9Vtg=yR5#yml{^mi90B^ve&eca?HD}VI&fmvyb#ok}PtqL6$Q;M* zQ#AEEPF@~)QI*~8uMjyvva8i zOuFPh$*Y!M4oUI9|Sh&Q%fn4tzV_;;tLd fmDje+nD;#}+ms*RYVl<^rk0)SePi=xAD!|a=;KGS diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil index 1ed5e0e3dc4b5a533f1829c1818e590d6ddede9d..63df381194f6336e5c7fa13118ab5248ff4ef02a 100644 GIT binary patch delta 270 zcmV+p0rCFWB-kVrL|8&YWXuVB(4rr|An%((ZBwOJkrasm0kNK20SJfy1`&cG0SU7K z0<{4I5)>dL1d0O?(gC5`7nLH^NtY{6YMEb6lk^An2qFN32*Hqm z1heu8zySh~fU~p;QUeV^lw5&SLSYn2fd-UBlM@ah9>Fw(K{o~?1_=OyU=X1JRgA$H zkim#S0)QYGL}^fyQVuZznv;YMB^5CSFk*lJAP5GO9h89+n89d508}EAz78h=bCc!{ zS^$wSq_v|X_Nwu8358V0K{hiD9n@41Bn49vswg-0}u$^i;6p+PA7CH z?r^3~fB}>A2lfa;0D}m@kbnrY@(92I0uX_-vRhx2!KjNlYkE> m0_PNyst-5b6SSfs@V=F99Hv{t;Sc99^)k73TcF(jdvwL>&*7g58b9Pvp`LovgughL*@3Uv5ak#NO8mTKW#xY5g zFw0Jtw^V&M##ETNiAh!p#u@A#a9f{B%jOPudy=l4Q<8n1US}q>r~k2CGb{Um*V%il zC+iVIK{!^XGbcN4X8n`S%7Za8FRmq@+nw(r&JAqhviV#WoNJKv zjx>4MeBZu3#hwz5G0okXtk+MSxJme9@#;;m`k733rlZ^QvP{4(@R1@PWUxzesJ5gn z;>`MkC8-UxOJbc+Av9@(Imx(CBkZMBf3P&w%m7h=2pU~4u&iAx-TG<1OjsTIOSAt3sJ3<;N(&S=;*id&GR0l00U&A4#bXC)a=WrO(NlK7}aenbvz?bRL-Zb;aT z?OCM5>)~7gJOxU_cp7IzY^J2>Mn#^HEb}idpRDC`)ps34RZC9&XHO4ZS+O}& za9&tc!BO8$tt;njcBzGPlK1>AsZ`-Un$V~i8hRky5JnV*{ZMES;7^3yBkAPfyC`m0 z^05GaS(wx5j!ClE8{ZQFZeAmyhh0R>c+kd#sByuz@l5*CHlE9-4S3ter1S{^ALXt~ zz7Vbo_^4GS`BH%2uC_QmZ0ma^{em%tz{ycUSm$f$B#_9z0QOd$3Ps-tYXoxe5u-u@ zzn~fItjGy9vQrW)y@9u~uL%t?XZbqyW5f+Et34W z!LS}-i`7CBe7zl6ri*&epft&s&$>OH*eHGxS zhHk6U65B z>8CNK!9-11vQaR>vBA+u_Y-M_;lnvcG9u?I$)P@{FYnv4|Je~!k-gvP8$3QxaEPHG z98bP)Sa#G@+*5&N2g7DTTvu^;S8;&2h-VX5C=U0-MNG*2fY1|B1S1|g*4tBXj&?YO z;?UmR)tM3jV>)yBf-^#$s7bg}(dLb?`k8z#-(sPI_%;?5yH=n54cuk_?9eLH9)JK0Co>QP9He1tVgCfsJ*1Ka%X0p7#de6P6W=US(T6?y9ap|T|YHGN3ABwmje7Yjm7CeDEd}dDUgGY7!~sP z1?^~eMNVsw9g?8-2HwfOBD5KIT{>BMFBQtR{~(=`cDIkx*^M~bVG~2Uo01$YlKiOT zCxJ7x=k?j=`X$N72S@&Oqpc`#Lxg=p=De%6yCbhtKwoBl4z!%gG+Ccse* z-B$Z0oo|RdJK}HY@OI@t(yJ?L0lof#fUbrQF(VMXPebsH1?41|hX#Cq;c9{x%YN9z jk<+NSh;(u!yd1vKn4l)co(VQ_oJnys=s zb8)hhga`HrA^?L3!H|Fivw{er0RoVKvn2`_0}2IEhyevspaCUOlSU0!C)bS_AOHx0 z0ksB~5^SRh0Wg{%1fvOqFq%LNWo$cQfB+x}22>n$lf(_KD}_C9gWwNF69Ql~K?p_@ z24OUT7>)q;2w;x@_6T508HF7&KmZT~11c9vlbsHz0p^oN5A*|}5-F2C5b6ZRU--q7 zD-kXe7!5F(WiWw2o2D)Y!N^K0kD&;5*-d+1Po>wlq5g^0000000hxt Aa{vGU delta 302 zcmV+}0nz^0CDMy)W%}jMucRe0y94~ikrasm0Fga`HrLI8sZ!H|Fmvw{er0Rj+#vn2`_0}6#0P!a`EAO#vwlSU0!C)JG@AOHx0 z0ksC05^SRh0Wg{%1fvOqFq%LNWoSEMfB+x}22>n)lf(_KD+NAqgWwNF69Ql~K?p_@ z24OUT7>)q;2w;x@_6T505rrKwKmZT~11c9xlbsHz0pycL5A*|(5-gKF5b6YmUxdb! zD-kXe5DhSxWiWw0FBYJY!N^K0ictu5*-d&1Po>wlq5g^000000L(gH A8UO$Q diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl index 35961b1..d447db6 100644 --- a/piet-gpu/shader/gen/transform_root.hlsl +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -4,7 +4,7 @@ struct Transform float2 translate; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; @@ -18,7 +18,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared Transform sh_scratch[512]; +groupshared Transform sh_scratch[256]; Transform combine_monoid(Transform a, Transform b) { @@ -55,7 +55,7 @@ void comp_main() } Transform agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -79,13 +79,13 @@ void comp_main() Transform param_4 = row; Transform param_5 = local[i_2]; Transform m = combine_monoid(param_4, param_5); - uint _209 = ix + i_2; - _89.Store4(_209 * 32 + 0, asuint(m.mat)); - _89.Store2(_209 * 32 + 16, asuint(m.translate)); + uint _208 = ix + i_2; + _89.Store4(_208 * 32 + 0, asuint(m.mat)); + _89.Store2(_208 * 32 + 16, asuint(m.translate)); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_LocalInvocationID = stage_input.gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/transform_root.msl b/piet-gpu/shader/gen/transform_root.msl index 2c58c06..8b4b2a1 100644 --- a/piet-gpu/shader/gen/transform_root.msl +++ b/piet-gpu/shader/gen/transform_root.msl @@ -62,7 +62,7 @@ struct DataBuf Transform_1 data[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) Transform combine_monoid(thread const Transform& a, thread const Transform& b) @@ -81,7 +81,7 @@ Transform monoid_identity() kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - threadgroup Transform sh_scratch[512]; + threadgroup Transform sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; spvUnsafeArray local; local[0].mat = _89.data[ix].mat; @@ -97,7 +97,7 @@ kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID } Transform agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -121,9 +121,9 @@ kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID Transform param_4 = row; Transform param_5 = local[i_2]; Transform m = combine_monoid(param_4, param_5); - uint _209 = ix + i_2; - _89.data[_209].mat = m.mat; - _89.data[_209].translate = m.translate; + uint _208 = ix + i_2; + _89.data[_208].mat = m.mat; + _89.data[_208].translate = m.translate; } } diff --git a/piet-gpu/shader/gen/transform_root.spv b/piet-gpu/shader/gen/transform_root.spv index 7824d0914594eb18dfe4cd7dac5988c2b784e2d9..1578842ea8970357fca4c1f6c7d85bd116242059 100644 GIT binary patch delta 1715 zcmZ{k*-KPm7{hh=CWOMxjABEi{M@G~vc*MG%a1Q$a*> zT5PwhY%y~hTWqlu+iW#kEDP#S=%x*Vp5Ju7I9)XG@IKGGeBb+?_nc|ZAIpz8!C6_x z1WdLGn-8s#wd(??KX8m=g5ra6Oi)JYvS?{VMd`97(dx#=XjAi++E}zUULT9r#1paV z=J=)#?lx?g^RMmO<+&|hem!%U8o0aRB ze(`uF&oTF9ZOub*{~5#X=U``GJiXJ|8Vm&7P-u0@ex)$Z99_9k_J9!9Xfvf>c`-eU zjDG9suq<1aTR zJ(~5!Dc&dNUZGO>sYcLygjV6ZhJ=5TpOFEhdIy*`H;^-}?w~wvLb0-V7>9)2!jP^( z9~O!P{N3_xkUb($N9>z+aZHvU9S}Pz;G+Y$y|JAelPBb97wnBE#r-#SNOL3pZC)3r z#Hl03X}e@k3)=-^oHa@IjDUZ!x?(|S8#^m5UosQyl)zzw=fztD3fUaGQ2XZ43qr0y z4KZ?BWbyydh;~%${~mUiVmpKa<7Ve%FS;m-%U<6t-Xhr9E{QW6d0dC?c1I$4vwOsN zS}op_Nr~f|FOW$r<>pSAzQDipLht#c6`ky5@QSXe1gWAmTz+7X~&SXPd|uh&y?UM~(JMh<@l_-zBv#LIp9xi}5kx8Q|1 zIr#0RBjTJ>qQ|gC_nVgztArT>1uN8Xmh7lNGC!u^Kk&6cKDji6CXbr1>>GiafItoZ qlVbuk_Ekia$8rYCMA!}su~WSh_fPR&8lOFY7(V}Y2h#I$o1EXAxBy-N delta 1703 zcmZvcIcQW-6o&8QX)-a65JWAs5Uhe?p(sjFNT-5gz{ageA*c|PAXq3Q*k~b&j9Xk2 zwigkyH`V6=OMcw?~e{=^c_CHAV z8m?n@2;CLgFG=Ngj9Y;t{-lxZ`P|?U!yHgJSQ;Lh^EYxs>vv&k2Gi23*bUzVZgZ$t zvjBliOH^nGI!`;mUu)3*<3`_84xW5rf0HXgvz3Sz}JcyYyp=vLIeF74tF z+&q1rwh?(4tOxo$%Y_~R+NaZ1OR7iTF?4K=1*f~w5S=IBO~6F=q&}7Hdgw_|3Cz)F zOcPT34@QimDnHT4|1SF!z71ev$oJ(2O=^c~isd`tO(4$J33oQ*ET`U)FCQ|pz6<7U zHKjLm8m?_TFjAj38{3R@2V0Q;Ha3+;9qtUM0OtB2)#JW;+}F&rSbD%oD!To1K-)y% z_Aem)+`hi^K%152@wAJD?O(!j5yZ`2hHG={HztV= (1 << i)) { + if (th_ix >= (1u << i)) { total_tile_count += sh_tile_count[th_ix - (1u << i)]; } barrier(); diff --git a/piet-gpu/shader/transform_leaf.comp b/piet-gpu/shader/transform_leaf.comp index e158c50..c51dfe6 100644 --- a/piet-gpu/shader/transform_leaf.comp +++ b/piet-gpu/shader/transform_leaf.comp @@ -10,7 +10,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/transform_reduce.comp b/piet-gpu/shader/transform_reduce.comp index 4b72b11..e59d559 100644 --- a/piet-gpu/shader/transform_reduce.comp +++ b/piet-gpu/shader/transform_reduce.comp @@ -9,7 +9,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp index 492bf04..c4d6745 100644 --- a/piet-gpu/shader/transform_scan.comp +++ b/piet-gpu/shader/transform_scan.comp @@ -3,9 +3,12 @@ // A scan for a tree reduction prefix scan (either root or not, by ifdef). #version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs index fb32f26..87fff1c 100644 --- a/piet-gpu/src/encoder.rs +++ b/piet-gpu/src/encoder.rs @@ -19,7 +19,9 @@ use bytemuck::{Pod, Zeroable}; use piet_gpu_hal::BufWrite; -use crate::stages::{self, Config, PathEncoder, Transform}; +use crate::stages::{ + self, Config, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE, +}; pub struct Encoder { transform_stream: Vec, @@ -52,12 +54,6 @@ const BBOX_SIZE: usize = 24; const DRAWMONOID_SIZE: usize = 8; const ANNOTATED_SIZE: usize = 40; -// Maybe pull these from the relevant stages? In any case, they may depend -// on runtime query of GPU (supported workgroup size). -const TRANSFORM_PART_SIZE: usize = 4096; -const PATHSEG_PART_SIZE: usize = 2048; -const DRAWOBJ_PART_SIZE: usize = 4096; - // These are bytemuck versions of elements currently defined in the // Element struct in piet-gpu-types; that's pretty much going away. @@ -183,15 +179,15 @@ impl Encoder { pub fn stage_config(&self) -> (Config, usize) { // Layout of scene buffer let n_drawobj = self.n_drawobj(); - let n_drawobj_padded = align_up(n_drawobj, DRAWOBJ_PART_SIZE); + let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize); let trans_offset = n_drawobj_padded * DRAWOBJ_SIZE; let n_trans = self.transform_stream.len(); - let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE); + let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize); let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE; let n_linewidth = self.linewidth_stream.len(); let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE; let n_pathtag = self.tag_stream.len(); - let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE); + let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize); let pathseg_offset = pathtag_offset + n_pathtag_padded; // Layout of memory @@ -230,14 +226,14 @@ impl Encoder { pub fn write_scene(&self, buf: &mut BufWrite) { buf.extend_slice(&self.drawobj_stream); let n_drawobj = self.drawobj_stream.len() / DRAWOBJ_SIZE; - buf.fill_zero(padding(n_drawobj, DRAWOBJ_PART_SIZE) * DRAWOBJ_SIZE); + buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWOBJ_SIZE); buf.extend_slice(&self.transform_stream); let n_trans = self.transform_stream.len(); - buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE) * TRANSFORM_SIZE); + buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE); buf.extend_slice(&self.linewidth_stream); buf.extend_slice(&self.tag_stream); let n_pathtag = self.tag_stream.len(); - buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE)); + buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize)); buf.extend_slice(&self.pathseg_stream); } diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 31fbf9e..96bbf03 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -11,7 +11,7 @@ use piet::{ use piet_gpu_hal::BufWrite; use piet_gpu_types::encoder::{Encode, Encoder}; -use piet_gpu_types::scene::{Element, SetFillMode}; +use piet_gpu_types::scene::Element; use crate::gradient::{LinearGradient, RampCache}; use crate::text::Font; @@ -25,7 +25,6 @@ pub struct PietGpuRenderContext { // Will probably need direct accesss to hal Device to create images etc. inner_text: PietGpuText, stroke_width: f32, - fill_mode: FillMode, // We're tallying these cpu-side for expedience, but will probably // move this to some kind of readback from element processing. /// The count of elements that make it through to coarse rasterization. @@ -69,14 +68,6 @@ struct ClipElement { bbox: Option, } -#[derive(Clone, Copy, PartialEq)] -pub(crate) enum FillMode { - // Fill path according to the non-zero winding rule. - Nonzero = 0, - // Fill stroked path. - Stroke = 1, -} - const TOLERANCE: f64 = 0.25; impl PietGpuRenderContext { @@ -91,7 +82,6 @@ impl PietGpuRenderContext { elements, inner_text, stroke_width, - fill_mode: FillMode::Nonzero, path_count: 0, pathseg_count: 0, trans_count: 0, @@ -160,15 +150,6 @@ impl PietGpuRenderContext { pub fn get_ramp_data(&self) -> Vec { self.ramp_cache.get_ramp_data() } - - pub(crate) fn set_fill_mode(&mut self, fill_mode: FillMode) { - if self.fill_mode != fill_mode { - self.elements.push(Element::SetFillMode(SetFillMode { - fill_mode: fill_mode as u32, - })); - self.fill_mode = fill_mode; - } - } } impl RenderContext for PietGpuRenderContext { diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs index 1683cac..014cef4 100644 --- a/piet-gpu/src/stages.rs +++ b/piet-gpu/src/stages.rs @@ -22,10 +22,12 @@ mod transform; use bytemuck::{Pod, Zeroable}; -pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage}; -pub use path::{PathBinding, PathCode, PathEncoder, PathStage}; +pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE}; +pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE}; use piet_gpu_hal::{Buffer, CmdBuf, Session}; -pub use transform::{Transform, TransformBinding, TransformCode, TransformStage}; +pub use transform::{ + Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE, +}; /// The configuration block passed to piet-gpu shaders. /// diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs index da773cf..5328a84 100644 --- a/piet-gpu/src/stages/draw.rs +++ b/piet-gpu/src/stages/draw.rs @@ -30,9 +30,9 @@ pub struct DrawMonoid { pub clip_ix: u32, } -const DRAW_WG: u64 = 512; +const DRAW_WG: u64 = 256; const DRAW_N_ROWS: u64 = 8; -const DRAW_PART_SIZE: u64 = DRAW_WG * DRAW_N_ROWS; +pub const DRAW_PART_SIZE: u64 = DRAW_WG * DRAW_N_ROWS; pub struct DrawCode { reduce_pipeline: Pipeline, diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index c9d2c60..6c524a2 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -39,22 +39,23 @@ pub struct PathBinding { } const REDUCE_WG: u32 = 128; -const REDUCE_N_ROWS: u32 = 4; +const REDUCE_N_ROWS: u32 = 2; const REDUCE_PART_SIZE: u32 = REDUCE_WG * REDUCE_N_ROWS; -const ROOT_WG: u32 = 512; +const ROOT_WG: u32 = 256; const ROOT_N_ROWS: u32 = 8; const ROOT_PART_SIZE: u32 = ROOT_WG * ROOT_N_ROWS; -const SCAN_WG: u32 = 512; +const SCAN_WG: u32 = 256; const SCAN_N_ROWS: u32 = 4; const SCAN_PART_SIZE: u32 = SCAN_WG * SCAN_N_ROWS; -const CLEAR_WG: u32 = 512; +pub const PATHSEG_PART_SIZE: u32 = SCAN_PART_SIZE; + +const CLEAR_WG: u32 = 256; impl PathCode { pub unsafe fn new(session: &Session) -> PathCode { - // TODO: add cross-compilation let reduce_code = include_shader!(session, "../../shader/gen/pathtag_reduce"); let reduce_pipeline = session .create_compute_pipeline( diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs index 4383c14..b21712f 100644 --- a/piet-gpu/src/stages/transform.rs +++ b/piet-gpu/src/stages/transform.rs @@ -33,9 +33,9 @@ pub struct Transform { pub translate: [f32; 2], } -const TRANSFORM_WG: u64 = 512; +const TRANSFORM_WG: u64 = 256; const TRANSFORM_N_ROWS: u64 = 8; -const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; +pub const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; pub struct TransformCode { reduce_pipeline: Pipeline, diff --git a/piet-gpu/src/text.rs b/piet-gpu/src/text.rs index a47c614..dec3ffa 100644 --- a/piet-gpu/src/text.rs +++ b/piet-gpu/src/text.rs @@ -11,7 +11,7 @@ use piet::{ }; use crate::encoder::GlyphEncoder; -use crate::render_ctx::{self, FillMode}; +use crate::render_ctx; use crate::stages::Transform; use crate::PietGpuRenderContext; @@ -172,7 +172,6 @@ impl PietGpuTextLayout { let mut inv_transform = None; // TODO: handle y offsets also let mut last_x = 0.0; - ctx.set_fill_mode(FillMode::Nonzero); for glyph in &self.glyphs { let transform = match &mut inv_transform { None => { diff --git a/tests/src/draw.rs b/tests/src/draw.rs index 916f14c..d79a9d9 100644 --- a/tests/src/draw.rs +++ b/tests/src/draw.rs @@ -38,7 +38,8 @@ struct DrawTestData { pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("draw"); - let n_tag: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 24); + // TODO: implement large scan and set large to 1 << 24 + let n_tag: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 22); let data = DrawTestData::new(n_tag); let stage_config = data.get_config(); diff --git a/tests/src/path.rs b/tests/src/path.rs index 7c5388f..6f1f61a 100644 --- a/tests/src/path.rs +++ b/tests/src/path.rs @@ -62,7 +62,8 @@ struct Bbox { pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("path"); - let n_path: u64 = config.size.choose(1 << 12, 1 << 16, 1 << 18); + // TODO: implement large scans and raise limit + let n_path: u64 = config.size.choose(1 << 12, 1 << 16, 209_000); let path_data = PathData::new(n_path as u32); let stage_config = path_data.get_config(); let config_buf = runner diff --git a/tests/src/transform.rs b/tests/src/transform.rs index 1c15634..6edcc3f 100644 --- a/tests/src/transform.rs +++ b/tests/src/transform.rs @@ -30,7 +30,8 @@ struct AffineTestData { pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("transform"); - let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24); + // TODO: implement large scan and set large to 1 << 24 + let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 22); // Validate with real transform data. let data = AffineTestData::new(n_elements as usize); let data_buf = runner