From e5dd9ae01e425dd16889edd2d2ccb2104fac591c Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 4 Jun 2020 15:58:38 -0700 Subject: [PATCH] More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. --- piet-gpu/shader/path_coarse.comp | 93 ++++++++++++++++++++++++------- piet-gpu/shader/path_coarse.spv | Bin 13264 -> 17364 bytes piet-gpu/shader/tile_alloc.comp | 8 ++- piet-gpu/shader/tile_alloc.spv | Bin 8736 -> 8860 bytes 4 files changed, 79 insertions(+), 22 deletions(-) diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index 5a4b78c..7cbda9b 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -7,9 +7,10 @@ #include "setup.h" -#define TILE_ALLOC_WG 32 +#define LG_COARSE_WG 5 +#define COARSE_WG (1 << LG_COARSE_WG) -layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in; +layout(local_size_x = COARSE_WG, local_size_y = 1) in; layout(set = 0, binding = 0) buffer PathSegBuf { uint[] pathseg; @@ -32,7 +33,22 @@ layout(set = 0, binding = 2) buffer TileBuf { #define SX (1.0 / float(TILE_WIDTH_PX)) #define SY (1.0 / float(TILE_HEIGHT_PX)) +shared uint sh_tile_count[COARSE_WG]; +shared uint sh_width[COARSE_WG]; +shared uint sh_draw_width[COARSE_WG]; +shared vec2 sh_p0[COARSE_WG]; +shared vec2 sh_p1[COARSE_WG]; +shared int sh_x0[COARSE_WG]; +shared int sh_y0[COARSE_WG]; +shared float sh_a[COARSE_WG]; +shared float sh_b[COARSE_WG]; +shared float sh_c[COARSE_WG]; +shared uint sh_base[COARSE_WG]; +shared uint sh_stride[COARSE_WG]; +shared uint sh_alloc_start; + void main() { + uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; PathSegRef ref = PathSegRef(element_ix * PathSeg_size); @@ -49,6 +65,8 @@ void main() { case PathSeg_FillLine: case PathSeg_StrokeLine: line = PathSeg_StrokeLine_read(ref); + sh_p0[th_ix] = line.p0; + sh_p1[th_ix] = line.p1; xmin = min(line.p0.x, line.p1.x) - line.stroke.x; xmax = max(line.p0.x, line.p1.x) + line.stroke.x; ymin = min(line.p0.y, line.p1.y) - line.stroke.y; @@ -60,6 +78,9 @@ void main() { c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX; b = invslope; // Note: assumes square tiles, otherwise scale. a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; + sh_a[th_ix] = a; + sh_b[th_ix] = b; + sh_c[th_ix] = c; break; } int x0 = int(floor((xmin) * SX)); @@ -74,34 +95,68 @@ void main() { y0 = clamp(y0, bbox.y, bbox.w); x1 = clamp(x1, bbox.x, bbox.z); y1 = clamp(y1, bbox.y, bbox.w); - float t = a + b * float(y0); + sh_x0[th_ix] = x0; + // TODO: can get rid of this (fold into base), with care (also need to update `a`) + sh_y0[th_ix] = y0; int stride = bbox.z - bbox.x; - int base = (y0 - bbox.y) * stride - bbox.x; - // TODO: can be tighter, use c to bound width - uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); - // Consider using subgroups to aggregate atomic add. - uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size); - TileSeg tile_seg; - tile_seg.start = line.p0; - tile_seg.end = line.p1; - for (int y = y0; y < y1; y++) { + sh_stride[th_ix] = stride; + sh_base[th_ix] = path.tiles.offset - (bbox.y * stride + bbox.x) * Tile_size; + uint width = uint(x1 - x0); + sh_width[th_ix] = width; + uint draw_width = min(width, uint(1.0 + ceil(2.0 * c))); + sh_draw_width[th_ix] = draw_width; + uint tile_count = draw_width * uint(y1 - y0); + + sh_tile_count[th_ix] = tile_count; + for (uint i = 0; i < LG_COARSE_WG; i++) { + barrier(); + if (th_ix >= (1 << i)) { + tile_count += sh_tile_count[th_ix - (1 << i)]; + } + barrier(); + sh_tile_count[th_ix] = tile_count; + } + if (th_ix == COARSE_WG - 1) { + sh_alloc_start = atomicAdd(alloc, tile_count * TileSeg_size); + } + barrier(); + uint alloc_start = sh_alloc_start; + uint total_tile_count = sh_tile_count[COARSE_WG - 1]; + + for (uint ix = th_ix; ix < total_tile_count; ix += COARSE_WG) { + // Binary search to find element + uint el_ix = 0; + for (uint i = 0; i < LG_COARSE_WG; i++) { + uint probe = el_ix + ((COARSE_WG / 2) >> i); + if (ix >= sh_tile_count[probe - 1]) { + el_ix = probe; + } + } + uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0); + uint draw_width = sh_draw_width[el_ix]; + int x0 = sh_x0[el_ix]; + int x1 = x0 + int(sh_width[el_ix]); + int dx = int(seq_ix % draw_width); + uint y = sh_y0[el_ix] + seq_ix / draw_width; + float t = sh_a[el_ix] + sh_b[el_ix] * float(y); + float c = sh_c[el_ix]; int xx0 = clamp(int(floor(t - c)), x0, x1); int xx1 = clamp(int(ceil(t + c)), x0, x1); - for (int x = xx0; x < xx1; x++) { - TileRef tile_ref = Tile_index(path.tiles, uint(base + x)); - uint tile_el = tile_ref.offset >> 2; + int x = xx0 + dx; + if (x < xx1) { + uint tile_offset = alloc_start + ix * TileSeg_size; + uint tile_el = (sh_base[el_ix] + uint(y * sh_stride[el_ix] + x) * Tile_size) >> 2; uint old; uint actual; do { old = tile[tile_el]; actual = atomicCompSwap(tile[tile_el], old, tile_offset); } while (actual != old); + TileSeg tile_seg; + tile_seg.start = sh_p0[el_ix]; + tile_seg.end = sh_p1[el_ix]; tile_seg.next.offset = old; TileSeg_write(TileSegRef(tile_offset), tile_seg); - tile_offset += TileSeg_size; } - // TODO for fills: backdrop - t += b; - base += stride; } } diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index cf0d4b9d10d48076020e768911e4ca121658920a..58c2ab56e76cc7c8831abfe34dc29b89b54c0f4a 100644 GIT binary patch literal 17364 zcmZ{q2Y_Bh6@?#UH-r#+Zy})uNFt#ZDT`o05?T-t!DW+Ok|mqnkZd3*C11Pbr19p_I3C6 zFX)>!+<$VPaifa3{=r!Ti+am|P9n-0I;o3^RfqNr&zUj2U}$dNG%_CEcLLf-D%R&U zbixS>`-Ur&k<7O}-q?;>H+DX=4|g8(r?hg}&-u(}tnU8Bb$&HhX<ZC2^v$j(^r3}F5>iEZUyrCP7(w`}cRU2A)ebzjNRQEkxVut4jRdo8Q4quLmLME^j2 zSB#N+l`&)C&OzN6x!25@LyNW1ydT5lT8QmrPmRag?^RV(P`R$-h-=Nev)T-7Ju`}R zRr4xq+5+Ch)#J;(77q6;U@M_@RBG1LH`x11jC*9TZ?PfNxNfx#w#n0d%7XskzTBZ@ zMbxD>`=e1OoqWRfb^BkrnU@>J`#FK!t$B^8wg$J>WkfXz%oZi4quSA!o>S^)+I_V% zyxA}NF5*FPZPmQyvz80{X4koQZAo>Gy}{(*NoeKVzs0u?Kd8lb2tTaF9lGv+REzHv zeq4*!!#a;%c&qk~>U{A0o&`Pgu=*~ zG3Gw9Qgbgh8h3G7tb4M@99YNtGuF#Bc^Sb`zg!zs_oJaR1=R3kQbg zzjAMT}4t|J^O_P&L0tKD?m6msz_G z54HS}>X=7b+RUDXeKmgqeoB9D{X(_RQ!Rfal+NlIv|b*D?nd>q@lUO|Qm$kFhVC=f zJ!@#u;IMU!tX^ovmNNQ?>Sgr)y7rOPrffXd-mS++e*NNf>v3)Mxpu~SCX-r||=3w)x zjpbf#fz7?3A=4JfrvHH~G?*cDZ@TuUk|8%%}B=IxAu50*Pz?}u} z2AkjUW?`MT_f39c!(BuByAC}1uZ%BWL~}gnooCT``^?GBa~ake)ttB7=T1%kwOGgX zUZ>6*z%TP*pK))3yVs1r9c#S%LVicX-6Qh*8txustmEiXp%UE{sF9IP+#A8g{)t?zSXZ)UN^qcRiVDDSf5%7PyajN);p1v@R?W&cRjkAwli-Gq&g?2evY0;@jYUVzAycq#QPfltAhI(@NB`2 z_jiiK>wmuBzK32axPD#*uf*R6_FSL-^7Y#I_%`mjzKr)=Uvkg(CHH(^^8MQQ^fvB! zzl`_HAMT!C+{Qi2m;NPf{G5WjUY_d{@B7koeYl^ep6kPXX09l><9lw8zrJ5>{FyfH z8M}=499?qH%q91{TyoFICHI_Ma?i;n_pDrU&&nk~vW*{KaOdZ_Iq`DO&L#K!9Pa!* zLxm64&O2I-I_W48|-^gJ$0)6iyXU`0??1JS-3j&{x@IS0N5YNq9BRyH z>}yWBG0C|SxXd{QZqD2XbFPf0o}8rAM#I4c9IaV3BCc3^ctgHRj0`pU!*Uh0nIo1X{)}po?>!9m9i*+!^x?uCD zTeF`XYRR)c__b=@09{|=Hf-YBYu*T5UuxbM%uoG&s5$f}$5`-d)w~J1zB8Me$AQhG zZq0tasU^>*;MCm3Sl(0Tx*58@#BJWhS#ue;1-ib}yd{{QdY#RoKRLDnr{=C=tO@A) zvd&wB&7*G3eg>;0&$i&#s(B*1zQj#x;@WGTjIJ*=zYfe#z0T&)pB&qPU#sRR==!qG z+k?%cZq1$()RJdMuzA|=`A+Ei61Q^`=kr+Z=UveCWsP_^|fZvgAt zm*)HLjaV<=zxwv49YFJ)Cw8p;uv2NiPmR}SOvXMC?EJ&u1h$rQn`hu4urcb%r&i`O zZ~9LIJEpOFW93<|gTdJk<^4GXU0-q>3bt?7<1jEk^?K+_z8T=;>%yP@N1*G={(3X` zaGLsint2@wHm|?U2aVJ4}#17LvZU%|M}o~G~?_qHzxfTfXn_1;jVr94}(vl8E1dF zG49J3soQ<7P5r0hTa}jiza6aR{Ik#A0p_RvJxu=~YvJ#br-Ai3&SI=QdEW_MkCwde z0;`pI&w%q&Kim3~_e`)p^PY~CC+}I{@wDW9H(0IA`yMzyb-ntN_iV5}^De>4llNTk zHnikD53E+^Js-|bU9bM+eJ@y_dC$Shy&n^aeIMAp?sssG@}|fH{n@h9y}2 z>3aj%y_x)<0=KWjjcDqLxv7aM*WqR~_0;fbFh6w-j$saCtikd0r-oa=?ziOs47hzA zZbeg1%xz6fxem9Zsi%g|g88Xya13)8V-1d{-x}P@UjXNQj zsXNX+Shdvob#PhdH{izj%(xEU1gkmb!`N@p9-`ULd~#!w@7rMW`B~?C;yZAkOV4t7 zHqHB8G<})N_rT7#3!k~H{rhnB@E?G!&3B0YAA*fhPo5uv%~P()kKyWPH2r=8wy%0} z{S>VJC@ph;46N>2TJz&z>sHTt{0#gE&3U^Xa^q6-&%y5R@Lzxz({g{7f{jtnxKD!B z&G!UWZhvd|B{=s>{wp~x^*jxJie{Ys<;JA{uffL4J$pU}o=(f>!*9XAp{eI{@^@f< z>UjNUvL5#RJ@{Fg{&`J(e*mld`-1iT5zJ5hcOJ*qZ(Yw|^`)*qgUfmS1#V2{^;htp zXvW!JZcO_B9c;YZz3~sQ&vf`d!Aoq-9(W#XjC$7nUtsloM*SOXUUkR(8&)l2{s&yv z{36_#)cg|o1)6d8mm4E~8S5VP{Nj)27k~ai{mWxV(7g0_fc0xLb|=^z`L1RpT+P2V z`TK!i({> z3f9Y7^sP)&vlem2UKL!9y&Ak6dv&;Dt7q&r!0LG}*94nKJ!`)f*tOS}vDOBwhpz+n ze)(Q-udWMMGcNaA&3lmAwf}D&*TYxNWqo+LA2xtnlX2#k8)JU&w`-$KeH(&Zi`2If zT+Q{!9#Sj!koPUx-*{k|E=rR`0Qif30QgV^F**~>fcu~ z?@3^F=@;zwg zvIcSb?hCe-@Hc?X^#J?8cj|s%|0b-SXK{b9diLlW!RAqSK3!O~=U-}&jR(CCq#LBJRaUH|4;>#Y5Pn$e%1KU62 zcY~d)y8REwswKV$Y`v*%CfLt<^~B5qtK0v0tUSIxaQfdxzZ2khVAYLr@6HCR+rJkp zkFOuBmV76Ilg~5nT=-L1^~4N-)qM`kKM$SbH8>f5H&)#kYdQt2o@aM4SS>kD1(!9w9d1o$H#NNj zY>ayL{ApnG=(Db2tXguu6P&u{QqQ~KbFk{hc#b>+te!q+g3CTj;PdFCo<3)R)zjzQ zV8@={5lMW7Kojem|HY>hE~;=ewAT!TQYQy}SgRJuCkJ+&z}xbuR^*XA;f6 z#v5au7hv_LuFJt?ogai-XV&QourcbX^Fv_syN>!(=ZC@iQsJ^>bkJs2k(n`aIZm%X{Mvuv+Hw1+e|p zljkn5`kl1IeGzQU`tEPuDR+a_JzKgCH-pvu-1!pLwbpw6`3m+PnwS1BH-2sAb063o ze*fUv^?tDb?#=HW^d;^Aus+AU7b}nNtKc%n*Wk&aFL4in?Hm4eu=nv9o(lKHH^BDo z0{eMsGsb?7`ykdB=iy;amoZknRIe!2jO-}XX{2^F9Ie!E;XU<4+ zW0Ld7VCR_ckA4C+-u;m})Qn4?N5QVU-y?5E+)u%3*8CV&ZtNELeg-z5zQ?ih-MCwr z>Iwe04c0#Tyf1m)welUXuB|rX{~SD-mVN#Uu-bFvavXhX_TLU`|Lw6a^5)x>=D+pw z-y^I-^BP0jg?^*3E7Oc$6`OzOJ;A%u97lip?p5Q~TDul4 z-g-Ch126OL3s>`*Ox`zulUIN8?g!TAJ#oc!= zR~}y<*gfKBnP-<1Xln7#279mLp95Aql)XM0+fTEPdfsVs!9El2OUINOmz?v!-sj{T z1gphA1onQ#KOd}?F;4>9M?GUM06XRY%`xT1CFer0_bSimFxdF*X~xTqN&LxR??Gxf z1*{hTVz6r&|EXZL)cAI=ebiIqJHU?VzZG&!xpC%P340n%|4Y2z#CaEv1g}N&T7%}h zaCPjOG~b15V>7RJHM(n`TF(Hh=iPHASk1X-?<@gl@90n8v%vbSNt~LjcRiZdI<(ZZ zF4&sZ$NGLAg*~6<`#Ck94K}W<@f^5%-goDM)qEcP9JBvuWNYbdJ%jat(@!o;p({` z7lYNZ7p*}p_uzwIwXDq*V71+8xqlx5Uruuz{ptH*us+w!xya-D2)NAqQMg)H-CF0x ztH8;tKYc$2)@NSVL>}MA!DZfS;A&H8srM7$a>yWNmK-tKCFP%%{QDoIJOH)yh1dX=0M+R=ApZ4rPaIguRVs4USfAL~!eUk0mZO}+wF+n(ke&7qd~d%$Yh zxA%h8vTyGL-%qn1$5FTMir5Ee`h35R#6H+)**{-}XaC#_c6>GCGXB@V<@gW5)v~`G z|Le4jub%P0(fBg{H{og-|6y>(H?QNX8JF?D1un;b1gTE_ngIOChw@zspW r_>Y3i@qY?en?WJT>F;d&7SLsO1`| literal 13264 zcmZvi2Y_BxwS_O4nIu3cfk3Dsg+M3)LN5=PL~tkp0s&)G945&m3`{0tG6@Mqih%H* z1>aLq-$NmY3aEgB6dOcQv0+#2qN0Lb=>qTj?td3K4*dByXRWpO*=L_~&pr3f3~eK( zjH;>)s?q%0ZA4Xn##SRxs%qnEWZibno;!Q56+?^m+W&xk4A``4tK*E>jF=AEc5G=+ z-++!kW1FL^jm0Kn#%gW+bC`9I_G+W5Yj$_nNhfv9KDvA1;9&QPp*8)z-Ti$7z1@rY zmiI0k>RZ)o+~^{%Z(w2n%0*>B2NC7?9n~oQRn?rHp)=h4=z=T~!uR@7sge>2u>c>eC? zy*-Pj5wmjP(Dav-I(_Ag8HdrQgG`&yE~okq{%xK-y-Vr|IUhAY>*up^H4a`U?OoD6 z)U)K@#;LnD1DJnHVq5t)t+pxptzWyB);c`L_!o1uSKEP4?dz{s-1_8x%K9e4jp^T9od0@qz=z3!7~ou`Z8SaLt>$7-*p!i`ZkM($_9f@Q01 zqj_D2sA&al2YYiGt^Hmydj>17&#uI^>g%X>2RnV%n=ijp;c@H@u_z587tm&05I~=h~|I8DuS2^e#bZ ztBz<%;P&chFgct`&5v*KeZx;~afhw_^IClW@Y7rTfbfMaeqf8YRc8TOwYOJSfd_k* z_bjEgj%#Q$m+Qdm=h9wX4|lJE6;@BwgZ z+>YuoaBqL_(%yj~j}UT=uFudV{oP&t%NF$XA3LyW*}|TozGVZ)9!0F<{+%}0=1DO3 zIC@9*GbKTk&Cfu0eFd1y$U4rw)o5Im^|9`;9&^w-*0Z#JoZ|>OtzSP* zX1M2d{2LDQkAbgV+IlY7f2*4J)c2iy0(?#3-?irPv+v-G>NRPr4ivAk4)zqdZ(!Am z{$+zNUW?<#xcGbKv)_Xz2 zGv7;Up8eEbT@KH=V&1iIZ&v%>2wKfwEX(>Afq7Pfdln9@?CEdKZyR`-Yukc%5%2SDB<8dG7&7FXm;(3HMeS;B zHEyrZjLACq*xxf`Qq5X?Ex62ged8#k zmU$Rw4zq4~^O{Cy$K4j&l1%2BL~r|xw<5-QPQ?67z~ehjOY?21 zs=3fS;MznJCOrVG3sA=}MokZLiT7Bj+|K_yGKNf6mHDkAEeCqP8Y3(bw zuh0MV-5zXTwTZ;KwyAv*YrJFQGftoVC)1|CYihij^Vzxax%Ly_Q)$)AZ!Npfs@cz6 zyVEAu9$<5+O<-)Fx6WTvv%cxH#^|%Yy=YxK`|9&r_#m2yZGi33@cLm?@Ponj+mRX$ zqqPR>aevDl>u_3QT3U;{$6MpIu6ZZikMXl;^_k0YZM(44AwK~t`_F~Dj}kv0?AnEQ zgF6b|3pS7A_0hT(UR(KK!(Fe;dpS7cyDsNp<#<=a-BYf^T3U0v4%gGl&HF)GYgRj~ z;Wspz{@cOU;dM>jcYSZM1QCo(G%DnvP_k?!hbf2|o%x?d*XQ z!%wmh9`oU+)%>Ev9j6!UTwQzp)^H)(7+U9gDOjJ^-1y6yc=g2J4<3hqPkOrso=48F ztFZ&wA>-U`{T)*!X;qEivE5h~rUJ>r~^Sz?v zzE_mo_llDHUQu%2D@yLWMag}?DEZ7`+;@%A@B2o{edh@Gn)u#Pa^F2l?)yi{eFq75 zy?hS|za8#-Nca@E?;+t{Q{O|vT`%85!mY=5k&^o^67G1ui1SM`Al}L_w!#d z@9|FTaQ3YC&EGKfBiP~YXL)@80IOX}e`6oV)VxpT(BHG?2~6Go?ti)1z2Au?*Hd6~ zS<5Wie_@^to~4O-uF-EL#j(d)E!JZ`N2CRplXBblgu+ zAJWXB9qfIto;uZ@CP#AFrltegE1sbwQ@YVrP-Tg%48 zZw%gsR^3`W57cI|+V1B~z-m5cJ27*5c5aHMFL9eSajs<i4LG1P0nv0%RSGsPVG zlVc08WBDvAb8LyG?@iQfj&We~s9UpVlv?s^4Su=jBD%iBO={w-xs02Pt}iuD z0rRcb*&OQuCf* zzV$krLw|Bi2ftj+d!g&gI`0iOkGeJcJW@-Z8DR4azvla*=}X*xP2BK(zdyRZtkD5r z`>H#h&pWl`IS8Eb%C$ZiU0>o3Y2qBOj5`!vU&cEOY+rT9^LeV4{c<>1Z4CFX_sS7q zzTCCM0PbQxdaiYX^&N?MuDpWQkLQ}cS=iB-XNlOcX3`#oc}^Lx&zOw;O0e?}?*d!P z70o?x4A>a;x#H?9;c)0%RYKNSk1X){BE%0rrtBa#;K=Y57;sFC2j#&J@c9m zR`a_0d~%=IW}GG9Fw5Jk?+{p@c?W6b$-4@?E0(di5vo8n8a|uB4UEWIy^2cOKZi;?Kg4<$KflaCPI)rIp9`MzC7mu`U9u zU5NRPHJSbwgVpVS39a0B2!A$w6L=b}y8a7j)vV_}^f!am+`EqH&xcpQ)z61tN_!=y zX8+3@Zmie*KjB_$^YzmH7iLUnlh2s9G z8XQl5YPbsQ^-KPDfUP0-nEC!2uAZ3xX=40cK=Qm3O+7Wd3(U8!!7I>-EwpMG=N_dCh#aQ}(25gLaVm=G@nudQ4?3~l*^I&7tQ~$kSbLsQU@Jzc8 ztZrR*)5@LOEPP)8`?Fnn27VFExSVre0;}DRJ<87~=KM0)zTVfJm@(#GN2|~LgXj-{ zM`JnD{N3DF;p&O`8u%+sj5*}S#Q$}$IeZs&EzI!^xO!r~2{wl@=8zkcwfh#>wbS8-fqWn9UxgASv zU+LY>axI`9A~mt)HR#^Q`+hSYPV;1=u|n{!6fXFlWiHzr_M*fd|&jZ&fg*GOI^PQmv#ODUe@_2 zoNryHeT_H9Iv=LhpIm-!{4a4mL;lKfsQc z>- zFVJ7jHEg4G-nOoB2kl7AkN)(-zzaT|g4Ii|l`lgBp( zT;|vWo*eoTw<*}Z;hTZiG4>Ml1T^Fy-R$$LJ*WIKYbWqSnxjZSq;y5;s4NL zo#lCP5L`Xy#UWs|oEOeP%{6on9f_%BPt9z!PTHIauK*u`IgbAHJqq0NwblRMk37Ct zg3G+ez}39BllNF~^6F3DW<1F7uuUSMz;0c~1f-um1F%18(`+DzA+^ zzEi+u-c#XfbFt(-4V=9C)Av>2manby8q4E*HMq?C8o1hgEO}oGPG0@#`#NyT$M0Fd z^7vj4F7uuNSL?=-w+EcOXModp0l4LBtK4()_!fc7yo=##y;$-t0Vl8i^gR>Y^0n3f z-;F%Jv%%gkK3hFY`Z2Zmmx8^2;vWF3&EX#1h;|ufA9eS?@3wEiyeHh3o)>cCl5+*v z>ztfJV72&Hg1uhxuL7%O%yYo@QO}sG!H$_TMs8ekt^s?ka*v)1HvVAD8s)|${(P|4 zAT?Z2iJS-2H#=5<-4yY`vOD)c6*#ab=Bfg{$YhdmC8I`_bpN{ojtMyLRT3=kHMO0B?$^ zTdO>^ZU>%-`PmV(R^J;ZG}!luoiOX3M4LL_2{xvz^IdTDtk1i__RY_P?*Xe_jb$$H z1=~-bbFrTHVd~Z@POVeH(=b1~VAeW?c2~^$cB9R6?;5b{5`Nt(j@Q8&uEnw*9|Ego?jHuLbz<3T9|7NhIgbAHy%DU>yv{`) z-_77M??>TkU0Cvd44l0B)A!?GedcvduB|-2 zPl27ompKzh(%yxs-H9dUZm=;~+fRel?!glC8L%}c&u77EWuDJ9G0F3JxSDz9utRpH zy%)0v$1sO6UibTG*J17juZMGy-;1S&FMyq2_U;$KYWDX&x*xND&R)4O_Mb}oB}~72 zU7Wo>0z3orvlr%G--CAV22ZEm2Xo)=OPhWE0C*IZefw3geXKt&y7R0jp*F z--0u~c^zNPxQzb@xE%j?aJBiEd3K=vJ?8lCp>nVNq48zBN8uUIJdUSkTz 0 ? sh_tile_count[th_ix - 1] : 0; - path.tiles = TileRef(alloc_start + Tile_size * tile_subix); - Path_write(path_ref, path); + if (element_ix < n_elements) { + uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; + path.tiles = TileRef(alloc_start + Tile_size * tile_subix); + Path_write(path_ref, path); + } // Zero out allocated tiles efficiently uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4); diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv index 37276476010468de9292b5620d8e2973b61fb095..81b3607b2cdbcec01bf533a669ec7006a7efaab9 100644 GIT binary patch delta 1250 zcmZ9LNl#Q!5QXaj-8h65JB$)Th$c8ADx)Bv;>e_+sMw$}Bq&B9(Nz=UFOZdqChi@g z>6Kfz{tW+v1Bub^gLjkG*QwMwr|MSK9s14CtKo*)nwI*AEVl5w^Qqx$VWd5aSYBAX zJNtNLZsAEpA?o5Z+QZVL(*4-vXaC`aIPaRJ<_7C)fMv<$3mG<%nMtx3NuZmVypd9Wy5t>H}NATrWzVVkg0ERlFZ; z&hvcO0q|B#eYKp6rywpgILa=4AcJ6acjUtv#=fCXVm6kWW8)^YGvL>G%-10|whL_p z>{{3PQskYSUAYGF9Ha&BB7^Qa-Jd=k=|!+d>WfXgy#!WIySofl>nG1Gkz_P>r<`kU~42k%>Vy0Yu2na>+RJ0$wF7AKOaI8cJSZ(QuvmgKAeQm zsINR+e!h0U{vw2I=nms(PixO=k3*ZE|A%M8lxtQSwUy-ut7}ghtjUI%js3Zvq|z$o zib=B6s^|N<2gb-_yRa7sJ%u4IFM~D-?#H{xSiHiEV6V_e6Hj*utiGN$J`GkIC(ovQ z32HI_GT7VrN`BK>U4d+i=>ngMT3m1y>})Mo0m~i3HM}Fzf9F(H{2b