From 9a0b17ff5b8e7df518e22707f205a7661c917955 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 12 May 2020 21:26:44 -0700 Subject: [PATCH] Use different output strategy for binning Iterate over bin bounding box. Seems to work, and is a dramatic improvement. --- piet-gpu/shader/binning.comp | 111 +++++++++++++++++++---------------- piet-gpu/shader/binning.spv | Bin 16068 -> 17052 bytes 2 files changed, 59 insertions(+), 52 deletions(-) diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index 241d637..6e252c0 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -45,7 +45,11 @@ layout(set = 0, binding = 2) buffer BinsBuf { // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. shared uint bitmaps[N_SLICE][N_TILE]; +shared uint count[N_SLICE][N_TILE]; shared uint sh_my_tile; +shared uint sh_chunk_start[N_TILE]; +shared uint sh_chunk_end[N_TILE]; +shared uint sh_chunk_jump[N_TILE]; void main() { BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); @@ -120,70 +124,73 @@ void main() { uint element_count = 0; for (uint i = 0; i < N_SLICE; i++) { element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); + count[i][gl_LocalInvocationID.x] = element_count; } // element_count is number of elements covering bin for this invocation. - if (element_count == 0) { - continue; - } - uint chunk_end; - uint chunk_new_start; - // Refactor to reduce code duplication? - if (chunk_n > 0) { - uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4; - if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) { - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4); + if (element_count != 0) { + uint chunk_end; + uint chunk_new_start; + // Refactor to reduce code duplication? + if (chunk_n > 0) { + uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4; + if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) { + uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4); + if (alloc_amount - BIN_ALLOC < 64) { + alloc_amount = BIN_ALLOC; + } + next_chunk = atomicAdd(alloc, alloc_amount); + wr_limit = next_chunk + alloc_amount; + } + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk))); + chunk_ref = BinChunkRef(next_chunk); + } + BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); + if (instance_ref.offset + element_count * 4 > wr_limit) { + chunk_end = wr_limit; + chunk_n = (wr_limit - instance_ref.offset) / 4; + uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4); if (alloc_amount - BIN_ALLOC < 64) { alloc_amount = BIN_ALLOC; } - next_chunk = atomicAdd(alloc, alloc_amount); - wr_limit = next_chunk + alloc_amount; + chunk_new_start = atomicAdd(alloc, alloc_amount); + wr_limit = chunk_new_start + alloc_amount; + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start))); + chunk_ref = BinChunkRef(chunk_new_start); + chunk_new_start += BinChunk_size; + chunk_n = element_count - chunk_n; + } else { + chunk_end = ~0; + chunk_n = element_count; } - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk))); - chunk_ref = BinChunkRef(next_chunk); - } - BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); - if (instance_ref.offset + element_count * 4 > wr_limit) { - chunk_end = wr_limit; - chunk_n = (wr_limit - instance_ref.offset) / 4; - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4); - if (alloc_amount - BIN_ALLOC < 64) { - alloc_amount = BIN_ALLOC; - } - chunk_new_start = atomicAdd(alloc, alloc_amount); - wr_limit = chunk_new_start + alloc_amount; - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start))); - chunk_ref = BinChunkRef(chunk_new_start); - chunk_new_start += BinChunk_size; - chunk_n = element_count - chunk_n; - } else { - chunk_end = ~0; - chunk_n = element_count; + sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset; + sh_chunk_end[gl_LocalInvocationID.x] = chunk_end; + sh_chunk_jump[gl_LocalInvocationID.x] = chunk_new_start - chunk_end; } - // Iterate over bits set. - uint slice_ix = 0; - uint bitmap = bitmaps[0][gl_LocalInvocationID.x]; - while (true) { - if (bitmap == 0) { - slice_ix++; - if (slice_ix == N_SLICE) { - break; + barrier(); + // Use similar strategy as Laine & Karras paper; loop over bbox of bins + // touched by this element + x = x0; + y = y0; + while (y < y1) { + uint bin_ix = y * N_TILE_X + x; + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0) { + uint idx = bitCount(out_mask & (my_mask - 1)); + if (my_slice > 0) { + idx += count[my_slice - 1][bin_ix]; } - bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x]; - if (bitmap == 0) { - continue; + uint out_offset = sh_chunk_start[bin_ix] + idx * 4; + if (out_offset >= sh_chunk_end[bin_ix]) { + out_offset += sh_chunk_jump[bin_ix]; } + BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix)); } - element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap); - // At this point, element_ix refers to an element that covers this bin. - - if (instance_ref.offset == chunk_end) { - instance_ref.offset = chunk_new_start; + x++; + if (x == x1) { + x = x0; + y++; } - BinInstance_write(instance_ref, BinInstance(element_ix)); - instance_ref.offset += BinInstance_size; - // clear LSB - bitmap &= bitmap - 1; } } BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0))); diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index 52e04b35f6d4dd38ac7849309a3663f1f7b9e570..fa33483768f6342f2191a639082cbad3b311ff96 100644 GIT binary patch delta 4984 zcmZ9P`=3@-6~^}r!|O;8!X$`PB63k+63|p6lu=WQ49dvV^ksyZ7;WCcxfm68ya8(E zrMFVd@Rk~422I)B3(}{b`~m+1KVB#-vo!IN+4pqqm4AQQIqs6y*)8dXrj%Cq4b=uyN~8JrI=t4kez3ZF?NIFswMrUY zzqWPqqAATOt?TUX?9Dw}($s@^B^?FtsSONv)>d~_`@6bv%Okee_q4V(ok)zsM^)0& z`g^UDvt$nIf6emg6|r> z>Cls#nsyIAJ!Wr9wm7w<85Gq*w~%oze=GF=fge%g7ZBxro@v-G1nWB;AIHlriCS<= zQRbFFK99f0(?0<}JEbuQmGmOQ?&0Uh4>x6QFsC4oa|-=v{DRMiyUoED!rkWJv*B)Y zaL2cg;3tmYOG}*iS#s|qfC=3D;0AQh%G^CGbGNI^-Ll}b8Si!lKMwA81)rbVQgXjS za00uAH%{C>VagH;=f=E9fu5sva3J==U#j0Xscq5AC@;~%YcF_D?o~-+=>I}asCDNF^;x>($MDQfg~>9?RW$H0{voI1phbM z8})57kIMFgOmjoAu(#y2Dq1Y|ZTOosZ7$Xms20)Q0jq`oyI{3xwD5TkY*`}Df5B*3 zo@gxF>CU~6$bV@T&FiI81f@L5!1_ecCbZY_jT$w>^;5UjEpi@fq-_*U&033>q^xmd zDvh0Wg0KbNuN>l5a-Wrv}deqT}N3E=Ux^YHLIX^)N7 z7So;xc4LFL!HpAp@*%K3>iKwLsX3ddGkta-z~LA@Z4q!1+ypCFqVK%PaJ3aoe*ydm zuw&F6qtB6G&$9`A*U81c=k#C1QKCy9=Fds=Q<}5JOE8jt1VcILN8xG)F|gbfc*Cr< zeO#t#*hXHaqtVo(D$~KaRxE55`O`6o8cgCFSRO%VfYr)3bS7LqCOr#WKB2SWYULAp zEZlM0V%%}ytb9l2An2&UQ#Oy@$5W3XI!-`|&>si8?%*fE^@%7a!}U>*cgrbY^>S?%!KV>N zTio7L!ANm?v!s)g@F!4XBwY+vi=>|bt9kR{_AY^|$L&2G>|-tAQj|8v09 z`v2GWTm%Egb^R1rEw1aQ!46V4h}ZQq;JB{Kz;cuM)u4Y|*Uy5zuKH+;Ni6sJmXof) zxSVDX1Ixqtb6|aAg`Wqjx${vWweUY5Ts~c2fR|6#O8EI}_@lu98Wnv%cyrt^Z;Li> z#Raqo{w1)%W1PG^&hc)v-zXQ?C$zV%{4&bLh3ZfDCE(IM%CFI-XzGsiyQK=gh~`)` z$@PiIoks4s45gD6b6X8|z7fSWuA=E@6uCYTrQ0YyUqR`nMU=JRr6t}2Hfr3H%fb4n zN2V*lT{L5vNv@xjiSM9S>z$J(XQxtuuVU#aYWOg{-0a$V!Dke6>hle_e(D>TbcOusn-C4w=_-1;N&Hs28oa#-DE}5*pAM{^iwr!IUO#Oy z?i#Q*V{V|AhwWOhTF%Lzu7m6#$e)>|Nv?igSZO@Xx(tQW4-Y8p2-vz6=!p-#ZNO}v{s>K)M zt#GxN{Ow>LCZ8pZx6u@xy*L7W4{V^AZ5^)WWW(qC@bZ@Y0Pg={=&LRKehAhU{0{JD zUwiQyyAwfC$5@q`r#U8P9|7(HHxf7A5kG?aj!+MuyTR(ge+;%-@gus`L!ww^MKO_k z;11kMb0S;l)gs8fV71`)fy)!QAHIsF9zG9%)gLQv+E2j7$uxMN9-{a0KxurCrskH2 zBhbU(aVx;8Cyv{7Ia}B!3203;Sc> zdF;9NW3m4nu4eUi(CaVWNv|z_4u9ePw=+N|jZR<_y{oo2fv0~MTz--K60R2iXR1vt z{C`#W2Y(!XCO(f8<^MHYpU2&Q%l}(?AIq=tH#9XTB91`61D9Xb^056L>>DxOc7K4Y z#U!<mNmb2JAP8KD&ljF6&%RmZv;G4M6w46>8CCqgo4etV1D(Cr+Pimm@9sY5e)xRh>n+18E5-~-sgP>< z_raILm)15`r{4=H^>l6Q>sY>Web1_ts#8r$#kXo3+s~;;XTEjlqFl_lTa;#W-}V11~4mUR9{ zQ?y3nNL1h$2#$X{KN5rMG1T+AA9HO)?6 zly`UtTzx;G&VDG^$CIP4mZs+9#W9dMJVoKty7}|yI}CgrJG;2JwJxiJIg6jm^{xjS zaT~JdN`LI4zL@@Susa+447hn>Z_Wf8qi%oqM9o=6xv|;F1S4>Iwj$wKa0gh(Qhhz0 z4Od&pGA6+PG8~;4>zFsV+oQkE&b*R`bxfux8J|#Rxha#J8n9l3oH<%O@QTSC2_w3eKO@ z%iwDHlR5@&KYh{fa&VTvD@_On>hQFU#rk;K^flA&qO0EU<0`Pat8yUEidqab0c;h5PlOv2IVZu5QI8kSWcOc3 zzEoGk-Ji+WxYSd?NGC6~F;2pdm%bQuDp)NBy#}n7zthv;>ZLn9k0jF}K9*T$J59|p zi({a;)Zx>{rJh;5v$Y|ch2rpWnP-D%(&94fH=EyBeKGB8!D_*;1Dhv~?HsT%>h^cD z)gu0SFtt8)nQuTaVO-uD!D?}NZvxv%-6US#o568;=Yr)9=J($CxV-bhUS4DL#UvKE znfXC)!MT8D5);cK_*Ss7vBKNHYHoZ~NG;-T2j@@M9q|0=S_r@0v#-MhIu(6ec#qsn z?~cBf;%9A*6C>*GK~YBQ^LLGJV7GW5_TFM)d_#60>bJ&;x8eh_RN{jw5?K{^mk9zS#*2CF%3Um(lC zJ|@!FNmFwI;>fcc>~do_9s#SxPb+GH3t^E z((;N*dQ9XoeCCasuPW7iHOi`zI#Ba9U~6u)zUbWp))zIGhi@%7U#fL*wJ23DIA*Nx zahh7G<`SpuM!5tX@~EXY2I>Q!$>ic4w;rzM+aqG0guBd$*#K7ioS4{yjd1&J@LEO3 zrx094J*RIv)7=E7Pnu8Dx0#Ku0Y44a7s)n*O{i{lTIr+~Uxh8;QMCML?islHtgM_g zgLuVa)3<^RQg_q0V|_er`nJ*3te7|!@hsT8AA>&!j=`Or+!!lt|I7q}Nbo$^75H{* zVZa@5HT})-7r`bp*$Y^?_$92d@g{p2+(x`H`eH(_fMY`B9Qa1Qil`$>`WjfxPgit& z9UdJWP;OjI{0*?%7oVp$!D@aVBId1f%s1ul?%NfUQ&Ke8q$YTqmOqdlO!)&?tF@%~ z@sx3ki?Gsru>6zl9k|*~n#1|le-|8Y{as+WqwK~SAGLZ9>|5WM-TiG-*Vg|JwivP3