From d77dfb8c004ccbc7e0ee81a0e8c2cbaf9f3ecac0 Mon Sep 17 00:00:00 2001
From: Ishi Tatsuyuki
Date: Tue, 8 Jun 2021 16:29:40 +0900
Subject: [PATCH] Runtime querying of threadgroup size

---
 piet-gpu-hal/src/dx12.rs        |  6 +++++-
 piet-gpu-hal/src/lib.rs         | 16 ++++++++++++++--
 piet-gpu-hal/src/metal.rs       |  6 +++++-
 piet-gpu-hal/src/vulkan.rs      | 11 ++++++++---
 piet-gpu/shader/backdrop.comp   |  7 ++++++-
 piet-gpu/shader/backdrop.spv    | Bin 12256 -> 12224 bytes
 piet-gpu/shader/backdrop_lg.spv | Bin 0 -> 12256 bytes
 piet-gpu/shader/build.ninja     |  3 +++
 piet-gpu/src/lib.rs             |  9 +++++++--
 9 files changed, 48 insertions(+), 10 deletions(-)
 create mode 100644 piet-gpu/shader/backdrop_lg.spv

diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index 857fe3c..bcef409 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -14,7 +14,7 @@
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 use smallvec::SmallVec;
 
-use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};
 
 use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};
 
@@ -177,6 +177,10 @@ impl Dx12Instance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            workgroup_limits: WorkgroupLimits {
+                max_size: [1024, 1024, 64],
+                max_invocations: 1024,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 0739b13..2dd0eff 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -99,6 +99,8 @@ pub struct GpuInfo {
     /// required in Vulkan 1.1), and we should have finer grained
     /// queries for shuffles, etc.
     pub has_subgroups: bool,
+    /// Limits on workgroup size for compute shaders.
+    pub workgroup_limits: WorkgroupLimits,
     /// Info about subgroup size control, if available.
     pub subgroup_size: Option<SubgroupSize>,
     /// The GPU supports a real, grown-ass memory model.
@@ -114,6 +116,16 @@ pub struct GpuInfo {
 /// available.
 #[derive(Clone, Debug)]
 pub struct SubgroupSize {
-    min: u32,
-    max: u32,
+    pub min: u32,
+    pub max: u32,
+}
+
+/// The range of workgroup sizes supported by a back-end.
+#[derive(Clone, Debug)]
+pub struct WorkgroupLimits {
+    /// The maximum size each workgroup dimension can be.
+    pub max_size: [u32; 3],
+    /// The maximum number of invocations a workgroup can have, that is,
+    /// the product of the sizes in each dimension.
+    pub max_invocations: u32,
 }
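The doc comments above state the rule for what fits in a workgroup: every dimension must be within max_size and the product of the dimensions must be within max_invocations. A minimal sketch of how a consumer of the new fields might apply that rule (the helper name and the crate-root import are illustrative assumptions, not part of this patch):

    use piet_gpu_hal::WorkgroupLimits;

    /// Returns true if the given workgroup size is within the queried limits:
    /// each dimension fits max_size and the total invocation count fits
    /// max_invocations.
    fn workgroup_fits(limits: &WorkgroupLimits, size: [u32; 3]) -> bool {
        size.iter().zip(limits.max_size.iter()).all(|(s, m)| s <= m)
            && size.iter().product::<u32>() <= limits.max_invocations
    }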
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index 9b0c2b2..69141c2 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -29,7 +29,7 @@
 use metal::{CGFloat, MTLFeatureSet};
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
-use crate::{BufferUsage, Error, GpuInfo};
+use crate::{BufferUsage, Error, GpuInfo, WorkgroupLimits};
 
 use util::*;
 
@@ -164,6 +164,10 @@ impl MtlInstance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            workgroup_limits: WorkgroupLimits {
+                max_size: [512, 512, 512],
+                max_invocations: 512,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 3eee69a..9111900 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -12,9 +12,7 @@
 use ash::{vk, Device, Entry, Instance};
 use smallvec::SmallVec;
 
-use crate::{
-    BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
-};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, WorkgroupLimits};
 
 use crate::backend::Device as DeviceTrait;
 
@@ -357,10 +355,17 @@ impl VkInstance {
         // TODO: finer grained query of specific subgroup info.
         let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
+
+        let workgroup_limits = WorkgroupLimits {
+            max_invocations: props.limits.max_compute_work_group_invocations,
+            max_size: props.limits.max_compute_work_group_size,
+        };
+
         let gpu_info = GpuInfo {
             has_descriptor_indexing,
             has_subgroups,
             subgroup_size,
+            workgroup_limits,
             has_memory_model,
             use_staging_buffers,
         };
 
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 99cbf4f..247bbdf 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -21,11 +21,16 @@
 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
 
+#ifndef BACKDROP_DIST_FACTOR
 // Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how many
 // additional threads we should spawn for parallel row processing. The additional threads do not participate in the
 // earlier stages (calculating the tile counts) but do work in the final prefix sum stage, which has a lot more
 // parallelism.
+
+// This feature is opt-in: one variant is compiled with the default below, while another variant is compiled with
+// a larger BACKDROP_DIST_FACTOR and is used on GPUs that support larger workgroups, to improve performance.
+#define BACKDROP_DIST_FACTOR 1
+#endif
 
 layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;
 
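To make the numbers concrete: BACKDROP_WG is 1 << (7 + LG_WG_FACTOR), so assuming LG_WG_FACTOR is 1 (that constant lives in setup.h and is not shown in this patch), the default variant uses 256 x 1 = 256 invocations per workgroup, while the backdrop_lg variant built with -DBACKDROP_DIST_FACTOR=4 uses 256 x 4 = 1024. That is why the renderer change further down only selects the large variant when max_invocations is at least 1024. A small worked sketch of the arithmetic:

    // Assumes LG_WG_FACTOR = 1; adjust if setup.h uses a different value.
    const LG_WG_FACTOR: u32 = 1;
    const BACKDROP_WG: u32 = 1 << (7 + LG_WG_FACTOR); // local_size_x = 256

    fn main() {
        let small = BACKDROP_WG * 1; // default BACKDROP_DIST_FACTOR
        let large = BACKDROP_WG * 4; // backdrop_lg variant
        assert_eq!(small, 256);
        assert_eq!(large, 1024); // requires max_invocations >= 1024
    }

Under these assumptions, the Metal baseline of 512 invocations reported above would fall back to the small variant, while the DX12 and most Vulkan devices would take the large one.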
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 794165472ee1c4e5f27c20b55f14169e1188e045..6458f4bf2a338e54838dc375dc576c83c73e91e9 100644
GIT binary patch
delta 101
[base85-encoded binary delta omitted]
delta 133
[base85-encoded binary delta omitted]

diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv
new file mode 100644
GIT binary patch
literal 12256
[base85-encoded binary literal omitted]

literal 0
HcmV?d00001
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 22c9c78..b73da2e 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -18,6 +18,9 @@ build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
 
 build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
 
+build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+  flags = -DBACKDROP_DIST_FACTOR=4
+
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index ef70c9c..971b517 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -311,8 +311,13 @@ impl Renderer {
         let path_ds = session
             .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
 
-        let backdrop_alloc_code = ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"));
-        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
+        let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
+            ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
+        } else {
+            println!("using small workgroup backdrop kernel");
+            ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
+        };
+        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_code, 2)?;
         let backdrop_ds = session
             .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
 
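As a usage note, the queried limits are also handy for diagnosing which variant a given device will get. A hypothetical debug helper (not part of this patch) built on the same gpu_info() data:

    use piet_gpu_hal::GpuInfo;

    /// Hypothetical helper: report the queried limits and the backdrop kernel
    /// the renderer above would select for them.
    fn log_workgroup_limits(info: &GpuInfo) {
        let wl = &info.workgroup_limits;
        let variant = if wl.max_invocations >= 1024 { "large" } else { "small" };
        println!(
            "max workgroup size {:?}, max invocations {}: {} backdrop kernel",
            wl.max_size, wl.max_invocations, variant
        );
    }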