From d94257a7c59baff49ee69ffdaad4103f1b67e973 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 5 Jan 2023 14:22:14 -0800 Subject: [PATCH] Support for larger pathtags Previously there was a limit of 256k pathtags in a scene, due to the need for multi-dispatch prefix sum for the pathtag monoid. This patch increases the limit to 64M, which ought to be enough for most applications. It works by having 4 dispatches for the pathtag prefix sum: 2 to reduce, then 2 to scan. --- shader/pathtag_reduce2.wgsl | 40 +++++++++++++++++++++++ shader/pathtag_scan.wgsl | 16 ++++++--- shader/pathtag_scan1.wgsl | 65 +++++++++++++++++++++++++++++++++++++ src/render.rs | 34 +++++++++++++++++-- src/shaders.rs | 33 +++++++++++++++++++ 5 files changed, 181 insertions(+), 7 deletions(-) create mode 100644 shader/pathtag_reduce2.wgsl create mode 100644 shader/pathtag_scan1.wgsl diff --git a/shader/pathtag_reduce2.wgsl b/shader/pathtag_reduce2.wgsl new file mode 100644 index 0000000..59122be --- /dev/null +++ b/shader/pathtag_reduce2.wgsl @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// This shader is the second stage of reduction for the pathtag +// monoid scan, needed when the number of tags is large. + +#import config +#import pathtag + +@group(0) @binding(0) +var reduced_in: array; + +@group(0) @binding(1) +var reduced: array; + +let LG_WG_SIZE = 8u; +let WG_SIZE = 256u; + +var sh_scratch: array; + +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, + @builtin(local_invocation_id) local_id: vec3, +) { + let ix = global_id.x; + var agg = reduced_in[ix]; + sh_scratch[local_id.x] = agg; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + workgroupBarrier(); + if local_id.x + (1u << i) < WG_SIZE { + let other = sh_scratch[local_id.x + (1u << i)]; + agg = combine_tag_monoid(agg, other); + } + workgroupBarrier(); + sh_scratch[local_id.x] = agg; + } + if local_id.x == 0u { + reduced[ix >> LG_WG_SIZE] = agg; + } +} diff --git a/shader/pathtag_scan.wgsl b/shader/pathtag_scan.wgsl index f8a8005..009e3b4 100644 --- a/shader/pathtag_scan.wgsl +++ b/shader/pathtag_scan.wgsl @@ -18,7 +18,9 @@ var tag_monoids: array; let LG_WG_SIZE = 8u; let WG_SIZE = 256u; +#ifdef small var sh_parent: array; +#endif // These could be combined? var sh_monoid: array; @@ -28,6 +30,7 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { +#ifdef small var agg = tag_monoid_identity(); if local_id.x < wg_id.x { agg = reduced[local_id.x]; @@ -42,22 +45,27 @@ fn main( workgroupBarrier(); sh_parent[local_id.x] = agg; } +#endif let ix = global_id.x; let tag_word = scene[config.pathtag_base + ix]; - agg = reduce_tag(tag_word); - sh_monoid[local_id.x] = agg; + var agg_part = reduce_tag(tag_word); + sh_monoid[local_id.x] = agg_part; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); if local_id.x >= 1u << i { let other = sh_monoid[local_id.x - (1u << i)]; - agg = combine_tag_monoid(other, agg); + agg_part = combine_tag_monoid(other, agg_part); } workgroupBarrier(); - sh_monoid[local_id.x] = agg; + sh_monoid[local_id.x] = agg_part; } // prefix up to this workgroup +#ifdef small var tm = sh_parent[0]; +#else + var tm = reduced[wg_id.x]; +#endif if local_id.x > 0u { tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]); } diff --git a/shader/pathtag_scan1.wgsl b/shader/pathtag_scan1.wgsl new file mode 100644 index 0000000..1d36ab5 --- /dev/null +++ b/shader/pathtag_scan1.wgsl @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// This shader computes the scan of reduced tag monoids given +// two levels of reduction. + +#import config +#import pathtag + +@group(0) @binding(0) +var reduced: array; + +@group(0) @binding(1) +var reduced2: array; + +@group(0) @binding(2) +var tag_monoids: array; + +let LG_WG_SIZE = 8u; +let WG_SIZE = 256u; + +var sh_parent: array; +// These could be combined? +var sh_monoid: array; + +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) wg_id: vec3, +) { + var agg = tag_monoid_identity(); + if local_id.x < wg_id.x { + agg = reduced2[local_id.x]; + } + sh_parent[local_id.x] = agg; + for (var i = 0u; i < LG_WG_SIZE; i += 1u) { + workgroupBarrier(); + if local_id.x + (1u << i) < WG_SIZE { + let other = sh_parent[local_id.x + (1u << i)]; + agg = combine_tag_monoid(agg, other); + } + workgroupBarrier(); + sh_parent[local_id.x] = agg; + } + + let ix = global_id.x; + agg = reduced[ix]; + sh_monoid[local_id.x] = agg; + for (var i = 0u; i < LG_WG_SIZE; i += 1u) { + workgroupBarrier(); + if local_id.x >= 1u << i { + let other = sh_monoid[local_id.x - (1u << i)]; + agg = combine_tag_monoid(other, agg); + } + workgroupBarrier(); + sh_monoid[local_id.x] = agg; + } + // prefix up to this workgroup + var tm = sh_parent[0]; + if local_id.x > 0u { + tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]); + } + // exclusive prefix sum, granularity of 4 tag bytes * workgroup size + tag_monoids[ix] = tm; +} diff --git a/src/render.rs b/src/render.rs index 97fd53c..eee3d54 100644 --- a/src/render.rs +++ b/src/render.rs @@ -236,21 +236,49 @@ pub fn render_full( let config_buf = ResourceProxy::Buf(recording.upload_uniform(bytemuck::bytes_of(&config))); let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize); - let reduced_buf = ResourceProxy::new_buf(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE); + let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize; + let reduced_size = if pathtag_large { + align_up(pathtag_wgs, shaders::PATHTAG_REDUCE_WG) + } else { + pathtag_wgs + }; + let reduced_buf = ResourceProxy::new_buf(reduced_size as u64 * TAG_MONOID_FULL_SIZE); // TODO: really only need pathtag_wgs - 1 recording.dispatch( shaders.pathtag_reduce, (pathtag_wgs as u32, 1, 1), [config_buf, scene_buf, reduced_buf], ); + let mut pathtag_parent = reduced_buf; + if pathtag_large { + let reduced2_size = shaders::PATHTAG_REDUCE_WG as usize; + let reduced2_buf = ResourceProxy::new_buf(reduced2_size as u64 * TAG_MONOID_FULL_SIZE); + recording.dispatch( + shaders.pathtag_reduce2, + (reduced2_size as u32, 1, 1), + [reduced_buf, reduced2_buf], + ); + let reduced_scan_buf = ResourceProxy::new_buf(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE); + recording.dispatch( + shaders.pathtag_scan1, + (reduced_size as u32 / shaders::PATHTAG_REDUCE_WG, 1, 1), + [reduced_buf, reduced2_buf, reduced_scan_buf], + ); + pathtag_parent = reduced_scan_buf; + } let tagmonoid_buf = ResourceProxy::new_buf( pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE, ); + let pathtag_scan = if pathtag_large { + shaders.pathtag_scan_large + } else { + shaders.pathtag_scan + }; recording.dispatch( - shaders.pathtag_scan, + pathtag_scan, (pathtag_wgs as u32, 1, 1), - [config_buf, scene_buf, reduced_buf, tagmonoid_buf], + [config_buf, scene_buf, pathtag_parent, tagmonoid_buf], ); let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG; let path_bbox_buf = ResourceProxy::new_buf(n_path as u64 * PATH_BBOX_SIZE); diff --git a/src/shaders.rs b/src/shaders.rs index 1df6b14..a485cb4 100644 --- a/src/shaders.rs +++ b/src/shaders.rs @@ -47,7 +47,10 @@ pub struct Shaders { // Shaders for the full pipeline pub struct FullShaders { pub pathtag_reduce: ShaderId, + pub pathtag_reduce2: ShaderId, + pub pathtag_scan1: ShaderId, pub pathtag_scan: ShaderId, + pub pathtag_scan_large: ShaderId, pub bbox_clear: ShaderId, pub pathseg: ShaderId, pub draw_reduce: ShaderId, @@ -129,12 +132,39 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result Result