Support for larger pathtags

Previously there was a limit of 256k pathtags in a scene, due to the need for multi-dispatch prefix sum for the pathtag monoid. This patch increases the limit to 64M, which ought to be enough for most applications.

It works by having 4 dispatches for the pathtag prefix sum: 2 to reduce, then 2 to scan.
This commit is contained in:
Raph Levien 2023-01-05 14:22:14 -08:00
parent 57d79bdf1f
commit d94257a7c5
5 changed files with 181 additions and 7 deletions

View file

@ -0,0 +1,40 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// This shader is the second stage of reduction for the pathtag
// monoid scan, needed when the number of tags is large.
#import config
#import pathtag
@group(0) @binding(0)
var<storage> reduced_in: array<TagMonoid>;
@group(0) @binding(1)
var<storage, read_write> reduced: array<TagMonoid>;
let LG_WG_SIZE = 8u;
let WG_SIZE = 256u;
var<workgroup> sh_scratch: array<TagMonoid, WG_SIZE>;
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
) {
let ix = global_id.x;
var agg = reduced_in[ix];
sh_scratch[local_id.x] = agg;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_scratch[local_id.x + (1u << i)];
agg = combine_tag_monoid(agg, other);
}
workgroupBarrier();
sh_scratch[local_id.x] = agg;
}
if local_id.x == 0u {
reduced[ix >> LG_WG_SIZE] = agg;
}
}

View file

@ -18,7 +18,9 @@ var<storage, read_write> tag_monoids: array<TagMonoid>;
let LG_WG_SIZE = 8u;
let WG_SIZE = 256u;
#ifdef small
var<workgroup> sh_parent: array<TagMonoid, WG_SIZE>;
#endif
// These could be combined?
var<workgroup> sh_monoid: array<TagMonoid, WG_SIZE>;
@ -28,6 +30,7 @@ fn main(
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
#ifdef small
var agg = tag_monoid_identity();
if local_id.x < wg_id.x {
agg = reduced[local_id.x];
@ -42,22 +45,27 @@ fn main(
workgroupBarrier();
sh_parent[local_id.x] = agg;
}
#endif
let ix = global_id.x;
let tag_word = scene[config.pathtag_base + ix];
agg = reduce_tag(tag_word);
sh_monoid[local_id.x] = agg;
var agg_part = reduce_tag(tag_word);
sh_monoid[local_id.x] = agg_part;
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
workgroupBarrier();
if local_id.x >= 1u << i {
let other = sh_monoid[local_id.x - (1u << i)];
agg = combine_tag_monoid(other, agg);
agg_part = combine_tag_monoid(other, agg_part);
}
workgroupBarrier();
sh_monoid[local_id.x] = agg;
sh_monoid[local_id.x] = agg_part;
}
// prefix up to this workgroup
#ifdef small
var tm = sh_parent[0];
#else
var tm = reduced[wg_id.x];
#endif
if local_id.x > 0u {
tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
}

65
shader/pathtag_scan1.wgsl Normal file
View file

@ -0,0 +1,65 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// This shader computes the scan of reduced tag monoids given
// two levels of reduction.
#import config
#import pathtag
@group(0) @binding(0)
var<storage> reduced: array<TagMonoid>;
@group(0) @binding(1)
var<storage> reduced2: array<TagMonoid>;
@group(0) @binding(2)
var<storage, read_write> tag_monoids: array<TagMonoid>;
let LG_WG_SIZE = 8u;
let WG_SIZE = 256u;
var<workgroup> sh_parent: array<TagMonoid, WG_SIZE>;
// These could be combined?
var<workgroup> sh_monoid: array<TagMonoid, WG_SIZE>;
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
var agg = tag_monoid_identity();
if local_id.x < wg_id.x {
agg = reduced2[local_id.x];
}
sh_parent[local_id.x] = agg;
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
workgroupBarrier();
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_parent[local_id.x + (1u << i)];
agg = combine_tag_monoid(agg, other);
}
workgroupBarrier();
sh_parent[local_id.x] = agg;
}
let ix = global_id.x;
agg = reduced[ix];
sh_monoid[local_id.x] = agg;
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
workgroupBarrier();
if local_id.x >= 1u << i {
let other = sh_monoid[local_id.x - (1u << i)];
agg = combine_tag_monoid(other, agg);
}
workgroupBarrier();
sh_monoid[local_id.x] = agg;
}
// prefix up to this workgroup
var tm = sh_parent[0];
if local_id.x > 0u {
tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
}
// exclusive prefix sum, granularity of 4 tag bytes * workgroup size
tag_monoids[ix] = tm;
}

View file

@ -236,21 +236,49 @@ pub fn render_full(
let config_buf = ResourceProxy::Buf(recording.upload_uniform(bytemuck::bytes_of(&config)));
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
let reduced_buf = ResourceProxy::new_buf(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize;
let reduced_size = if pathtag_large {
align_up(pathtag_wgs, shaders::PATHTAG_REDUCE_WG)
} else {
pathtag_wgs
};
let reduced_buf = ResourceProxy::new_buf(reduced_size as u64 * TAG_MONOID_FULL_SIZE);
// TODO: really only need pathtag_wgs - 1
recording.dispatch(
shaders.pathtag_reduce,
(pathtag_wgs as u32, 1, 1),
[config_buf, scene_buf, reduced_buf],
);
let mut pathtag_parent = reduced_buf;
if pathtag_large {
let reduced2_size = shaders::PATHTAG_REDUCE_WG as usize;
let reduced2_buf = ResourceProxy::new_buf(reduced2_size as u64 * TAG_MONOID_FULL_SIZE);
recording.dispatch(
shaders.pathtag_reduce2,
(reduced2_size as u32, 1, 1),
[reduced_buf, reduced2_buf],
);
let reduced_scan_buf = ResourceProxy::new_buf(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
recording.dispatch(
shaders.pathtag_scan1,
(reduced_size as u32 / shaders::PATHTAG_REDUCE_WG, 1, 1),
[reduced_buf, reduced2_buf, reduced_scan_buf],
);
pathtag_parent = reduced_scan_buf;
}
let tagmonoid_buf = ResourceProxy::new_buf(
pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE,
);
let pathtag_scan = if pathtag_large {
shaders.pathtag_scan_large
} else {
shaders.pathtag_scan
};
recording.dispatch(
shaders.pathtag_scan,
pathtag_scan,
(pathtag_wgs as u32, 1, 1),
[config_buf, scene_buf, reduced_buf, tagmonoid_buf],
[config_buf, scene_buf, pathtag_parent, tagmonoid_buf],
);
let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
let path_bbox_buf = ResourceProxy::new_buf(n_path as u64 * PATH_BBOX_SIZE);

View file

@ -47,7 +47,10 @@ pub struct Shaders {
// Shaders for the full pipeline
pub struct FullShaders {
pub pathtag_reduce: ShaderId,
pub pathtag_reduce2: ShaderId,
pub pathtag_scan1: ShaderId,
pub pathtag_scan: ShaderId,
pub pathtag_scan_large: ShaderId,
pub bbox_clear: ShaderId,
pub pathseg: ShaderId,
pub draw_reduce: ShaderId,
@ -129,12 +132,39 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
let empty = HashSet::new();
let mut full_config = HashSet::new();
full_config.insert("full".into());
let mut small_config = HashSet::new();
small_config.insert("full".into());
small_config.insert("small".into());
let pathtag_reduce = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_reduce"), &full_config, &imports).into(),
&[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
)?;
let pathtag_reduce2 = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_reduce2"), &full_config, &imports).into(),
&[BindType::BufReadOnly, BindType::Buffer],
)?;
let pathtag_scan1 = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_scan1"), &full_config, &imports).into(),
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
)?;
let pathtag_scan = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_scan"), &small_config, &imports).into(),
&[
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
)?;
let pathtag_scan_large = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_scan"), &full_config, &imports).into(),
&[
@ -278,7 +308,10 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
)?;
Ok(FullShaders {
pathtag_reduce,
pathtag_reduce2,
pathtag_scan,
pathtag_scan1,
pathtag_scan_large,
bbox_clear,
pathseg,
draw_reduce,