diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl index 5943e51..e1e88d7 100644 --- a/piet-wgsl/shader/coarse.wgsl +++ b/piet-wgsl/shader/coarse.wgsl @@ -99,7 +99,6 @@ fn alloc_cmd(size: u32) { fn write_path(tile: Tile, linewidth: f32) { // TODO: take flags - // TODO: handle stroke alloc_cmd(3u); if linewidth < 0.0 { if tile.segments != 0u { diff --git a/piet-wgsl/shader/draw_leaf.wgsl b/piet-wgsl/shader/draw_leaf.wgsl index 14a1163..5909fdd 100644 --- a/piet-wgsl/shader/draw_leaf.wgsl +++ b/piet-wgsl/shader/draw_leaf.wgsl @@ -68,8 +68,28 @@ fn main( @builtin(workgroup_id) wg_id: vec3, ) { let ix = global_id.x; + // Reduce prefix of workgroups up to this one + var agg = draw_monoid_identity(); + if local_id.x < wg_id.x { + agg = reduced[local_id.x]; + } + sh_scratch[local_id.x] = agg; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + workgroupBarrier(); + if local_id.x + (1u << i) < WG_SIZE { + let other = sh_scratch[local_id.x + (1u << i)]; + agg = combine_draw_monoid(agg, other); + } + workgroupBarrier(); + sh_scratch[local_id.x] = agg; + } + // Two barriers can be eliminated if we use separate shared arrays + // for prefix and intra-workgroup prefix sum. + workgroupBarrier(); + var m = sh_scratch[0]; + workgroupBarrier(); let tag_word = scene[config.drawtag_base + ix]; - var agg = map_draw_tag(tag_word); + agg = map_draw_tag(tag_word); sh_scratch[local_id.x] = agg; for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { workgroupBarrier(); @@ -81,12 +101,6 @@ fn main( sh_scratch[local_id.x] = agg; } workgroupBarrier(); - var m = draw_monoid_identity(); - if wg_id.x > 0u { - // TODO: separate dispatch to scan these, or integrate into this one? - // In the meantime, will be limited to 2 * WG draw objs. - m = reduced[wg_id.x - 1u]; - } if local_id.x > 0u { m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]); } diff --git a/piet-wgsl/shader/pathtag_scan.wgsl b/piet-wgsl/shader/pathtag_scan.wgsl index d18d872..fe87750 100644 --- a/piet-wgsl/shader/pathtag_scan.wgsl +++ b/piet-wgsl/shader/pathtag_scan.wgsl @@ -43,13 +43,13 @@ fn main( @builtin(workgroup_id) wg_id: vec3, ) { var agg = tag_monoid_identity(); - if (local_id.x < wg_id.x) { + if local_id.x < wg_id.x { agg = reduced[local_id.x]; } sh_parent[local_id.x] = agg; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); - if (local_id.x + (1u << i) < WG_SIZE) { + if local_id.x + (1u << i) < WG_SIZE { let other = sh_parent[local_id.x + (1u << i)]; agg = combine_tag_monoid(agg, other); } @@ -63,7 +63,7 @@ fn main( sh_monoid[local_id.x] = agg; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); - if (local_id.x >= 1u << i) { + if local_id.x >= 1u << i { let other = sh_monoid[local_id.x - (1u << i)]; agg = combine_tag_monoid(other, agg); } @@ -72,7 +72,7 @@ fn main( } // prefix up to this workgroup var tm = sh_parent[0]; - if (local_id.x > 0u) { + if local_id.x > 0u { tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]); } // exclusive prefix sum, granularity of 4 tag bytes diff --git a/piet-wgsl/shader/tile_alloc.wgsl b/piet-wgsl/shader/tile_alloc.wgsl index 1c27c83..56771d6 100644 --- a/piet-wgsl/shader/tile_alloc.wgsl +++ b/piet-wgsl/shader/tile_alloc.wgsl @@ -85,14 +85,14 @@ fn main( workgroupBarrier(); sh_tile_count[local_id.x] = total_tile_count; } - workgroupBarrier(); - // should be able to avoid a barrier by adding total_tile count from - // thread WG_SIZE - 1, but it doesn't work - if local_id.x == 0u { - sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]); + if local_id.x == WG_SIZE - 1u { + paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]); } - workgroupBarrier(); - let tile_offset = sh_tile_offset; + // Using storage barriers is a workaround for what appears to be a miscompilation + // when a normal workgroup-shared variable is used to broadcast the value. + storageBarrier(); + let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles; + storageBarrier(); if drawobj_ix < config.n_drawobj { let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u); let bbox = vec4(ux0, uy0, ux1, uy1); diff --git a/piet-wgsl/src/main.rs b/piet-wgsl/src/main.rs index 8b00a26..38645db 100644 --- a/piet-wgsl/src/main.rs +++ b/piet-wgsl/src/main.rs @@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter}; use engine::Engine; -use test_scene::dump_scene_info; use wgpu::{Device, Limits, Queue}; mod engine; @@ -70,10 +69,11 @@ async fn do_render( queue: &Queue, engine: &mut Engine, ) -> Result<(), Box> { + #[allow(unused)] let shaders = shaders::init_shaders(device, engine)?; let full_shaders = shaders::full_shaders(device, engine)?; let scene = test_scene::gen_test_scene(); - dump_scene_info(&scene); + //test_scene::dump_scene_info(&scene); //let (recording, buf) = render::render(&scene, &shaders); let (recording, buf) = render::render_full(&scene, &full_shaders); let downloads = engine.run_recording(&device, &queue, &recording)?; diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs index 99c068e..690e681 100644 --- a/piet-wgsl/src/render.rs +++ b/piet-wgsl/src/render.rs @@ -86,7 +86,6 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) { [config_buf, scene_buf, reduced_buf, tagmonoid_buf], ); - let n_pathtag = data.pathseg_stream.len(); let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG; // TODO: more principled size calc let tiles_buf = BufProxy::new(4097 * 8); @@ -125,7 +124,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy let data = scene.data(); let n_pathtag = data.tag_stream.len(); let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG); - let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize); + // TODO: can compute size accurately, avoid reallocation let mut scene: Vec = Vec::with_capacity(pathtag_padded); let pathtag_base = size_to_words(scene.len()); scene.extend(&data.tag_stream); @@ -160,6 +159,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy let scene_buf = recording.upload(scene); let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned()); + let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize); let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE); // TODO: really only need pathtag_wgs - 1 recording.dispatch( @@ -169,7 +169,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy ); let tagmonoid_buf = - BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE); + BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE); recording.dispatch( shaders.pathtag_scan, (pathtag_wgs as u32, 1, 1), @@ -182,7 +182,6 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy (drawobj_wgs, 1, 1), [config_buf, path_bbox_buf], ); - let n_pathtag = data.pathseg_stream.len(); let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE); let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG; recording.dispatch( diff --git a/piet-wgsl/src/test_scene.rs b/piet-wgsl/src/test_scene.rs index ab7d3e3..4cfe1d4 100644 --- a/piet-wgsl/src/test_scene.rs +++ b/piet-wgsl/src/test_scene.rs @@ -48,6 +48,7 @@ pub fn gen_test_scene() -> Scene { scene } +#[allow(unused)] pub fn dump_scene_info(scene: &Scene) { let data = scene.data(); println!("tags {:?}", data.tag_stream);