Still one flaw: fat lines aren't expanded with strokes in path coarse rasterization. But that's a small visual ding, and it can be fixed.

That said, there is some really strange stuff going on in tile_alloc. It's using a storage buffer to do a uniform broadcast (of the result of the bump allocation for the workgroup), which is not great at all. It should be using workgroup storage, but on my Mac it behaves as if the workgroup barrier is not in place. Investigating.
Raph Levien 2022-11-04 21:41:37 -07:00
parent 17a74fb370
commit 494f523c41
7 changed files with 38 additions and 25 deletions
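
The tile_alloc issue described in the commit message is essentially a workgroup-uniform broadcast: one invocation bump-allocates tile storage for the whole workgroup, and every other invocation needs to see the resulting offset. Below is a minimal, self-contained sketch of the workgroup-memory version of that pattern, roughly the shape of the code this commit removes; the binding and constant names are hypothetical, not the real tile_alloc.wgsl interface.

struct BumpAllocators {
    tile: atomic<u32>,
}

// Hypothetical bindings, for illustration only.
@group(0) @binding(0) var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(1) var<storage, read_write> tile_offsets: array<u32>;

const WG_SIZE = 256u;

// A single word of workgroup-shared memory is enough to broadcast the
// allocation result to every invocation in the workgroup.
var<workgroup> sh_tile_offset: u32;

@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
) {
    // For simplicity, pretend each invocation needs exactly one tile; the
    // real shader first computes a prefix sum of per-path tile counts.
    if local_id.x == 0u {
        // One atomic bump allocation covers the whole workgroup.
        sh_tile_offset = atomicAdd(&bump.tile, WG_SIZE);
    }
    // After this barrier every invocation should observe sh_tile_offset;
    // the commit message reports behavior as if the barrier were absent,
    // hence the storageBarrier() workaround in the tile_alloc diff below.
    workgroupBarrier();
    tile_offsets[global_id.x] = sh_tile_offset + local_id.x;
}

The tile_alloc diff below instead publishes the offset through the paths storage buffer and re-reads it between two storageBarrier() calls, as a workaround for the workgroup-memory broadcast misbehaving on the author's machine.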


@@ -99,7 +99,6 @@ fn alloc_cmd(size: u32) {
 fn write_path(tile: Tile, linewidth: f32) {
     // TODO: take flags
-    // TODO: handle stroke
     alloc_cmd(3u);
     if linewidth < 0.0 {
         if tile.segments != 0u {


@@ -68,8 +68,28 @@ fn main(
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     let ix = global_id.x;
+    // Reduce prefix of workgroups up to this one
+    var agg = draw_monoid_identity();
+    if local_id.x < wg_id.x {
+        agg = reduced[local_id.x];
+    }
+    sh_scratch[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_scratch[local_id.x + (1u << i)];
+            agg = combine_draw_monoid(agg, other);
+        }
+        workgroupBarrier();
+        sh_scratch[local_id.x] = agg;
+    }
+    // Two barriers can be eliminated if we use separate shared arrays
+    // for prefix and intra-workgroup prefix sum.
+    workgroupBarrier();
+    var m = sh_scratch[0];
+    workgroupBarrier();
     let tag_word = scene[config.drawtag_base + ix];
-    var agg = map_draw_tag(tag_word);
+    agg = map_draw_tag(tag_word);
     sh_scratch[local_id.x] = agg;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
         workgroupBarrier();
@@ -81,12 +101,6 @@ fn main(
         sh_scratch[local_id.x] = agg;
     }
     workgroupBarrier();
-    var m = draw_monoid_identity();
-    if wg_id.x > 0u {
-        // TODO: separate dispatch to scan these, or integrate into this one?
-        // In the meantime, will be limited to 2 * WG draw objs.
-        m = reduced[wg_id.x - 1u];
-    }
     if local_id.x > 0u {
         m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
     }


@@ -43,13 +43,13 @@ fn main(
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     var agg = tag_monoid_identity();
-    if (local_id.x < wg_id.x) {
+    if local_id.x < wg_id.x {
         agg = reduced[local_id.x];
     }
     sh_parent[local_id.x] = agg;
     for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
         workgroupBarrier();
-        if (local_id.x + (1u << i) < WG_SIZE) {
+        if local_id.x + (1u << i) < WG_SIZE {
             let other = sh_parent[local_id.x + (1u << i)];
             agg = combine_tag_monoid(agg, other);
         }
@@ -63,7 +63,7 @@ fn main(
     sh_monoid[local_id.x] = agg;
     for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
         workgroupBarrier();
-        if (local_id.x >= 1u << i) {
+        if local_id.x >= 1u << i {
             let other = sh_monoid[local_id.x - (1u << i)];
             agg = combine_tag_monoid(other, agg);
         }
@@ -72,7 +72,7 @@ fn main(
     }
     // prefix up to this workgroup
     var tm = sh_parent[0];
-    if (local_id.x > 0u) {
+    if local_id.x > 0u {
         tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
     }
     // exclusive prefix sum, granularity of 4 tag bytes


@@ -85,14 +85,14 @@ fn main(
         workgroupBarrier();
         sh_tile_count[local_id.x] = total_tile_count;
     }
-    workgroupBarrier();
-    // should be able to avoid a barrier by adding total_tile count from
-    // thread WG_SIZE - 1, but it doesn't work
-    if local_id.x == 0u {
-        sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
+    if local_id.x == WG_SIZE - 1u {
+        paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
     }
-    workgroupBarrier();
-    let tile_offset = sh_tile_offset;
+    // Using storage barriers is a workaround for what appears to be a miscompilation
+    // when a normal workgroup-shared variable is used to broadcast the value.
+    storageBarrier();
+    let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
+    storageBarrier();
     if drawobj_ix < config.n_drawobj {
         let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
         let bbox = vec4<u32>(ux0, uy0, ux1, uy1);


@@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter};
 use engine::Engine;
-use test_scene::dump_scene_info;
 use wgpu::{Device, Limits, Queue};

 mod engine;
@@ -70,10 +69,11 @@ async fn do_render(
     queue: &Queue,
     engine: &mut Engine,
 ) -> Result<(), Box<dyn std::error::Error>> {
+    #[allow(unused)]
     let shaders = shaders::init_shaders(device, engine)?;
     let full_shaders = shaders::full_shaders(device, engine)?;
     let scene = test_scene::gen_test_scene();
-    dump_scene_info(&scene);
+    //test_scene::dump_scene_info(&scene);
     //let (recording, buf) = render::render(&scene, &shaders);
     let (recording, buf) = render::render_full(&scene, &full_shaders);
     let downloads = engine.run_recording(&device, &queue, &recording)?;


@@ -86,7 +86,6 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
         [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
     );
-    let n_pathtag = data.pathseg_stream.len();
     let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     // TODO: more principled size calc
     let tiles_buf = BufProxy::new(4097 * 8);
@@ -125,7 +124,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let data = scene.data();
     let n_pathtag = data.tag_stream.len();
     let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
-    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
+    // TODO: can compute size accurately, avoid reallocation
     let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
     let pathtag_base = size_to_words(scene.len());
     scene.extend(&data.tag_stream);
@@ -160,6 +159,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let scene_buf = recording.upload(scene);
     let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
+    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
     let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
     // TODO: really only need pathtag_wgs - 1
     recording.dispatch(
@@ -169,7 +169,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     );
     let tagmonoid_buf =
-        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE);
+        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE);
     recording.dispatch(
         shaders.pathtag_scan,
         (pathtag_wgs as u32, 1, 1),
@@ -182,7 +182,6 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         (drawobj_wgs, 1, 1),
         [config_buf, path_bbox_buf],
     );
-    let n_pathtag = data.pathseg_stream.len();
     let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
     let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     recording.dispatch(


@@ -48,6 +48,7 @@ pub fn gen_test_scene() -> Scene {
     scene
 }

+#[allow(unused)]
 pub fn dump_scene_info(scene: &Scene) {
     let data = scene.data();
     println!("tags {:?}", data.tag_stream);