Still one flaw: fat lines aren't expanded with strokes in path coarse rasterization. But that's a small visual ding, and can be fixed.

That said, there is some really strange stuff going on in tile_alloc. It's using storage to do a uniform broadcast (the result of bump allocation for the workgroup), which is not great at all. It should be using workgroup storage, but on my Mac it behaves as if the workgroup barrier is not in place. Investigating.
This commit is contained in:
Raph Levien 2022-11-04 21:41:37 -07:00
parent 17a74fb370
commit 494f523c41
7 changed files with 38 additions and 25 deletions

View file

@ -99,7 +99,6 @@ fn alloc_cmd(size: u32) {
fn write_path(tile: Tile, linewidth: f32) {
// TODO: take flags
// TODO: handle stroke
alloc_cmd(3u);
if linewidth < 0.0 {
if tile.segments != 0u {

View file

@ -68,8 +68,28 @@ fn main(
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
let ix = global_id.x;
// Reduce prefix of workgroups up to this one
var agg = draw_monoid_identity();
if local_id.x < wg_id.x {
agg = reduced[local_id.x];
}
sh_scratch[local_id.x] = agg;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_scratch[local_id.x + (1u << i)];
agg = combine_draw_monoid(agg, other);
}
workgroupBarrier();
sh_scratch[local_id.x] = agg;
}
// Two barriers can be eliminated if we use separate shared arrays
// for prefix and intra-workgroup prefix sum.
workgroupBarrier();
var m = sh_scratch[0];
workgroupBarrier();
let tag_word = scene[config.drawtag_base + ix];
var agg = map_draw_tag(tag_word);
agg = map_draw_tag(tag_word);
sh_scratch[local_id.x] = agg;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
@ -81,12 +101,6 @@ fn main(
sh_scratch[local_id.x] = agg;
}
workgroupBarrier();
var m = draw_monoid_identity();
if wg_id.x > 0u {
// TODO: separate dispatch to scan these, or integrate into this one?
// In the meantime, will be limited to 2 * WG draw objs.
m = reduced[wg_id.x - 1u];
}
if local_id.x > 0u {
m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
}

View file

@ -43,13 +43,13 @@ fn main(
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
var agg = tag_monoid_identity();
if (local_id.x < wg_id.x) {
if local_id.x < wg_id.x {
agg = reduced[local_id.x];
}
sh_parent[local_id.x] = agg;
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
workgroupBarrier();
if (local_id.x + (1u << i) < WG_SIZE) {
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_parent[local_id.x + (1u << i)];
agg = combine_tag_monoid(agg, other);
}
@ -63,7 +63,7 @@ fn main(
sh_monoid[local_id.x] = agg;
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
workgroupBarrier();
if (local_id.x >= 1u << i) {
if local_id.x >= 1u << i {
let other = sh_monoid[local_id.x - (1u << i)];
agg = combine_tag_monoid(other, agg);
}
@ -72,7 +72,7 @@ fn main(
}
// prefix up to this workgroup
var tm = sh_parent[0];
if (local_id.x > 0u) {
if local_id.x > 0u {
tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
}
// exclusive prefix sum, granularity of 4 tag bytes

View file

@ -85,14 +85,14 @@ fn main(
workgroupBarrier();
sh_tile_count[local_id.x] = total_tile_count;
}
workgroupBarrier();
// should be able to avoid a barrier by adding total_tile count from
// thread WG_SIZE - 1, but it doesn't work
if local_id.x == 0u {
sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
if local_id.x == WG_SIZE - 1u {
paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
}
workgroupBarrier();
let tile_offset = sh_tile_offset;
// Using storage barriers is a workaround for what appears to be a miscompilation
// when a normal workgroup-shared variable is used to broadcast the value.
storageBarrier();
let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
storageBarrier();
if drawobj_ix < config.n_drawobj {
let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
let bbox = vec4<u32>(ux0, uy0, ux1, uy1);

View file

@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter};
use engine::Engine;
use test_scene::dump_scene_info;
use wgpu::{Device, Limits, Queue};
mod engine;
@ -70,10 +69,11 @@ async fn do_render(
queue: &Queue,
engine: &mut Engine,
) -> Result<(), Box<dyn std::error::Error>> {
#[allow(unused)]
let shaders = shaders::init_shaders(device, engine)?;
let full_shaders = shaders::full_shaders(device, engine)?;
let scene = test_scene::gen_test_scene();
dump_scene_info(&scene);
//test_scene::dump_scene_info(&scene);
//let (recording, buf) = render::render(&scene, &shaders);
let (recording, buf) = render::render_full(&scene, &full_shaders);
let downloads = engine.run_recording(&device, &queue, &recording)?;

View file

@ -86,7 +86,6 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
[config_buf, scene_buf, reduced_buf, tagmonoid_buf],
);
let n_pathtag = data.pathseg_stream.len();
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
// TODO: more principled size calc
let tiles_buf = BufProxy::new(4097 * 8);
@ -125,7 +124,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
let data = scene.data();
let n_pathtag = data.tag_stream.len();
let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
// TODO: can compute size accurately, avoid reallocation
let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
let pathtag_base = size_to_words(scene.len());
scene.extend(&data.tag_stream);
@ -160,6 +159,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
let scene_buf = recording.upload(scene);
let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
// TODO: really only need pathtag_wgs - 1
recording.dispatch(
@ -169,7 +169,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
);
let tagmonoid_buf =
BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE);
BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE);
recording.dispatch(
shaders.pathtag_scan,
(pathtag_wgs as u32, 1, 1),
@ -182,7 +182,6 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
(drawobj_wgs, 1, 1),
[config_buf, path_bbox_buf],
);
let n_pathtag = data.pathseg_stream.len();
let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
recording.dispatch(

View file

@ -48,6 +48,7 @@ pub fn gen_test_scene() -> Scene {
scene
}
#[allow(unused)]
pub fn dump_scene_info(scene: &Scene) {
let data = scene.data();
println!("tags {:?}", data.tag_stream);