mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Tiger!
Still one flaw: fat lines aren't expanded with strokes in path coarse rasterization. But that's a small visual ding, and can be fixed. That said, there is some really strange stuff going on in tile_alloc. It's using storage to do a uniform broadcast (the result of bump allocation for the workgroup), which is not great at all. It should be using workgroup storage, but on my mac it behaves as if the workgroup barrier is not in place. Investigating.
This commit is contained in:
parent
17a74fb370
commit
494f523c41
|
@ -99,7 +99,6 @@ fn alloc_cmd(size: u32) {
|
|||
|
||||
fn write_path(tile: Tile, linewidth: f32) {
|
||||
// TODO: take flags
|
||||
// TODO: handle stroke
|
||||
alloc_cmd(3u);
|
||||
if linewidth < 0.0 {
|
||||
if tile.segments != 0u {
|
||||
|
|
|
@ -68,8 +68,28 @@ fn main(
|
|||
@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
) {
|
||||
let ix = global_id.x;
|
||||
// Reduce prefix of workgroups up to this one
|
||||
var agg = draw_monoid_identity();
|
||||
if local_id.x < wg_id.x {
|
||||
agg = reduced[local_id.x];
|
||||
}
|
||||
sh_scratch[local_id.x] = agg;
|
||||
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
||||
workgroupBarrier();
|
||||
if local_id.x + (1u << i) < WG_SIZE {
|
||||
let other = sh_scratch[local_id.x + (1u << i)];
|
||||
agg = combine_draw_monoid(agg, other);
|
||||
}
|
||||
workgroupBarrier();
|
||||
sh_scratch[local_id.x] = agg;
|
||||
}
|
||||
// Two barriers can be eliminated if we use separate shared arrays
|
||||
// for prefix and intra-workgroup prefix sum.
|
||||
workgroupBarrier();
|
||||
var m = sh_scratch[0];
|
||||
workgroupBarrier();
|
||||
let tag_word = scene[config.drawtag_base + ix];
|
||||
var agg = map_draw_tag(tag_word);
|
||||
agg = map_draw_tag(tag_word);
|
||||
sh_scratch[local_id.x] = agg;
|
||||
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
||||
workgroupBarrier();
|
||||
|
@ -81,12 +101,6 @@ fn main(
|
|||
sh_scratch[local_id.x] = agg;
|
||||
}
|
||||
workgroupBarrier();
|
||||
var m = draw_monoid_identity();
|
||||
if wg_id.x > 0u {
|
||||
// TODO: separate dispatch to scan these, or integrate into this one?
|
||||
// In the meantime, will be limited to 2 * WG draw objs.
|
||||
m = reduced[wg_id.x - 1u];
|
||||
}
|
||||
if local_id.x > 0u {
|
||||
m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
|
||||
}
|
||||
|
|
|
@ -43,13 +43,13 @@ fn main(
|
|||
@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
) {
|
||||
var agg = tag_monoid_identity();
|
||||
if (local_id.x < wg_id.x) {
|
||||
if local_id.x < wg_id.x {
|
||||
agg = reduced[local_id.x];
|
||||
}
|
||||
sh_parent[local_id.x] = agg;
|
||||
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
|
||||
workgroupBarrier();
|
||||
if (local_id.x + (1u << i) < WG_SIZE) {
|
||||
if local_id.x + (1u << i) < WG_SIZE {
|
||||
let other = sh_parent[local_id.x + (1u << i)];
|
||||
agg = combine_tag_monoid(agg, other);
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ fn main(
|
|||
sh_monoid[local_id.x] = agg;
|
||||
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
|
||||
workgroupBarrier();
|
||||
if (local_id.x >= 1u << i) {
|
||||
if local_id.x >= 1u << i {
|
||||
let other = sh_monoid[local_id.x - (1u << i)];
|
||||
agg = combine_tag_monoid(other, agg);
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ fn main(
|
|||
}
|
||||
// prefix up to this workgroup
|
||||
var tm = sh_parent[0];
|
||||
if (local_id.x > 0u) {
|
||||
if local_id.x > 0u {
|
||||
tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
|
||||
}
|
||||
// exclusive prefix sum, granularity of 4 tag bytes
|
||||
|
|
|
@ -85,14 +85,14 @@ fn main(
|
|||
workgroupBarrier();
|
||||
sh_tile_count[local_id.x] = total_tile_count;
|
||||
}
|
||||
workgroupBarrier();
|
||||
// should be able to avoid a barrier by adding total_tile count from
|
||||
// thread WG_SIZE - 1, but it doesn't work
|
||||
if local_id.x == 0u {
|
||||
sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
|
||||
if local_id.x == WG_SIZE - 1u {
|
||||
paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
|
||||
}
|
||||
workgroupBarrier();
|
||||
let tile_offset = sh_tile_offset;
|
||||
// Using storage barriers is a workaround for what appears to be a miscompilation
|
||||
// when a normal workgroup-shared variable is used to broadcast the value.
|
||||
storageBarrier();
|
||||
let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
|
||||
storageBarrier();
|
||||
if drawobj_ix < config.n_drawobj {
|
||||
let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
|
||||
let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
|
||||
|
|
|
@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter};
|
|||
|
||||
use engine::Engine;
|
||||
|
||||
use test_scene::dump_scene_info;
|
||||
use wgpu::{Device, Limits, Queue};
|
||||
|
||||
mod engine;
|
||||
|
@ -70,10 +69,11 @@ async fn do_render(
|
|||
queue: &Queue,
|
||||
engine: &mut Engine,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
#[allow(unused)]
|
||||
let shaders = shaders::init_shaders(device, engine)?;
|
||||
let full_shaders = shaders::full_shaders(device, engine)?;
|
||||
let scene = test_scene::gen_test_scene();
|
||||
dump_scene_info(&scene);
|
||||
//test_scene::dump_scene_info(&scene);
|
||||
//let (recording, buf) = render::render(&scene, &shaders);
|
||||
let (recording, buf) = render::render_full(&scene, &full_shaders);
|
||||
let downloads = engine.run_recording(&device, &queue, &recording)?;
|
||||
|
|
|
@ -86,7 +86,6 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
|
|||
[config_buf, scene_buf, reduced_buf, tagmonoid_buf],
|
||||
);
|
||||
|
||||
let n_pathtag = data.pathseg_stream.len();
|
||||
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
|
||||
// TODO: more principled size calc
|
||||
let tiles_buf = BufProxy::new(4097 * 8);
|
||||
|
@ -125,7 +124,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
|||
let data = scene.data();
|
||||
let n_pathtag = data.tag_stream.len();
|
||||
let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
|
||||
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
|
||||
// TODO: can compute size accurately, avoid reallocation
|
||||
let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
|
||||
let pathtag_base = size_to_words(scene.len());
|
||||
scene.extend(&data.tag_stream);
|
||||
|
@ -160,6 +159,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
|||
let scene_buf = recording.upload(scene);
|
||||
let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
|
||||
|
||||
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
|
||||
let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
|
||||
// TODO: really only need pathtag_wgs - 1
|
||||
recording.dispatch(
|
||||
|
@ -169,7 +169,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
|||
);
|
||||
|
||||
let tagmonoid_buf =
|
||||
BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE);
|
||||
BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE);
|
||||
recording.dispatch(
|
||||
shaders.pathtag_scan,
|
||||
(pathtag_wgs as u32, 1, 1),
|
||||
|
@ -182,7 +182,6 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
|||
(drawobj_wgs, 1, 1),
|
||||
[config_buf, path_bbox_buf],
|
||||
);
|
||||
let n_pathtag = data.pathseg_stream.len();
|
||||
let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
|
||||
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
|
||||
recording.dispatch(
|
||||
|
|
|
@ -48,6 +48,7 @@ pub fn gen_test_scene() -> Scene {
|
|||
scene
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub fn dump_scene_info(scene: &Scene) {
|
||||
let data = scene.data();
|
||||
println!("tags {:?}", data.tag_stream);
|
||||
|
|
Loading…
Reference in a new issue