mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 20:51:29 +11:00
Tiger!
Still one flaw: fat lines aren't expanded with strokes in path coarse rasterization. But that's a small visual ding, and can be fixed. That said, there is some really strange stuff going on in tile_alloc. It's using storage to do a uniform broadcast (the result of bump allocation for the workgroup), which is not great at all. It should be using workgroup storage, but on my Mac it behaves as if the workgroup barrier is not in place. Investigating.
This commit is contained in:
parent
17a74fb370
commit
494f523c41
|
@ -99,7 +99,6 @@ fn alloc_cmd(size: u32) {
|
||||||
|
|
||||||
fn write_path(tile: Tile, linewidth: f32) {
|
fn write_path(tile: Tile, linewidth: f32) {
|
||||||
// TODO: take flags
|
// TODO: take flags
|
||||||
// TODO: handle stroke
|
|
||||||
alloc_cmd(3u);
|
alloc_cmd(3u);
|
||||||
if linewidth < 0.0 {
|
if linewidth < 0.0 {
|
||||||
if tile.segments != 0u {
|
if tile.segments != 0u {
|
||||||
|
|
|
@ -68,8 +68,28 @@ fn main(
|
||||||
@builtin(workgroup_id) wg_id: vec3<u32>,
|
@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||||
) {
|
) {
|
||||||
let ix = global_id.x;
|
let ix = global_id.x;
|
||||||
|
// Reduce prefix of workgroups up to this one
|
||||||
|
var agg = draw_monoid_identity();
|
||||||
|
if local_id.x < wg_id.x {
|
||||||
|
agg = reduced[local_id.x];
|
||||||
|
}
|
||||||
|
sh_scratch[local_id.x] = agg;
|
||||||
|
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
||||||
|
workgroupBarrier();
|
||||||
|
if local_id.x + (1u << i) < WG_SIZE {
|
||||||
|
let other = sh_scratch[local_id.x + (1u << i)];
|
||||||
|
agg = combine_draw_monoid(agg, other);
|
||||||
|
}
|
||||||
|
workgroupBarrier();
|
||||||
|
sh_scratch[local_id.x] = agg;
|
||||||
|
}
|
||||||
|
// Two barriers can be eliminated if we use separate shared arrays
|
||||||
|
// for prefix and intra-workgroup prefix sum.
|
||||||
|
workgroupBarrier();
|
||||||
|
var m = sh_scratch[0];
|
||||||
|
workgroupBarrier();
|
||||||
let tag_word = scene[config.drawtag_base + ix];
|
let tag_word = scene[config.drawtag_base + ix];
|
||||||
var agg = map_draw_tag(tag_word);
|
agg = map_draw_tag(tag_word);
|
||||||
sh_scratch[local_id.x] = agg;
|
sh_scratch[local_id.x] = agg;
|
||||||
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
||||||
workgroupBarrier();
|
workgroupBarrier();
|
||||||
|
@ -81,12 +101,6 @@ fn main(
|
||||||
sh_scratch[local_id.x] = agg;
|
sh_scratch[local_id.x] = agg;
|
||||||
}
|
}
|
||||||
workgroupBarrier();
|
workgroupBarrier();
|
||||||
var m = draw_monoid_identity();
|
|
||||||
if wg_id.x > 0u {
|
|
||||||
// TODO: separate dispatch to scan these, or integrate into this one?
|
|
||||||
// In the meantime, will be limited to 2 * WG draw objs.
|
|
||||||
m = reduced[wg_id.x - 1u];
|
|
||||||
}
|
|
||||||
if local_id.x > 0u {
|
if local_id.x > 0u {
|
||||||
m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
|
m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,13 +43,13 @@ fn main(
|
||||||
@builtin(workgroup_id) wg_id: vec3<u32>,
|
@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||||
) {
|
) {
|
||||||
var agg = tag_monoid_identity();
|
var agg = tag_monoid_identity();
|
||||||
if (local_id.x < wg_id.x) {
|
if local_id.x < wg_id.x {
|
||||||
agg = reduced[local_id.x];
|
agg = reduced[local_id.x];
|
||||||
}
|
}
|
||||||
sh_parent[local_id.x] = agg;
|
sh_parent[local_id.x] = agg;
|
||||||
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
|
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
|
||||||
workgroupBarrier();
|
workgroupBarrier();
|
||||||
if (local_id.x + (1u << i) < WG_SIZE) {
|
if local_id.x + (1u << i) < WG_SIZE {
|
||||||
let other = sh_parent[local_id.x + (1u << i)];
|
let other = sh_parent[local_id.x + (1u << i)];
|
||||||
agg = combine_tag_monoid(agg, other);
|
agg = combine_tag_monoid(agg, other);
|
||||||
}
|
}
|
||||||
|
@ -63,7 +63,7 @@ fn main(
|
||||||
sh_monoid[local_id.x] = agg;
|
sh_monoid[local_id.x] = agg;
|
||||||
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
|
for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
|
||||||
workgroupBarrier();
|
workgroupBarrier();
|
||||||
if (local_id.x >= 1u << i) {
|
if local_id.x >= 1u << i {
|
||||||
let other = sh_monoid[local_id.x - (1u << i)];
|
let other = sh_monoid[local_id.x - (1u << i)];
|
||||||
agg = combine_tag_monoid(other, agg);
|
agg = combine_tag_monoid(other, agg);
|
||||||
}
|
}
|
||||||
|
@ -72,7 +72,7 @@ fn main(
|
||||||
}
|
}
|
||||||
// prefix up to this workgroup
|
// prefix up to this workgroup
|
||||||
var tm = sh_parent[0];
|
var tm = sh_parent[0];
|
||||||
if (local_id.x > 0u) {
|
if local_id.x > 0u {
|
||||||
tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
|
tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
|
||||||
}
|
}
|
||||||
// exclusive prefix sum, granularity of 4 tag bytes
|
// exclusive prefix sum, granularity of 4 tag bytes
|
||||||
|
|
|
@ -85,14 +85,14 @@ fn main(
|
||||||
workgroupBarrier();
|
workgroupBarrier();
|
||||||
sh_tile_count[local_id.x] = total_tile_count;
|
sh_tile_count[local_id.x] = total_tile_count;
|
||||||
}
|
}
|
||||||
workgroupBarrier();
|
if local_id.x == WG_SIZE - 1u {
|
||||||
// should be able to avoid a barrier by adding total_tile count from
|
paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
|
||||||
// thread WG_SIZE - 1, but it doesn't work
|
|
||||||
if local_id.x == 0u {
|
|
||||||
sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
|
|
||||||
}
|
}
|
||||||
workgroupBarrier();
|
// Using storage barriers is a workaround for what appears to be a miscompilation
|
||||||
let tile_offset = sh_tile_offset;
|
// when a normal workgroup-shared variable is used to broadcast the value.
|
||||||
|
storageBarrier();
|
||||||
|
let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
|
||||||
|
storageBarrier();
|
||||||
if drawobj_ix < config.n_drawobj {
|
if drawobj_ix < config.n_drawobj {
|
||||||
let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
|
let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
|
||||||
let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
|
let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
|
||||||
|
|
|
@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter};
|
||||||
|
|
||||||
use engine::Engine;
|
use engine::Engine;
|
||||||
|
|
||||||
use test_scene::dump_scene_info;
|
|
||||||
use wgpu::{Device, Limits, Queue};
|
use wgpu::{Device, Limits, Queue};
|
||||||
|
|
||||||
mod engine;
|
mod engine;
|
||||||
|
@ -70,10 +69,11 @@ async fn do_render(
|
||||||
queue: &Queue,
|
queue: &Queue,
|
||||||
engine: &mut Engine,
|
engine: &mut Engine,
|
||||||
) -> Result<(), Box<dyn std::error::Error>> {
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
#[allow(unused)]
|
||||||
let shaders = shaders::init_shaders(device, engine)?;
|
let shaders = shaders::init_shaders(device, engine)?;
|
||||||
let full_shaders = shaders::full_shaders(device, engine)?;
|
let full_shaders = shaders::full_shaders(device, engine)?;
|
||||||
let scene = test_scene::gen_test_scene();
|
let scene = test_scene::gen_test_scene();
|
||||||
dump_scene_info(&scene);
|
//test_scene::dump_scene_info(&scene);
|
||||||
//let (recording, buf) = render::render(&scene, &shaders);
|
//let (recording, buf) = render::render(&scene, &shaders);
|
||||||
let (recording, buf) = render::render_full(&scene, &full_shaders);
|
let (recording, buf) = render::render_full(&scene, &full_shaders);
|
||||||
let downloads = engine.run_recording(&device, &queue, &recording)?;
|
let downloads = engine.run_recording(&device, &queue, &recording)?;
|
||||||
|
|
|
@ -86,7 +86,6 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
|
||||||
[config_buf, scene_buf, reduced_buf, tagmonoid_buf],
|
[config_buf, scene_buf, reduced_buf, tagmonoid_buf],
|
||||||
);
|
);
|
||||||
|
|
||||||
let n_pathtag = data.pathseg_stream.len();
|
|
||||||
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
|
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
|
||||||
// TODO: more principled size calc
|
// TODO: more principled size calc
|
||||||
let tiles_buf = BufProxy::new(4097 * 8);
|
let tiles_buf = BufProxy::new(4097 * 8);
|
||||||
|
@ -125,7 +124,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
||||||
let data = scene.data();
|
let data = scene.data();
|
||||||
let n_pathtag = data.tag_stream.len();
|
let n_pathtag = data.tag_stream.len();
|
||||||
let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
|
let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
|
||||||
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
|
// TODO: can compute size accurately, avoid reallocation
|
||||||
let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
|
let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
|
||||||
let pathtag_base = size_to_words(scene.len());
|
let pathtag_base = size_to_words(scene.len());
|
||||||
scene.extend(&data.tag_stream);
|
scene.extend(&data.tag_stream);
|
||||||
|
@ -160,6 +159,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
||||||
let scene_buf = recording.upload(scene);
|
let scene_buf = recording.upload(scene);
|
||||||
let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
|
let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
|
||||||
|
|
||||||
|
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
|
||||||
let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
|
let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
|
||||||
// TODO: really only need pathtag_wgs - 1
|
// TODO: really only need pathtag_wgs - 1
|
||||||
recording.dispatch(
|
recording.dispatch(
|
||||||
|
@ -169,7 +169,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
||||||
);
|
);
|
||||||
|
|
||||||
let tagmonoid_buf =
|
let tagmonoid_buf =
|
||||||
BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE);
|
BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE);
|
||||||
recording.dispatch(
|
recording.dispatch(
|
||||||
shaders.pathtag_scan,
|
shaders.pathtag_scan,
|
||||||
(pathtag_wgs as u32, 1, 1),
|
(pathtag_wgs as u32, 1, 1),
|
||||||
|
@ -182,7 +182,6 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
|
||||||
(drawobj_wgs, 1, 1),
|
(drawobj_wgs, 1, 1),
|
||||||
[config_buf, path_bbox_buf],
|
[config_buf, path_bbox_buf],
|
||||||
);
|
);
|
||||||
let n_pathtag = data.pathseg_stream.len();
|
|
||||||
let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
|
let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
|
||||||
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
|
let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
|
||||||
recording.dispatch(
|
recording.dispatch(
|
||||||
|
|
|
@ -48,6 +48,7 @@ pub fn gen_test_scene() -> Scene {
|
||||||
scene
|
scene
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(unused)]
|
||||||
pub fn dump_scene_info(scene: &Scene) {
|
pub fn dump_scene_info(scene: &Scene) {
|
||||||
let data = scene.data();
|
let data = scene.data();
|
||||||
println!("tags {:?}", data.tag_stream);
|
println!("tags {:?}", data.tag_stream);
|
||||||
|
|
Loading…
Reference in a new issue