Still one flaw: fat lines aren't expanded with strokes in path coarse rasterization. But that's a small visual ding, and it can be fixed.

That said, there is some really strange stuff going on in tile_alloc. It's using a storage buffer to do a uniform broadcast (of the result of the bump allocation for the workgroup), which is not great at all. It should be using workgroup storage, but on my Mac it behaves as if the workgroup barrier is not in place. Investigating.
Raph Levien 2022-11-04 21:41:37 -07:00
parent 17a74fb370
commit 494f523c41
7 changed files with 38 additions and 25 deletions
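
The tile_alloc issue described in the commit message is essentially a workgroup-uniform broadcast: one invocation bump-allocates tile storage for the whole workgroup, and every other invocation needs to see the resulting offset. Below is a minimal, self-contained sketch of the workgroup-memory version of that pattern, roughly the shape of the code this commit removes; the binding and constant names are hypothetical, not the real tile_alloc.wgsl interface.

struct BumpAllocators {
    tile: atomic<u32>,
}

// Hypothetical bindings, for illustration only.
@group(0) @binding(0) var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(1) var<storage, read_write> tile_offsets: array<u32>;

const WG_SIZE = 256u;

// A single word of workgroup-shared memory is enough to broadcast the
// allocation result to every invocation in the workgroup.
var<workgroup> sh_tile_offset: u32;

@compute @workgroup_size(256)
fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
) {
    // For simplicity, pretend each invocation needs exactly one tile; the
    // real shader first computes a prefix sum of per-path tile counts.
    if local_id.x == 0u {
        // One atomic bump allocation covers the whole workgroup.
        sh_tile_offset = atomicAdd(&bump.tile, WG_SIZE);
    }
    // After this barrier every invocation should observe sh_tile_offset;
    // the commit message reports behavior as if the barrier were absent,
    // hence the storageBarrier() workaround in the tile_alloc diff below.
    workgroupBarrier();
    tile_offsets[global_id.x] = sh_tile_offset + local_id.x;
}

The tile_alloc diff below instead publishes the offset through the paths storage buffer and re-reads it between two storageBarrier() calls, as a workaround for the workgroup-memory broadcast misbehaving on the author's machine.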


@@ -99,7 +99,6 @@ fn alloc_cmd(size: u32) {
 fn write_path(tile: Tile, linewidth: f32) {
     // TODO: take flags
-    // TODO: handle stroke
     alloc_cmd(3u);
     if linewidth < 0.0 {
         if tile.segments != 0u {


@@ -68,8 +68,28 @@ fn main(
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     let ix = global_id.x;
+    // Reduce prefix of workgroups up to this one
+    var agg = draw_monoid_identity();
+    if local_id.x < wg_id.x {
+        agg = reduced[local_id.x];
+    }
+    sh_scratch[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_scratch[local_id.x + (1u << i)];
+            agg = combine_draw_monoid(agg, other);
+        }
+        workgroupBarrier();
+        sh_scratch[local_id.x] = agg;
+    }
+    // Two barriers can be eliminated if we use separate shared arrays
+    // for prefix and intra-workgroup prefix sum.
+    workgroupBarrier();
+    var m = sh_scratch[0];
+    workgroupBarrier();
     let tag_word = scene[config.drawtag_base + ix];
-    var agg = map_draw_tag(tag_word);
+    agg = map_draw_tag(tag_word);
     sh_scratch[local_id.x] = agg;
     for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
         workgroupBarrier();
@@ -81,12 +101,6 @@ fn main(
         sh_scratch[local_id.x] = agg;
     }
     workgroupBarrier();
-    var m = draw_monoid_identity();
-    if wg_id.x > 0u {
-        // TODO: separate dispatch to scan these, or integrate into this one?
-        // In the meantime, will be limited to 2 * WG draw objs.
-        m = reduced[wg_id.x - 1u];
-    }
     if local_id.x > 0u {
         m = combine_draw_monoid(m, sh_scratch[local_id.x - 1u]);
     }


@@ -43,13 +43,13 @@ fn main(
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     var agg = tag_monoid_identity();
-    if (local_id.x < wg_id.x) {
+    if local_id.x < wg_id.x {
         agg = reduced[local_id.x];
     }
     sh_parent[local_id.x] = agg;
     for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
         workgroupBarrier();
-        if (local_id.x + (1u << i) < WG_SIZE) {
+        if local_id.x + (1u << i) < WG_SIZE {
             let other = sh_parent[local_id.x + (1u << i)];
             agg = combine_tag_monoid(agg, other);
         }
@@ -63,7 +63,7 @@ fn main(
     sh_monoid[local_id.x] = agg;
     for (var i = 0u; i < LG_WG_SIZE; i += 1u) {
         workgroupBarrier();
-        if (local_id.x >= 1u << i) {
+        if local_id.x >= 1u << i {
             let other = sh_monoid[local_id.x - (1u << i)];
             agg = combine_tag_monoid(other, agg);
         }
@@ -72,7 +72,7 @@ fn main(
     }
     // prefix up to this workgroup
     var tm = sh_parent[0];
-    if (local_id.x > 0u) {
+    if local_id.x > 0u {
         tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]);
     }
     // exclusive prefix sum, granularity of 4 tag bytes


@@ -85,14 +85,14 @@ fn main(
         workgroupBarrier();
         sh_tile_count[local_id.x] = total_tile_count;
     }
-    workgroupBarrier();
-    // should be able to avoid a barrier by adding total_tile count from
-    // thread WG_SIZE - 1, but it doesn't work
-    if local_id.x == 0u {
-        sh_tile_offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
+    if local_id.x == WG_SIZE - 1u {
+        paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
     }
-    workgroupBarrier();
-    let tile_offset = sh_tile_offset;
+    // Using storage barriers is a workaround for what appears to be a miscompilation
+    // when a normal workgroup-shared variable is used to broadcast the value.
+    storageBarrier();
+    let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
+    storageBarrier();
     if drawobj_ix < config.n_drawobj {
         let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
         let bbox = vec4<u32>(ux0, uy0, ux1, uy1);


@@ -20,7 +20,6 @@ use std::{fs::File, io::BufWriter};
 use engine::Engine;
-use test_scene::dump_scene_info;
 use wgpu::{Device, Limits, Queue};

 mod engine;
@@ -70,10 +69,11 @@ async fn do_render(
     queue: &Queue,
     engine: &mut Engine,
 ) -> Result<(), Box<dyn std::error::Error>> {
+    #[allow(unused)]
     let shaders = shaders::init_shaders(device, engine)?;
     let full_shaders = shaders::full_shaders(device, engine)?;
     let scene = test_scene::gen_test_scene();
-    dump_scene_info(&scene);
+    //test_scene::dump_scene_info(&scene);
     //let (recording, buf) = render::render(&scene, &shaders);
     let (recording, buf) = render::render_full(&scene, &full_shaders);
     let downloads = engine.run_recording(&device, &queue, &recording)?;


@@ -86,7 +86,6 @@ pub fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
         [config_buf, scene_buf, reduced_buf, tagmonoid_buf],
     );
-    let n_pathtag = data.pathseg_stream.len();
     let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     // TODO: more principled size calc
     let tiles_buf = BufProxy::new(4097 * 8);
@@ -125,7 +124,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let data = scene.data();
     let n_pathtag = data.tag_stream.len();
     let pathtag_padded = align_up(n_pathtag, 4 * shaders::PATHTAG_REDUCE_WG);
-    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
+    // TODO: can compute size accurately, avoid reallocation
     let mut scene: Vec<u8> = Vec::with_capacity(pathtag_padded);
     let pathtag_base = size_to_words(scene.len());
     scene.extend(&data.tag_stream);
@@ -160,6 +159,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     let scene_buf = recording.upload(scene);
     let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
+    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
     let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
     // TODO: really only need pathtag_wgs - 1
     recording.dispatch(
@@ -169,7 +169,7 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
     );
     let tagmonoid_buf =
-        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_SIZE);
+        BufProxy::new(pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE);
     recording.dispatch(
         shaders.pathtag_scan,
         (pathtag_wgs as u32, 1, 1),
@@ -182,7 +182,6 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
         (drawobj_wgs, 1, 1),
         [config_buf, path_bbox_buf],
     );
-    let n_pathtag = data.pathseg_stream.len();
     let cubic_buf = BufProxy::new(n_pathtag as u64 * CUBIC_SIZE);
     let path_coarse_wgs = (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
     recording.dispatch(


@@ -48,6 +48,7 @@ pub fn gen_test_scene() -> Scene {
     scene
 }

+#[allow(unused)]
 pub fn dump_scene_info(scene: &Scene) {
     let data = scene.data();
     println!("tags {:?}", data.tag_stream);