Initial GPU-side work for robust memory

This should handle everything on the GPU side except for blend stack loading/storing in the fine shader (fine.wgsl).
2025-01-08 20:01:30 +11:00 · 2023-01-17 14:08:20 -05:00 · 2023-01-17 14:08:20 -05:00 · 1e8d194b6a
parent 3933c159a8
commit 1e8d194b6a
8 changed files with 93 additions and 18 deletions
--- a/shader/binning.wgsl
+++ b/shader/binning.wgsl
@ -127,7 +127,11 @@ fn main(
        sh_count[i][local_id.x] = element_count_packed;
    }
    // element_count is the number of draw objects covering this thread's bin
-    let chunk_offset = atomicAdd(&bump.binning, element_count);
+    var chunk_offset = atomicAdd(&bump.binning, element_count);
+    if chunk_offset > bump.binning_size {
+        chunk_offset = 0u;
+        atomicOr(&bump.failed, STAGE_BINNING);
+    }    
    sh_chunk_offset[local_id.x] = chunk_offset;
    bin_header[global_id.x].element_count = element_count;
    bin_header[global_id.x].chunk_offset = chunk_offset;
--- a/shader/coarse.wgsl
+++ b/shader/coarse.wgsl
@ -70,8 +70,11 @@ fn alloc_cmd(size: u32) {
        // We might be able to save a little bit of computation here
        // by setting the initial value of the bump allocator.
        let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC;
-        let new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
-        // TODO: robust memory
+        var new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
+        if new_cmd > bump.ptcl_size {
+            new_cmd = 0u;
+            atomicOr(&bump.failed, STAGE_COARSE);
+        }
        ptcl[cmd_offset] = CMD_JUMP;
        ptcl[cmd_offset + 1u] = new_cmd;
        cmd_offset = new_cmd;
@ -142,6 +145,9 @@ fn main(
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
+    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) != 0u {
+        return;
+    }     
    let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
    let bin_ix = width_in_bins * wg_id.y + wg_id.x;
    let n_partitions = (config.n_drawobj + N_TILE - 1u) / N_TILE;
@ -170,6 +176,10 @@ fn main(
    var render_blend_depth = 0u;
    var max_blend_depth = 0u;

+    let blend_offset = cmd_offset;
+    cmd_offset += 1u;
+    cmd_limit -= 1u;
+
    while true {
        for (var i = 0u; i < N_SLICE; i += 1u) {
            atomicStore(&sh_bitmaps[i][local_id.x], 0u);
@ -401,6 +411,9 @@ fn main(
    }
    if bin_tile_x + tile_x < config.width_in_tiles && bin_tile_y + tile_y < config.height_in_tiles {
        ptcl[cmd_offset] = CMD_END;
-        // TODO: blend stack allocation
+        if max_blend_depth > BLEND_STACK_SPLIT {
+            let scratch_size = max_blend_depth * TILE_WIDTH * TILE_HEIGHT * 4u;
+            ptcl[blend_offset] = atomicAdd(&bump.blend, scratch_size);
+        }
    }
 }
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@ -27,7 +27,6 @@ var<storage> segments: array<Segment>;
 #import ptcl

 let GRADIENT_WIDTH = 512;
-let BLEND_STACK_SPLIT = 4u;

@group(0) @binding(3)
 var output: texture_storage_2d<rgba8unorm, write>;
@ -192,7 +191,8 @@ fn main(
    var clip_depth = 0u;
    var area: array<f32, PIXELS_PER_THREAD>;
    var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
-
+    let blend_offset = ptcl[cmd_ix];
+    cmd_ix += 1u;
    // main interpretation loop
    while true {
        let tag = ptcl[cmd_ix];
--- a/shader/path_coarse_full.wgsl
+++ b/shader/path_coarse_full.wgsl
@ -93,7 +93,12 @@ fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32
 }

 fn alloc_segment() -> u32 {
-    return atomicAdd(&bump.segments, 1u) + 1u;
+    // Bump-allocate a single segment slot. Because of the `+ 1u`, successful
+    // offsets start at 1, so 0 can double as the value returned on failure.
+    var offset = atomicAdd(&bump.segments, 1u) + 1u;
+    // The host sizes the segment buffer to hold exactly `segments_size`
+    // elements, so an offset equal to that size is already past the end and
+    // must be rejected as well (hence `>=` rather than `>`).
+    // NOTE(review): assumes callers use the result as an element index into
+    // the segment buffer -- confirm against the call sites.
+    if offset >= bump.segments_size {
+        offset = 0u;
+        // Record the failure so later stages and the host can detect it.
+        atomicOr(&bump.failed, STAGE_PATH_COARSE);
+    }
+    return offset;
 }

 let MAX_QUADS = 16u;
@ -102,6 +107,9 @@ let MAX_QUADS = 16u;
 fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
 ) {
+    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u {
+        return;
+    }
    let ix = global_id.x;
    let tag_word = scene[config.pathtag_base + (ix >> 2u)];
    let shift = (ix & 3u) * 8u;
--- a/shader/shared/bump.wgsl
+++ b/shader/shared/bump.wgsl
@ -1,9 +1,21 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

-// TODO: robust memory (failure flags)
+// Bitflags for each stage that can fail allocation.
+let STAGE_BINNING: u32 = 0x1u;
+let STAGE_TILE_ALLOC: u32 = 0x2u;
+let STAGE_PATH_COARSE: u32 = 0x4u;
+let STAGE_COARSE: u32 = 0x8u;
+
 struct BumpAllocators {
+    // Bitmask of stages that have failed allocation.
+    failed: atomic<u32>, // STAGE_* bits, set with atomicOr by the failing stage
+    binning_size: u32, // limit checked against offsets taken from `binning`
+    ptcl_size: u32, // limit checked against offsets taken from `ptcl`
+    tiles_size: u32, // limit checked against offsets taken from `tile`
+    segments_size: u32, // limit checked against offsets taken from `segments`
     binning: atomic<u32>, // bump counter, advanced with atomicAdd in binning
     ptcl: atomic<u32>, // bump counter, advanced with atomicAdd in coarse
     tile: atomic<u32>, // bump counter, advanced with atomicAdd in tile_alloc
     segments: atomic<u32>, // bump counter, advanced with atomicAdd in path_coarse
+    blend: atomic<u32>, // bump counter for blend scratch, advanced in coarse
 }
--- a/shader/shared/config.wgsl
+++ b/shader/shared/config.wgsl
@ -35,3 +35,5 @@ let N_TILE_X = 16u;
 let N_TILE_Y = 16u;
 //let N_TILE = N_TILE_X * N_TILE_Y;
 let N_TILE = 256u;
+
+let BLEND_STACK_SPLIT = 4u;
--- a/shader/tile_alloc.wgsl
+++ b/shader/tile_alloc.wgsl
@ -35,6 +35,9 @@ fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
 ) {
+    if (atomicLoad(&bump.failed) & STAGE_BINNING) != 0u {
+        return;
+    }    
    // scale factors useful for converting coordinates to tiles
    // TODO: make into constants
    let SX = 1.0 / f32(TILE_WIDTH);
@ -72,8 +75,13 @@ fn main(
        sh_tile_count[local_id.x] = total_tile_count;
    }
    if local_id.x == WG_SIZE - 1u {
-        paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
-    }
+        var offset = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
+        if offset > bump.tiles_size {
+            offset = 0u;
+            atomicOr(&bump.failed, STAGE_TILE_ALLOC);
+        }
+        paths[drawobj_ix].tiles = offset;
+    }    
    // Using storage barriers is a workaround for what appears to be a miscompilation
    // when a normal workgroup-shared variable is used to broadcast the value.
    storageBarrier();
--- a/src/render.rs
+++ b/src/render.rs
@ -21,8 +21,10 @@ const CLIP_INP_SIZE: u64 = 8;
 const CLIP_BBOX_SIZE: u64 = 16;
 const PATH_SIZE: u64 = 32;
 const DRAW_BBOX_SIZE: u64 = 16;
-const BUMP_SIZE: u64 = 16;
+const BUMP_SIZE: u64 = std::mem::size_of::<BumpAllocators>() as u64;
 const BIN_HEADER_SIZE: u64 = 8;
+const TILE_SIZE: u64 = 8;
+const SEGMENT_SIZE: u64 = 24;

 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
@ -54,6 +56,22 @@ pub const fn next_multiple_of(val: u32, rhs: u32) -> u32 {
    }
 }

+// This must be kept in sync with the struct in shader/shared/bump.wgsl
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+struct BumpAllocators {
+    failed: u32, // bitmask of stages whose allocation failed; written by the shaders
+    binning_size: u32, // capacity of the bin data region, in 4-byte words (excludes the info region)
+    ptcl_size: u32, // capacity of the ptcl buffer, in 4-byte words
+    tiles_size: u32, // capacity of the tile buffer, in TILE_SIZE-byte elements
+    segments_size: u32, // capacity of the segments buffer, in SEGMENT_SIZE-byte elements
+    binning: u32, // bump counter; uploaded as zero and advanced on the GPU
+    ptcl: u32, // bump counter; uploaded as zero and advanced on the GPU
+    tile: u32, // bump counter; uploaded as zero and advanced on the GPU
+    segments: u32, // bump counter; uploaded as zero and advanced on the GPU
+    blend: u32, // bump counter for blend scratch; uploaded as zero and advanced on the GPU
+}
+
 #[allow(unused)]
 fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
    let mut recording = Recording::default();
@ -185,6 +203,22 @@ pub fn render_encoding_full(
    let scene_buf = ResourceProxy::Buf(recording.upload("scene", packed.data));
    let config_buf =
        ResourceProxy::Buf(recording.upload_uniform("config", bytemuck::bytes_of(&config)));
+    let info_size = config.layout.bin_data_start;
+    let bump = BumpAllocators {
+        binning_size: ((1 << 20) / 4) - info_size,
+        ptcl_size: (1 << 25) / 4,
+        tiles_size: (1 << 24) / TILE_SIZE as u32,
+        segments_size: (1 << 26) / SEGMENT_SIZE as u32,
+        ..Default::default()
+    };
+    let info_bin_data_buf = ResourceProxy::new_buf(
+        (info_size + bump.binning_size) as u64 * 4,
+        "info_bin_data_buf",
+    );
+    let tile_buf = ResourceProxy::new_buf(bump.tiles_size as u64 * TILE_SIZE, "tile_buf");
+    let segments_buf =
+        ResourceProxy::new_buf(bump.segments_size as u64 * SEGMENT_SIZE, "segments_buf");
+    let ptcl_buf = ResourceProxy::new_buf(bump.ptcl_size as u64 * 4, "ptcl_buf");

    let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
    let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize;
@ -267,7 +301,6 @@ pub fn render_encoding_full(
    );
    let draw_monoid_buf =
        ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE, "draw_monoid_buf");
-    let info_bin_data_buf = ResourceProxy::new_buf(1 << 20, "info_bin_data_buf");
    let clip_inp_buf =
        ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_INP_SIZE, "clip_inp_buf");
    recording.dispatch(
@ -320,14 +353,13 @@ pub fn render_encoding_full(
        );
    }
    let draw_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * DRAW_BBOX_SIZE, "draw_bbox_buf");
-    let bump_buf = BufProxy::new(BUMP_SIZE, "bump_buf");
+    let bump_buf = recording.upload("bump_buf", bytemuck::bytes_of(&bump));
    let width_in_bins = (config.width_in_tiles + 15) / 16;
    let height_in_bins = (config.height_in_tiles + 15) / 16;
    let bin_header_buf = ResourceProxy::new_buf(
        (256 * drawobj_wgs) as u64 * BIN_HEADER_SIZE,
        "bin_header_buf",
    );
-    recording.clear_all(bump_buf);
    let bump_buf = ResourceProxy::Buf(bump_buf);
    recording.dispatch(
        shaders.binning,
@ -347,7 +379,6 @@ pub fn render_encoding_full(
    // in storage rather than workgroup memory.
    let n_path_aligned = align_up(n_paths as usize, 256);
    let path_buf = ResourceProxy::new_buf(n_path_aligned as u64 * PATH_SIZE, "path_buf");
-    let tile_buf = ResourceProxy::new_buf(1 << 24, "tile_buf");
    let path_wgs = (n_paths + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
    recording.dispatch(
        shaders.tile_alloc,
@ -361,8 +392,6 @@ pub fn render_encoding_full(
            tile_buf,
        ],
    );
-
-    let segments_buf = ResourceProxy::new_buf(1 << 26, "segments_buf");
    recording.dispatch(
        shaders.path_coarse,
        (path_coarse_wgs, 1, 1),
@ -382,7 +411,6 @@ pub fn render_encoding_full(
        (path_wgs, 1, 1),
        [config_buf, path_buf, tile_buf],
    );
-    let ptcl_buf = ResourceProxy::new_buf(1 << 25, "ptcl_buf");
    recording.dispatch(
        shaders.coarse,
        (width_in_bins, height_in_bins, 1),