diff --git a/shader/binning.wgsl b/shader/binning.wgsl
index 1783fc5..77ab462 100644
--- a/shader/binning.wgsl
+++ b/shader/binning.wgsl
@@ -127,7 +127,11 @@ fn main(
         sh_count[i][local_id.x] = element_count_packed;
     }
     // element_count is the number of draw objects covering this thread's bin
-    let chunk_offset = atomicAdd(&bump.binning, element_count);
+    var chunk_offset = atomicAdd(&bump.binning, element_count);
+    if chunk_offset + element_count > config.binning_size {
+        chunk_offset = 0u;
+        atomicOr(&bump.failed, STAGE_BINNING);
+    }
     sh_chunk_offset[local_id.x] = chunk_offset;
     bin_header[global_id.x].element_count = element_count;
     bin_header[global_id.x].chunk_offset = chunk_offset;
diff --git a/shader/coarse.wgsl b/shader/coarse.wgsl
index 8728903..cea3637 100644
--- a/shader/coarse.wgsl
+++ b/shader/coarse.wgsl
@@ -70,8 +70,11 @@ fn alloc_cmd(size: u32) {
         // We might be able to save a little bit of computation here
         // by setting the initial value of the bump allocator.
         let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC;
-        let new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
-        // TODO: robust memory
+        var new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
+        if new_cmd + PTCL_INCREMENT > config.ptcl_size {
+            new_cmd = 0u;
+            atomicOr(&bump.failed, STAGE_COARSE);
+        }
         ptcl[cmd_offset] = CMD_JUMP;
         ptcl[cmd_offset + 1u] = new_cmd;
         cmd_offset = new_cmd;
@@ -134,7 +137,7 @@ fn write_end_clip(end_clip: CmdEndClip) {
     ptcl[cmd_offset] = CMD_END_CLIP;
     ptcl[cmd_offset + 1u] = end_clip.blend;
     ptcl[cmd_offset + 2u] = bitcast<u32>(end_clip.alpha);
-    cmd_offset += 3u; 
+    cmd_offset += 3u;
 }
 
 @compute @workgroup_size(256)
@@ -142,6 +145,12 @@ fn main(
     @builtin(local_invocation_id) local_id: vec3<u32>,
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
+    // Exit early if prior stages failed, as we can't run this stage.
+    // We need to check only prior stages, as if this stage has failed in another workgroup,
+    // we still want to know this workgroup's memory requirement.
+    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) != 0u {
+        return;
+    }
     let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
     let bin_ix = width_in_bins * wg_id.y + wg_id.x;
     let n_partitions = (config.n_drawobj + N_TILE - 1u) / N_TILE;
@@ -170,6 +179,9 @@ fn main(
     var render_blend_depth = 0u;
     var max_blend_depth = 0u;
+    let blend_offset = cmd_offset;
+    cmd_offset += 1u;
+
     while true {
         for (var i = 0u; i < N_SLICE; i += 1u) {
             atomicStore(&sh_bitmaps[i][local_id.x], 0u);
         }
@@ -401,6 +413,9 @@ fn main(
     }
     if bin_tile_x + tile_x < config.width_in_tiles && bin_tile_y + tile_y < config.height_in_tiles {
         ptcl[cmd_offset] = CMD_END;
-        // TODO: blend stack allocation
+        if max_blend_depth > BLEND_STACK_SPLIT {
+            let scratch_size = max_blend_depth * TILE_WIDTH * TILE_HEIGHT;
+            ptcl[blend_offset] = atomicAdd(&bump.blend, scratch_size);
+        }
     }
 }
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index 46f180e..4cb5a8e 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -27,7 +27,6 @@ var<storage> segments: array<Segment>;
 #import ptcl
 
 let GRADIENT_WIDTH = 512;
-let BLEND_STACK_SPLIT = 4u;
 
 @group(0) @binding(3)
 var output: texture_storage_2d<rgba8unorm, write>;
@@ -192,7 +191,8 @@ fn main(
     var clip_depth = 0u;
     var area: array<f32, PIXELS_PER_THREAD>;
     var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
-
+    let blend_offset = ptcl[cmd_ix];
+    cmd_ix += 1u;
     // main interpretation loop
     while true {
         let tag = ptcl[cmd_ix];
diff --git a/shader/path_coarse_full.wgsl b/shader/path_coarse_full.wgsl
index d6e5d91..ef1bed5 100644
--- a/shader/path_coarse_full.wgsl
+++ b/shader/path_coarse_full.wgsl
@@ -93,7 +93,12 @@ fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32
 }
 
 fn alloc_segment() -> u32 {
-    return atomicAdd(&bump.segments, 1u) + 1u;
+    var offset = atomicAdd(&bump.segments, 1u) + 1u;
+    if offset + 1u > config.segments_size {
+        offset = 0u;
+        atomicOr(&bump.failed, STAGE_PATH_COARSE);
+    }
+    return offset;
 }
 
 let MAX_QUADS = 16u;
@@ -102,6 +107,12 @@ fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
 ) {
+    // Exit early if prior stages failed, as we can't run this stage.
+    // We need to check only prior stages, as if this stage has failed in another workgroup,
+    // we still want to know this workgroup's memory requirement.
+    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u {
+        return;
+    }
     let ix = global_id.x;
     let tag_word = scene[config.pathtag_base + (ix >> 2u)];
     let shift = (ix & 3u) * 8u;
diff --git a/shader/shared/bump.wgsl b/shader/shared/bump.wgsl
index 89c7d54..4864f31 100644
--- a/shader/shared/bump.wgsl
+++ b/shader/shared/bump.wgsl
@@ -1,9 +1,18 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
-// TODO: robust memory (failure flags)
+// Bitflags for each stage that can fail allocation.
+let STAGE_BINNING: u32 = 0x1u;
+let STAGE_TILE_ALLOC: u32 = 0x2u;
+let STAGE_PATH_COARSE: u32 = 0x4u;
+let STAGE_COARSE: u32 = 0x8u;
+
+// This must be kept in sync with the struct in src/render.rs
 struct BumpAllocators {
+    // Bitmask of stages that have failed allocation.
+    failed: atomic<u32>,
     binning: atomic<u32>,
     ptcl: atomic<u32>,
     tile: atomic<u32>,
     segments: atomic<u32>,
+    blend: atomic<u32>,
 }
diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl
index 0cb56d8..f586f47 100644
--- a/shader/shared/config.wgsl
+++ b/shader/shared/config.wgsl
@@ -24,6 +24,12 @@ struct Config {
     transform_base: u32,
     linewidth_base: u32,
+
+    // Sizes of bump allocated buffers (in element size units)
+    binning_size: u32,
+    tiles_size: u32,
+    segments_size: u32,
+    ptcl_size: u32,
 }
 
 // Geometry of tiles and bins
@@ -35,3 +41,5 @@ let N_TILE_X = 16u;
 let N_TILE_Y = 16u;
 //let N_TILE = N_TILE_X * N_TILE_Y;
 let N_TILE = 256u;
+
+let BLEND_STACK_SPLIT = 4u;
diff --git a/shader/tile_alloc.wgsl b/shader/tile_alloc.wgsl
index 7bb0e72..b28166e 100644
--- a/shader/tile_alloc.wgsl
+++ b/shader/tile_alloc.wgsl
@@ -35,6 +35,12 @@ fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
     @builtin(local_invocation_id) local_id: vec3<u32>,
 ) {
+    // Exit early if prior stages failed, as we can't run this stage.
+    // We need to check only prior stages, as if this stage has failed in another workgroup,
+    // we still want to know this workgroup's memory requirement.
+    if (atomicLoad(&bump.failed) & STAGE_BINNING) != 0u {
+        return;
+    }
     // scale factors useful for converting coordinates to tiles
     // TODO: make into constants
     let SX = 1.0 / f32(TILE_WIDTH);
@@ -72,8 +78,14 @@ fn main(
         sh_tile_count[local_id.x] = total_tile_count;
     }
     if local_id.x == WG_SIZE - 1u {
-        paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
-    }
+        let count = sh_tile_count[WG_SIZE - 1u];
+        var offset = atomicAdd(&bump.tile, count);
+        if offset + count > config.tiles_size {
+            offset = 0u;
+            atomicOr(&bump.failed, STAGE_TILE_ALLOC);
+        }
+        paths[drawobj_ix].tiles = offset;
+    }
     // Using storage barriers is a workaround for what appears to be a miscompilation
     // when a normal workgroup-shared variable is used to broadcast the value.
     storageBarrier();
diff --git a/src/encoding/packed.rs b/src/encoding/packed.rs
index e96ec3a..55ffc17 100644
--- a/src/encoding/packed.rs
+++ b/src/encoding/packed.rs
@@ -62,6 +62,14 @@ pub struct Config {
     pub target_height: u32,
     /// Layout of packed scene data.
     pub layout: Layout,
+    /// Size of binning buffer allocation (in u32s).
+    pub binning_size: u32,
+    /// Size of tile buffer allocation (in Tiles).
+    pub tiles_size: u32,
+    /// Size of segment buffer allocation (in PathSegments).
+    pub segments_size: u32,
+    /// Size of per-tile command list buffer allocation (in u32s).
+    pub ptcl_size: u32,
 }
 
 /// Packed encoding of scene data.
diff --git a/src/render.rs b/src/render.rs
index 352b6e7..9136931 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -21,8 +21,10 @@ const CLIP_INP_SIZE: u64 = 8;
 const CLIP_BBOX_SIZE: u64 = 16;
 const PATH_SIZE: u64 = 32;
 const DRAW_BBOX_SIZE: u64 = 16;
-const BUMP_SIZE: u64 = 16;
+const BUMP_SIZE: u64 = std::mem::size_of::<BumpAllocators>() as u64;
 const BIN_HEADER_SIZE: u64 = 8;
+const TILE_SIZE: u64 = 8;
+const SEGMENT_SIZE: u64 = 24;
 
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
@@ -54,6 +56,20 @@ pub const fn next_multiple_of(val: u32, rhs: u32) -> u32 {
     }
 }
 
+// This must be kept in sync with the struct in shader/shared/bump.wgsl
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+struct BumpAllocators {
+    failed: u32,
+    // Final needed dynamic size of the buffers. If any of these are larger
+    // than the corresponding `_size` element, reallocation needs to occur.
+    binning: u32,
+    ptcl: u32,
+    tile: u32,
+    segments: u32,
+    blend: u32,
+}
+
 #[allow(unused)]
 fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
     let mut recording = Recording::default();
@@ -174,17 +190,30 @@ pub fn render_encoding_full(
     let new_width = next_multiple_of(width, 16);
     let new_height = next_multiple_of(height, 16);
 
+    let info_size = packed.layout.bin_data_start;
     let config = crate::encoding::Config {
         width_in_tiles: new_width / 16,
         height_in_tiles: new_height / 16,
         target_width: width,
         target_height: height,
+        binning_size: ((1 << 20) / 4) - info_size,
+        tiles_size: (1 << 24) / TILE_SIZE as u32,
+        segments_size: (1 << 26) / SEGMENT_SIZE as u32,
+        ptcl_size: (1 << 25) / 4,
         layout: packed.layout,
     };
     // println!("{:?}", config);
     let scene_buf = ResourceProxy::Buf(recording.upload("scene", packed.data));
     let config_buf =
         ResourceProxy::Buf(recording.upload_uniform("config", bytemuck::bytes_of(&config)));
+    let info_bin_data_buf = ResourceProxy::new_buf(
+        (info_size + config.binning_size) as u64 * 4,
+        "info_bin_data_buf",
+    );
+    let tile_buf = ResourceProxy::new_buf(config.tiles_size as u64 * TILE_SIZE, "tile_buf");
+    let segments_buf =
+        ResourceProxy::new_buf(config.segments_size as u64 * SEGMENT_SIZE, "segments_buf");
+    let ptcl_buf = ResourceProxy::new_buf(config.ptcl_size as u64 * 4, "ptcl_buf");
 
     let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
     let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize;
@@ -267,7 +296,6 @@ pub fn render_encoding_full(
     );
     let draw_monoid_buf =
         ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE, "draw_monoid_buf");
-    let info_bin_data_buf = ResourceProxy::new_buf(1 << 20, "info_bin_data_buf");
     let clip_inp_buf =
         ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_INP_SIZE, "clip_inp_buf");
     recording.dispatch(
@@ -347,7 +375,6 @@ pub fn render_encoding_full(
     // in storage rather than workgroup memory.
     let n_path_aligned = align_up(n_paths as usize, 256);
     let path_buf = ResourceProxy::new_buf(n_path_aligned as u64 * PATH_SIZE, "path_buf");
-    let tile_buf = ResourceProxy::new_buf(1 << 24, "tile_buf");
    let path_wgs = (n_paths + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
     recording.dispatch(
         shaders.tile_alloc,
@@ -361,8 +388,6 @@ pub fn render_encoding_full(
             tile_buf,
         ],
     );
-
-    let segments_buf = ResourceProxy::new_buf(1 << 26, "segments_buf");
     recording.dispatch(
         shaders.path_coarse,
         (path_coarse_wgs, 1, 1),
@@ -382,7 +407,6 @@ pub fn render_encoding_full(
         (path_wgs, 1, 1),
         [config_buf, path_buf, tile_buf],
    );
-    let ptcl_buf = ResourceProxy::new_buf(1 << 25, "ptcl_buf");
     recording.dispatch(
         shaders.coarse,
         (width_in_bins, height_in_bins, 1),
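
Note on host-side handling (a sketch, not part of this patch): after a readback of the bump buffer, the `failed` bitmask plus the final counter values are enough to decide whether the frame must be re-recorded with larger buffers. The counters keep accumulating even after a stage sets its failure flag (which is why each stage only early-outs on *prior* stage failures), so they report the size the dispatched workgroups actually wanted. If an early stage failed, downstream stages never ran and their counters understate the true requirement, so more than one retry may be needed. A minimal sketch of that decision, assuming the readback exists; `BufferSizes` and `grow_if_failed` are illustrative names, not APIs in this diff:

    /// Mirrors the struct in shader/shared/bump.wgsl and src/render.rs.
    #[repr(C)]
    #[derive(Clone, Copy, Debug, Default)]
    struct BumpAllocators {
        failed: u32,
        binning: u32,
        ptcl: u32,
        tile: u32,
        segments: u32,
        blend: u32,
    }

    /// Current capacities, in the same element units as the `_size` fields
    /// this diff adds to Config.
    #[derive(Clone, Copy, Debug)]
    struct BufferSizes {
        binning: u32,
        tiles: u32,
        segments: u32,
        ptcl: u32,
    }

    /// Returns grown capacities if any stage failed, or None if the frame fit.
    fn grow_if_failed(bump: &BumpAllocators, cur: BufferSizes) -> Option<BufferSizes> {
        if bump.failed == 0 {
            return None;
        }
        // Grow to the reported requirement, rounded up for headroom so the
        // retry does not immediately fail again.
        let grow = |needed: u32, have: u32| needed.max(have).next_power_of_two();
        Some(BufferSizes {
            binning: grow(bump.binning, cur.binning),
            tiles: grow(bump.tile, cur.tiles),
            segments: grow(bump.segments, cur.segments),
            ptcl: grow(bump.ptcl, cur.ptcl),
        })
    }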
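
One asymmetry worth noting: the new `blend` counter gets no `_size` bound and no stage flag here, and fine.wgsl simply consumes whatever coarse.wgsl allocated, so blend scratch is not yet robust in this patch. The sizing rule itself is simple; a worked example of the per-tile arithmetic (sketch only; the element width per pixel per blend level is not stated in this diff):

    const TILE_WIDTH: u32 = 16;
    const TILE_HEIGHT: u32 = 16;
    const BLEND_STACK_SPLIT: u32 = 4;

    /// Blend scratch that coarse.wgsl reserves when it writes CMD_END.
    /// Tiles whose clip nesting stays within BLEND_STACK_SPLIT use the fine
    /// shader's small in-shader stack and reserve nothing.
    fn blend_scratch_size(max_blend_depth: u32) -> u32 {
        if max_blend_depth > BLEND_STACK_SPLIT {
            // e.g. max_blend_depth = 6 reserves 6 * 16 * 16 = 1536 elements
            max_blend_depth * TILE_WIDTH * TILE_HEIGHT
        } else {
            0
        }
    }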