Merge pull request #257 from linebender/robust-redux

Robust memory redux (GPU side only)
Commit b83642bf0c by Chad Brokaw, 2023-01-19 12:00:07 -05:00 (committed via GitHub)
9 changed files with 108 additions and 17 deletions
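The changes below apply a single pattern to every bump allocator in the pipeline: each `atomicAdd` that hands out buffer space is checked against a buffer size carried in the config; on overflow the offset is clamped to `0u` so writes stay in bounds, and a per-stage bit is set in a new atomic `failed` bitmask that later stages (and, eventually, the CPU) can inspect. A minimal Rust analogue of that checked allocation, purely illustrative (the real logic is in the WGSL hunks below):

```rust
use std::sync::atomic::{AtomicU32, Ordering};

// Mirrors the stage flag defined in shader/shared/bump.wgsl.
const STAGE_BINNING: u32 = 0x1;

struct Bump {
    failed: AtomicU32,
    binning: AtomicU32,
}

/// Checked bump allocation: the returned offset is always safe to write
/// through. On overflow it is clamped to 0 and the stage's failure bit is
/// set, but the counter keeps accumulating the true memory requirement.
fn alloc_binning(bump: &Bump, size: u32, capacity: u32) -> u32 {
    let offset = bump.binning.fetch_add(size, Ordering::Relaxed);
    if offset + size > capacity {
        bump.failed.fetch_or(STAGE_BINNING, Ordering::Relaxed);
        return 0;
    }
    offset
}

fn main() {
    let bump = Bump { failed: AtomicU32::new(0), binning: AtomicU32::new(0) };
    assert_eq!(alloc_binning(&bump, 6, 8), 0); // fits at offset 0
    assert_eq!(alloc_binning(&bump, 6, 8), 0); // overflows: clamped to 0
    assert_eq!(bump.failed.load(Ordering::Relaxed), STAGE_BINNING);
    assert_eq!(bump.binning.load(Ordering::Relaxed), 12); // true requirement
}
```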


@@ -127,7 +127,11 @@ fn main(
         sh_count[i][local_id.x] = element_count_packed;
     }
     // element_count is the number of draw objects covering this thread's bin
-    let chunk_offset = atomicAdd(&bump.binning, element_count);
+    var chunk_offset = atomicAdd(&bump.binning, element_count);
+    if chunk_offset + element_count > config.binning_size {
+        chunk_offset = 0u;
+        atomicOr(&bump.failed, STAGE_BINNING);
+    }
     sh_chunk_offset[local_id.x] = chunk_offset;
     bin_header[global_id.x].element_count = element_count;
     bin_header[global_id.x].chunk_offset = chunk_offset;


@@ -70,8 +70,11 @@ fn alloc_cmd(size: u32) {
         // We might be able to save a little bit of computation here
         // by setting the initial value of the bump allocator.
         let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC;
-        let new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
-        // TODO: robust memory
+        var new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
+        if new_cmd + PTCL_INCREMENT > config.ptcl_size {
+            new_cmd = 0u;
+            atomicOr(&bump.failed, STAGE_COARSE);
+        }
         ptcl[cmd_offset] = CMD_JUMP;
         ptcl[cmd_offset + 1u] = new_cmd;
         cmd_offset = new_cmd;
@@ -134,7 +137,7 @@ fn write_end_clip(end_clip: CmdEndClip) {
     ptcl[cmd_offset] = CMD_END_CLIP;
     ptcl[cmd_offset + 1u] = end_clip.blend;
     ptcl[cmd_offset + 2u] = bitcast<u32>(end_clip.alpha);
-    cmd_offset += 3u;
+    cmd_offset += 3u;
 }

 @compute @workgroup_size(256)
@@ -142,6 +145,12 @@ fn main(
     @builtin(local_invocation_id) local_id: vec3<u32>,
     @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
+    // Exit early if prior stages failed, as we can't run this stage.
+    // We need to check only prior stages, as if this stage has failed in another workgroup,
+    // we still want to know this workgroup's memory requirement.
+    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) != 0u {
+        return;
+    }
     let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
     let bin_ix = width_in_bins * wg_id.y + wg_id.x;
     let n_partitions = (config.n_drawobj + N_TILE - 1u) / N_TILE;
@@ -170,6 +179,9 @@ fn main(
     var render_blend_depth = 0u;
     var max_blend_depth = 0u;
+    let blend_offset = cmd_offset;
+    cmd_offset += 1u;
     while true {
         for (var i = 0u; i < N_SLICE; i += 1u) {
             atomicStore(&sh_bitmaps[i][local_id.x], 0u);
@@ -401,6 +413,9 @@ fn main(
         }
         if bin_tile_x + tile_x < config.width_in_tiles && bin_tile_y + tile_y < config.height_in_tiles {
             ptcl[cmd_offset] = CMD_END;
-            // TODO: blend stack allocation
+            if max_blend_depth > BLEND_STACK_SPLIT {
+                let scratch_size = max_blend_depth * TILE_WIDTH * TILE_HEIGHT;
+                ptcl[blend_offset] = atomicAdd(&bump.blend, scratch_size);
+            }
         }
     }
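Besides the overflow check in `alloc_cmd`, this stage now reserves the first slot of each tile's command list (`blend_offset`) and fills it at the end: when the tile's `max_blend_depth` exceeds `BLEND_STACK_SPLIT`, blend scratch for the whole tile is bump-allocated and its offset stored in that slot, which the fine shader (below) reads back before interpreting commands. Note that `bump.blend`, unlike the other allocators, is not yet bounds-checked. A sketch of the scratch accounting, assuming the 16x16 tile size used by these shaders:

```rust
// Assumed constants: TILE_WIDTH/TILE_HEIGHT are 16 in the shader constants,
// BLEND_STACK_SPLIT comes from shared/config.wgsl in this diff.
const TILE_WIDTH: u32 = 16;
const TILE_HEIGHT: u32 = 16;
const BLEND_STACK_SPLIT: u32 = 4;

/// Blend scratch a tile needs: zero while the blend stack fits within the
/// split threshold, otherwise one entry per pixel per blend level.
fn blend_scratch_size(max_blend_depth: u32) -> u32 {
    if max_blend_depth > BLEND_STACK_SPLIT {
        max_blend_depth * TILE_WIDTH * TILE_HEIGHT
    } else {
        0
    }
}

fn main() {
    assert_eq!(blend_scratch_size(4), 0); // fits in the split stack
    assert_eq!(blend_scratch_size(6), 6 * 256); // 256 pixels per tile
}
```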


@@ -27,7 +27,6 @@ var<storage> segments: array<Segment>;
 #import ptcl

 let GRADIENT_WIDTH = 512;
-let BLEND_STACK_SPLIT = 4u;

 @group(0) @binding(3)
 var output: texture_storage_2d<rgba8unorm, write>;
@@ -192,7 +191,8 @@ fn main(
     var clip_depth = 0u;
     var area: array<f32, PIXELS_PER_THREAD>;
     var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
+    let blend_offset = ptcl[cmd_ix];
+    cmd_ix += 1u;
     // main interpretation loop
     while true {
         let tag = ptcl[cmd_ix];


@@ -93,7 +93,12 @@ fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32
 }

 fn alloc_segment() -> u32 {
-    return atomicAdd(&bump.segments, 1u) + 1u;
+    var offset = atomicAdd(&bump.segments, 1u) + 1u;
+    if offset + 1u > config.segments_size {
+        offset = 0u;
+        atomicOr(&bump.failed, STAGE_PATH_COARSE);
+    }
+    return offset;
 }

 let MAX_QUADS = 16u;
@@ -102,6 +107,12 @@ let MAX_QUADS = 16u;
 fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
 ) {
+    // Exit early if prior stages failed, as we can't run this stage.
+    // We need to check only prior stages, as if this stage has failed in another workgroup,
+    // we still want to know this workgroup's memory requirement.
+    if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u {
+        return;
+    }
     let ix = global_id.x;
     let tag_word = scene[config.pathtag_base + (ix >> 2u)];
     let shift = (ix & 3u) * 8u;
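One detail worth noting in `alloc_segment`: the returned index is the post-increment counter plus one, so index 0 is never handed out. A plausible reading (an inference from this diff, not stated in it) is that 0 serves as a nil sentinel, e.g. terminating per-tile segment lists:

```rust
/// Hypothetical layout; the diff does not show Segment's fields, only
/// that alloc_segment never returns index 0.
struct Segment {
    next: u32, // index of the next segment in the list; 0 terminates
}

const NIL: u32 = 0;

/// Walk a segment list that uses 0 as its terminator.
fn list_len(segments: &[Segment], mut ix: u32) -> u32 {
    let mut n = 0;
    while ix != NIL {
        n += 1;
        ix = segments[ix as usize].next;
    }
    n
}

fn main() {
    let segments = vec![
        Segment { next: 0 }, // slot 0: reserved, never allocated
        Segment { next: 2 },
        Segment { next: 0 },
    ];
    assert_eq!(list_len(&segments, 1), 2);
}
```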


@@ -1,9 +1,18 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

-// TODO: robust memory (failure flags)
+// Bitflags for each stage that can fail allocation.
+let STAGE_BINNING: u32 = 0x1u;
+let STAGE_TILE_ALLOC: u32 = 0x2u;
+let STAGE_PATH_COARSE: u32 = 0x4u;
+let STAGE_COARSE: u32 = 0x8u;
+
+// This must be kept in sync with the struct in src/render.rs
 struct BumpAllocators {
+    // Bitmask of stages that have failed allocation.
+    failed: atomic<u32>,
     binning: atomic<u32>,
     ptcl: atomic<u32>,
     tile: atomic<u32>,
     segments: atomic<u32>,
     blend: atomic<u32>,
 }
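The flag values follow pipeline order (binning, then tile_alloc, then path_coarse, then coarse), so the "prior stages" mask each shader spells out explicitly is equivalently all bits below the stage's own flag. A quick check of that relationship (`prior_stages` is an illustration; the shaders hardcode the masks):

```rust
const STAGE_BINNING: u32 = 0x1;
const STAGE_TILE_ALLOC: u32 = 0x2;
const STAGE_PATH_COARSE: u32 = 0x4;
const STAGE_COARSE: u32 = 0x8;

/// All bits below a stage's flag, i.e. every stage that runs before it.
fn prior_stages(stage: u32) -> u32 {
    stage - 1
}

fn main() {
    assert_eq!(prior_stages(STAGE_TILE_ALLOC), STAGE_BINNING);
    assert_eq!(
        prior_stages(STAGE_PATH_COARSE),
        STAGE_BINNING | STAGE_TILE_ALLOC
    );
    assert_eq!(
        prior_stages(STAGE_COARSE),
        STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE
    );
}
```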


@@ -24,6 +24,12 @@ struct Config {
     transform_base: u32,
     linewidth_base: u32,
+
+    // Sizes of bump allocated buffers (in element size units)
+    binning_size: u32,
+    tiles_size: u32,
+    segments_size: u32,
+    ptcl_size: u32,
 }

 // Geometry of tiles and bins
@@ -35,3 +41,5 @@ let N_TILE_X = 16u;
 let N_TILE_Y = 16u;
 //let N_TILE = N_TILE_X * N_TILE_Y;
 let N_TILE = 256u;
+
+let BLEND_STACK_SPLIT = 4u;


@@ -35,6 +35,12 @@ fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
     @builtin(local_invocation_id) local_id: vec3<u32>,
 ) {
+    // Exit early if prior stages failed, as we can't run this stage.
+    // We need to check only prior stages, as if this stage has failed in another workgroup,
+    // we still want to know this workgroup's memory requirement.
+    if (atomicLoad(&bump.failed) & STAGE_BINNING) != 0u {
+        return;
+    }
     // scale factors useful for converting coordinates to tiles
     // TODO: make into constants
     let SX = 1.0 / f32(TILE_WIDTH);
@@ -72,8 +78,14 @@ fn main(
         sh_tile_count[local_id.x] = total_tile_count;
     }
     if local_id.x == WG_SIZE - 1u {
-        paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
-    }
+        let count = sh_tile_count[WG_SIZE - 1u];
+        var offset = atomicAdd(&bump.tile, count);
+        if offset + count > config.tiles_size {
+            offset = 0u;
+            atomicOr(&bump.failed, STAGE_TILE_ALLOC);
+        }
+        paths[drawobj_ix].tiles = offset;
+    }
     // Using storage barriers is a workaround for what appears to be a miscompilation
     // when a normal workgroup-shared variable is used to broadcast the value.
     storageBarrier();


@@ -62,6 +62,14 @@ pub struct Config {
     pub target_height: u32,
     /// Layout of packed scene data.
     pub layout: Layout,
+    /// Size of binning buffer allocation (in u32s).
+    pub binning_size: u32,
+    /// Size of tile buffer allocation (in Tiles).
+    pub tiles_size: u32,
+    /// Size of segment buffer allocation (in PathSegments).
+    pub segments_size: u32,
+    /// Size of per-tile command list buffer allocation (in u32s).
+    pub ptcl_size: u32,
 }

 /// Packed encoding of scene data.
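These sizes are element counts rather than bytes, so each consumer multiplies by its element size when creating the buffer (as `render_encoding_full` does below, with element sizes of 4 bytes for u32s, `TILE_SIZE`, and `SEGMENT_SIZE`). A quick check of the arithmetic for the fixed budgets this commit chooses:

```rust
const TILE_SIZE: u64 = 8; // bytes per Tile, from src/render.rs below
const SEGMENT_SIZE: u64 = 24; // bytes per PathSegment, from src/render.rs below

fn main() {
    // ptcl_size is in u32s: a 1 << 25 byte budget holds 1 << 23 u32s.
    let ptcl_size: u32 = (1 << 25) / 4;
    assert_eq!(ptcl_size as u64 * 4, 1 << 25);

    // tiles_size is in Tiles: a 1 << 24 byte budget holds 1 << 21 Tiles.
    let tiles_size: u32 = (1 << 24) / TILE_SIZE as u32;
    assert_eq!(tiles_size as u64 * TILE_SIZE, 1 << 24);

    // segments_size is in PathSegments; 1 << 26 is not a multiple of 24, so
    // the buffer rounds down slightly from the old hardcoded 1 << 26 bytes.
    let segments_size: u32 = (1 << 26) / SEGMENT_SIZE as u32;
    assert!(segments_size as u64 * SEGMENT_SIZE <= 1 << 26);
}
```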


@@ -21,8 +21,10 @@ const CLIP_INP_SIZE: u64 = 8;
 const CLIP_BBOX_SIZE: u64 = 16;
 const PATH_SIZE: u64 = 32;
 const DRAW_BBOX_SIZE: u64 = 16;
-const BUMP_SIZE: u64 = 16;
+const BUMP_SIZE: u64 = std::mem::size_of::<BumpAllocators>() as u64;
 const BIN_HEADER_SIZE: u64 = 8;
+const TILE_SIZE: u64 = 8;
+const SEGMENT_SIZE: u64 = 24;

 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
@@ -54,6 +56,20 @@ pub const fn next_multiple_of(val: u32, rhs: u32) -> u32 {
     }
 }

+// This must be kept in sync with the struct in shader/shared/bump.wgsl
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+struct BumpAllocators {
+    failed: u32,
+    // Final needed dynamic size of the buffers. If any of these are larger
+    // than the corresponding `_size` element, reallocation needs to occur.
+    binning: u32,
+    ptcl: u32,
+    tile: u32,
+    segments: u32,
+    blend: u32,
+}
+
 #[allow(unused)]
 fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
     let mut recording = Recording::default();
@@ -174,17 +190,30 @@ pub fn render_encoding_full(
     let new_width = next_multiple_of(width, 16);
     let new_height = next_multiple_of(height, 16);
+    let info_size = packed.layout.bin_data_start;
     let config = crate::encoding::Config {
         width_in_tiles: new_width / 16,
         height_in_tiles: new_height / 16,
         target_width: width,
         target_height: height,
+        binning_size: ((1 << 20) / 4) - info_size,
+        tiles_size: (1 << 24) / TILE_SIZE as u32,
+        segments_size: (1 << 26) / SEGMENT_SIZE as u32,
+        ptcl_size: (1 << 25) / 4,
         layout: packed.layout,
     };
+    // println!("{:?}", config);
     let scene_buf = ResourceProxy::Buf(recording.upload("scene", packed.data));
     let config_buf =
         ResourceProxy::Buf(recording.upload_uniform("config", bytemuck::bytes_of(&config)));
+    let info_bin_data_buf = ResourceProxy::new_buf(
+        (info_size + config.binning_size) as u64 * 4,
+        "info_bin_data_buf",
+    );
+    let tile_buf = ResourceProxy::new_buf(config.tiles_size as u64 * TILE_SIZE, "tile_buf");
+    let segments_buf =
+        ResourceProxy::new_buf(config.segments_size as u64 * SEGMENT_SIZE, "segments_buf");
+    let ptcl_buf = ResourceProxy::new_buf(config.ptcl_size as u64 * 4, "ptcl_buf");

     let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
     let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize;
@@ -267,7 +296,6 @@
     );
     let draw_monoid_buf =
         ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE, "draw_monoid_buf");
-    let info_bin_data_buf = ResourceProxy::new_buf(1 << 20, "info_bin_data_buf");
     let clip_inp_buf =
         ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_INP_SIZE, "clip_inp_buf");
     recording.dispatch(
@@ -347,7 +375,6 @@
     // in storage rather than workgroup memory.
     let n_path_aligned = align_up(n_paths as usize, 256);
     let path_buf = ResourceProxy::new_buf(n_path_aligned as u64 * PATH_SIZE, "path_buf");
-    let tile_buf = ResourceProxy::new_buf(1 << 24, "tile_buf");
     let path_wgs = (n_paths + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
     recording.dispatch(
         shaders.tile_alloc,
@@ -361,8 +388,6 @@
             tile_buf,
         ],
     );
-    let segments_buf = ResourceProxy::new_buf(1 << 26, "segments_buf");
     recording.dispatch(
         shaders.path_coarse,
         (path_coarse_wgs, 1, 1),
@@ -382,7 +407,6 @@
         (path_wgs, 1, 1),
         [config_buf, path_buf, tile_buf],
     );
-    let ptcl_buf = ResourceProxy::new_buf(1 << 25, "ptcl_buf");
     recording.dispatch(
         shaders.coarse,
         (width_in_bins, height_in_bins, 1),
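As the title says, this commit is GPU side only: the shaders record failure bits and final counter values, but nothing reads them back yet. The comment on the Rust `BumpAllocators` struct implies the follow-up: map the buffer after a frame, and if a stage failed or a counter exceeds its configured `_size`, grow the buffer and re-run. A hypothetical sketch of that check (the readback plumbing and these names are not part of this diff):

```rust
/// Hypothetical post-frame check; this commit defines the data, not the readback.
/// `failed` is the stage bitmask; each tuple is (final counter, configured size).
fn needs_realloc(failed: u32, used_vs_capacity: &[(u32, u32)]) -> bool {
    failed != 0 || used_vs_capacity.iter().any(|&(used, cap)| used > cap)
}

fn main() {
    // Example: binning overflowed its roughly 2^18-element budget.
    let failed = 0x1; // STAGE_BINNING
    let counters = [(300_000u32, 262_144u32)]; // (bump.binning, config.binning_size)
    assert!(needs_realloc(failed, &counters));
    // A plausible response: grow past the observed need, e.g. to a power of two.
    assert_eq!(300_000u32.next_power_of_two(), 524_288);
}
```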