vello/shader/tile_alloc.wgsl
Arman Uguray 2c46228d06 [binning] Correctly handle disjoint bounding-box intersections
When the bounding boxes of a path and its clip are disjoint (i.e. they
do not intersect) the result of their intersection is a negative
rectangle.

When calculating the intersection of the bboxes, the binning stage
ensures that the bbox is non-negative. It then normalizes the
coordinates to bin dimensions and rounds the top-left corner down
to the neareast integer and the bottom-right corner up.

However this rounding causes zero-area bounding boxes to have a non-zero
area and sends the bottom-right corner to the placed in the next bin.
This causes fully clipped out draw objects to be included in binning,
with an incorrect clip bounding box that causes them to be erroneously
drawn with partial clipping.

`binning` now takes care around this logic to make sure that a zero-area
intersected-bbox gets skipped and clipped out. `tile_alloc`, also takes
care in its logic.

Fixes #286 and #333
2023-06-28 20:42:52 -07:00

128 lines
4.3 KiB
WebGPU Shading Language

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Tile allocation (and zeroing of tiles)
#import config
#import bump
#import drawtag
#import tile
@group(0) @binding(0)
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> draw_bboxes: array<vec4<f32>>;
@group(0) @binding(3)
var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(4)
var<storage, read_write> paths: array<Path>;
@group(0) @binding(5)
var<storage, read_write> tiles: array<Tile>;
let WG_SIZE = 256u;
var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
var<workgroup> sh_tile_offset: u32;
#ifdef have_uniform
var<workgroup> sh_atomic_failed: u32;
#endif
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
) {
// Exit early if prior stages failed, as we can't run this stage.
// We need to check only prior stages, as if this stage has failed in another workgroup,
// we still want to know this workgroup's memory requirement.
#ifdef have_uniform
if local_id.x == 0u {
sh_atomic_failed = atomicLoad(&bump.failed);
}
let failed = workgroupUniformLoad(&sh_atomic_failed);
#else
let failed = atomicLoad(&bump.failed);
#endif
if (failed & STAGE_BINNING) != 0u {
return;
}
// scale factors useful for converting coordinates to tiles
// TODO: make into constants
let SX = 1.0 / f32(TILE_WIDTH);
let SY = 1.0 / f32(TILE_HEIGHT);
let drawobj_ix = global_id.x;
var drawtag = DRAWTAG_NOP;
if drawobj_ix < config.n_drawobj {
drawtag = scene[config.drawtag_base + drawobj_ix];
}
var x0 = 0;
var y0 = 0;
var x1 = 0;
var y1 = 0;
if drawtag != DRAWTAG_NOP && drawtag != DRAWTAG_END_CLIP {
let bbox = draw_bboxes[drawobj_ix];
// Don't round up the bottom-right corner of the bbox if the area is zero and leave the
// coordinates at 0. This will make `tile_count` zero as the shape is clipped out.
if bbox.x < bbox.z && bbox.y < bbox.w {
x0 = i32(floor(bbox.x * SX));
y0 = i32(floor(bbox.y * SY));
x1 = i32(ceil(bbox.z * SX));
y1 = i32(ceil(bbox.w * SY));
}
}
let ux0 = u32(clamp(x0, 0, i32(config.width_in_tiles)));
let uy0 = u32(clamp(y0, 0, i32(config.height_in_tiles)));
let ux1 = u32(clamp(x1, 0, i32(config.width_in_tiles)));
let uy1 = u32(clamp(y1, 0, i32(config.height_in_tiles)));
let tile_count = (ux1 - ux0) * (uy1 - uy0);
var total_tile_count = tile_count;
sh_tile_count[local_id.x] = tile_count;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x >= (1u << i) {
total_tile_count += sh_tile_count[local_id.x - (1u << i)];
}
workgroupBarrier();
sh_tile_count[local_id.x] = total_tile_count;
}
if local_id.x == WG_SIZE - 1u {
let count = sh_tile_count[WG_SIZE - 1u];
var offset = atomicAdd(&bump.tile, count);
if offset + count > config.tiles_size {
offset = 0u;
atomicOr(&bump.failed, STAGE_TILE_ALLOC);
}
paths[drawobj_ix].tiles = offset;
}
// Using storage barriers is a workaround for what appears to be a miscompilation
// when a normal workgroup-shared variable is used to broadcast the value.
storageBarrier();
let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
storageBarrier();
if drawobj_ix < config.n_drawobj {
let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
let bbox = vec4(ux0, uy0, ux1, uy1);
let path = Path(bbox, tile_offset + tile_subix);
paths[drawobj_ix] = path;
}
// zero allocated memory
// Note: if the number of draw objects is small, utilization will be poor.
// There are two things that can be done to improve that. One would be a
// separate (indirect) dispatch. Another would be to have each workgroup
// process fewer draw objects than the number of threads in the wg.
let total_count = sh_tile_count[WG_SIZE - 1u];
for (var i = local_id.x; i < total_count; i += WG_SIZE) {
// Note: could format output buffer as u32 for even better load balancing.
tiles[tile_offset + i] = Tile(0, 0u);
}
}