mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-09 12:21:31 +11:00
2c46228d06
When the bounding boxes of a path and its clip are disjoint (i.e. they do not intersect) the result of their intersection is a negative rectangle. When calculating the intersection of the bboxes, the binning stage ensures that the bbox is non-negative. It then normalizes the coordinates to bin dimensions and rounds the top-left corner down to the neareast integer and the bottom-right corner up. However this rounding causes zero-area bounding boxes to have a non-zero area and sends the bottom-right corner to the placed in the next bin. This causes fully clipped out draw objects to be included in binning, with an incorrect clip bounding box that causes them to be erroneously drawn with partial clipping. `binning` now takes care around this logic to make sure that a zero-area intersected-bbox gets skipped and clipped out. `tile_alloc`, also takes care in its logic. Fixes #286 and #333
128 lines
4.3 KiB
WebGPU Shading Language
128 lines
4.3 KiB
WebGPU Shading Language
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
// Tile allocation (and zeroing of tiles)
|
|
|
|
#import config
|
|
#import bump
|
|
#import drawtag
|
|
#import tile
|
|
|
|
@group(0) @binding(0)
|
|
var<uniform> config: Config;
|
|
|
|
@group(0) @binding(1)
|
|
var<storage> scene: array<u32>;
|
|
|
|
@group(0) @binding(2)
|
|
var<storage> draw_bboxes: array<vec4<f32>>;
|
|
|
|
@group(0) @binding(3)
|
|
var<storage, read_write> bump: BumpAllocators;
|
|
|
|
@group(0) @binding(4)
|
|
var<storage, read_write> paths: array<Path>;
|
|
|
|
@group(0) @binding(5)
|
|
var<storage, read_write> tiles: array<Tile>;
|
|
|
|
let WG_SIZE = 256u;
|
|
|
|
var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_tile_offset: u32;
|
|
#ifdef have_uniform
|
|
var<workgroup> sh_atomic_failed: u32;
|
|
#endif
|
|
|
|
@compute @workgroup_size(256)
|
|
fn main(
|
|
@builtin(global_invocation_id) global_id: vec3<u32>,
|
|
@builtin(local_invocation_id) local_id: vec3<u32>,
|
|
) {
|
|
// Exit early if prior stages failed, as we can't run this stage.
|
|
// We need to check only prior stages, as if this stage has failed in another workgroup,
|
|
// we still want to know this workgroup's memory requirement.
|
|
#ifdef have_uniform
|
|
if local_id.x == 0u {
|
|
sh_atomic_failed = atomicLoad(&bump.failed);
|
|
}
|
|
let failed = workgroupUniformLoad(&sh_atomic_failed);
|
|
#else
|
|
let failed = atomicLoad(&bump.failed);
|
|
#endif
|
|
if (failed & STAGE_BINNING) != 0u {
|
|
return;
|
|
}
|
|
// scale factors useful for converting coordinates to tiles
|
|
// TODO: make into constants
|
|
let SX = 1.0 / f32(TILE_WIDTH);
|
|
let SY = 1.0 / f32(TILE_HEIGHT);
|
|
|
|
let drawobj_ix = global_id.x;
|
|
var drawtag = DRAWTAG_NOP;
|
|
if drawobj_ix < config.n_drawobj {
|
|
drawtag = scene[config.drawtag_base + drawobj_ix];
|
|
}
|
|
var x0 = 0;
|
|
var y0 = 0;
|
|
var x1 = 0;
|
|
var y1 = 0;
|
|
if drawtag != DRAWTAG_NOP && drawtag != DRAWTAG_END_CLIP {
|
|
let bbox = draw_bboxes[drawobj_ix];
|
|
|
|
// Don't round up the bottom-right corner of the bbox if the area is zero and leave the
|
|
// coordinates at 0. This will make `tile_count` zero as the shape is clipped out.
|
|
if bbox.x < bbox.z && bbox.y < bbox.w {
|
|
x0 = i32(floor(bbox.x * SX));
|
|
y0 = i32(floor(bbox.y * SY));
|
|
x1 = i32(ceil(bbox.z * SX));
|
|
y1 = i32(ceil(bbox.w * SY));
|
|
}
|
|
}
|
|
let ux0 = u32(clamp(x0, 0, i32(config.width_in_tiles)));
|
|
let uy0 = u32(clamp(y0, 0, i32(config.height_in_tiles)));
|
|
let ux1 = u32(clamp(x1, 0, i32(config.width_in_tiles)));
|
|
let uy1 = u32(clamp(y1, 0, i32(config.height_in_tiles)));
|
|
let tile_count = (ux1 - ux0) * (uy1 - uy0);
|
|
var total_tile_count = tile_count;
|
|
sh_tile_count[local_id.x] = tile_count;
|
|
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
|
workgroupBarrier();
|
|
if local_id.x >= (1u << i) {
|
|
total_tile_count += sh_tile_count[local_id.x - (1u << i)];
|
|
}
|
|
workgroupBarrier();
|
|
sh_tile_count[local_id.x] = total_tile_count;
|
|
}
|
|
if local_id.x == WG_SIZE - 1u {
|
|
let count = sh_tile_count[WG_SIZE - 1u];
|
|
var offset = atomicAdd(&bump.tile, count);
|
|
if offset + count > config.tiles_size {
|
|
offset = 0u;
|
|
atomicOr(&bump.failed, STAGE_TILE_ALLOC);
|
|
}
|
|
paths[drawobj_ix].tiles = offset;
|
|
}
|
|
// Using storage barriers is a workaround for what appears to be a miscompilation
|
|
// when a normal workgroup-shared variable is used to broadcast the value.
|
|
storageBarrier();
|
|
let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
|
|
storageBarrier();
|
|
if drawobj_ix < config.n_drawobj {
|
|
let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
|
|
let bbox = vec4(ux0, uy0, ux1, uy1);
|
|
let path = Path(bbox, tile_offset + tile_subix);
|
|
paths[drawobj_ix] = path;
|
|
}
|
|
|
|
// zero allocated memory
|
|
// Note: if the number of draw objects is small, utilization will be poor.
|
|
// There are two things that can be done to improve that. One would be a
|
|
// separate (indirect) dispatch. Another would be to have each workgroup
|
|
// process fewer draw objects than the number of threads in the wg.
|
|
let total_count = sh_tile_count[WG_SIZE - 1u];
|
|
for (var i = local_id.x; i < total_count; i += WG_SIZE) {
|
|
// Note: could format output buffer as u32 for even better load balancing.
|
|
tiles[tile_offset + i] = Tile(0, 0u);
|
|
}
|
|
}
|