vello/shader/clip_leaf.wgsl

207 lines
6.2 KiB
GLSL
Raw Normal View History

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
#import config
#import bbox
#import clip
#import drawtag
@group(0) @binding(0)
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<ClipInp>;
@group(0) @binding(2)
var<storage> path_bboxes: array<PathBbox>;
@group(0) @binding(3)
var<storage> reduced: array<Bic>;
@group(0) @binding(4)
var<storage> clip_els: array<ClipEl>;
@group(0) @binding(5)
var<storage, read_write> draw_monoids: array<DrawMonoid>;
@group(0) @binding(6)
var<storage, read_write> clip_bboxes: array<vec4<f32>>;
2022-12-09 03:53:35 +11:00
let WG_SIZE = 256u;
var<workgroup> sh_bic: array<Bic, 510 >;
var<workgroup> sh_stack: array<u32, WG_SIZE>;
var<workgroup> sh_stack_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_link: array<i32, WG_SIZE>;
fn search_link(bic: ptr<function, Bic>, ix_in: u32) -> i32 {
var ix = ix_in;
var j = 0u;
while j < firstTrailingBit(WG_SIZE) {
let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
if ((ix >> j) & 1u) != 0u {
let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
2022-11-19 09:26:26 +11:00
if test.b > 0u {
break;
}
2022-11-19 09:26:26 +11:00
*bic = test;
ix -= 1u << j;
}
j += 1u;
}
if ix > 0u {
while j > 0u {
j -= 1u;
let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
if test.b == 0u {
*bic = test;
ix -= 1u << j;
}
}
}
if ix > 0u {
return i32(ix) - 1;
} else {
2022-11-19 09:26:26 +11:00
return i32(~0u - (*bic).a);
}
}
fn load_clip_path(ix: u32) -> i32 {
if ix < config.n_clip {
return clip_inp[ix].path_ix;
} else {
2022-11-19 09:26:26 +11:00
return -2147483648;
// literal too large?
// return 0x80000000;
}
}
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
var bic: Bic;
if local_id.x < wg_id.x {
bic = reduced[local_id.x];
}
sh_bic[local_id.x] = bic;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_bic[local_id.x + (1u << i)];
bic = bic_combine(bic, other);
}
workgroupBarrier();
sh_bic[local_id.x] = bic;
}
workgroupBarrier();
let stack_size = sh_bic[0].b;
// TODO: if stack depth > WG_SIZE desired, scan here
// binary search in stack
let sp = WG_SIZE - 1u - local_id.x;
var ix = 0u;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
let probe = ix + ((WG_SIZE / 2u) >> i);
if sp < sh_bic[probe].b {
ix = probe;
}
}
let b = sh_bic[ix].b;
var bbox = vec4(-1e9, -1e9, 1e9, 1e9);
if sp < b {
let el = clip_els[ix * WG_SIZE + b - sp - 1u];
sh_stack[local_id.x] = el.parent_ix;
bbox = el.bbox;
}
// forward scan of bbox values of prefix stack
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
sh_stack_bbox[local_id.x] = bbox;
workgroupBarrier();
if local_id.x >= (1u << i) {
bbox = bbox_intersect(sh_stack_bbox[local_id.x - (1u << i)], bbox);
}
workgroupBarrier();
}
sh_stack_bbox[local_id.x] = bbox;
// Read input and compute Bic binary tree
let inp = load_clip_path(global_id.x);
let is_push = inp >= 0;
bic = Bic(1u - u32(is_push), u32(is_push));
sh_bic[local_id.x] = bic;
if is_push {
let path_bbox = path_bboxes[inp];
bbox = vec4(f32(path_bbox.x0), f32(path_bbox.y0), f32(path_bbox.x1), f32(path_bbox.y1));
} else {
bbox = vec4(-1e9, -1e9, 1e9, 1e9);
}
var inbase = 0u;
2022-11-19 09:26:26 +11:00
for (var i = 0u; i < firstTrailingBit(WG_SIZE) - 1u; i += 1u) {
let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i));
workgroupBarrier();
if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) {
let in_off = inbase + local_id.x * 2u;
sh_bic[outbase + local_id.x] = bic_combine(sh_bic[in_off], sh_bic[in_off + 1u]);
}
inbase = outbase;
}
workgroupBarrier();
// search for predecessor node
bic = Bic();
var link = search_link(&bic, local_id.x);
sh_link[local_id.x] = link;
workgroupBarrier();
let grandparent = select(link - 1, sh_link[link], link >= 0);
var parent: i32;
if link >= 0 {
parent = i32(wg_id.x * WG_SIZE) + link;
} else if link + i32(stack_size) >= 0 {
parent = i32(sh_stack[i32(WG_SIZE) + link]);
} else {
parent = -1;
}
// bbox scan (intersect) across parent links
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
if i != 0u {
sh_link[local_id.x] = link;
}
sh_bbox[local_id.x] = bbox;
workgroupBarrier();
if link >= 0 {
bbox = bbox_intersect(sh_bbox[link], bbox);
link = sh_link[link];
}
workgroupBarrier();
}
if link + i32(stack_size) >= 0 {
bbox = bbox_intersect(sh_stack_bbox[i32(WG_SIZE) + link], bbox);
}
// At this point, bbox is the intersection of bboxes on the path to the root
sh_bbox[local_id.x] = bbox;
workgroupBarrier();
if !is_push && global_id.x < config.n_clip {
// Fix up drawmonoid so path_ix of EndClip matches BeginClip
let parent_clip = clip_inp[parent];
let path_ix = parent_clip.path_ix;
let parent_ix = parent_clip.ix;
let ix = ~inp;
draw_monoids[ix].path_ix = u32(path_ix);
// Make EndClip point to the same draw data as BeginClip
draw_monoids[ix].scene_offset = draw_monoids[parent_ix].scene_offset;
if grandparent >= 0 {
bbox = sh_bbox[grandparent];
} else if grandparent + i32(stack_size) >= 0 {
bbox = sh_stack_bbox[i32(WG_SIZE) + grandparent];
} else {
bbox = vec4(-1e9, -1e9, 1e9, 1e9);
}
}
[shaders] Explicitly guard writes to clip_bboxes The very last statement of the `clip_leaf` shader is the assignment to the `clip_bboxes` buffer. The buffer write is indexed on the global invocation ID. It is possible for this index to be larger than the total number of clips in at least one workgroup since the clip count isn't strictly a multiple of workgroup size. Currently the size of the clip_bboxes buffer matches the number of clips. This means the buffer index is likely to run past the buffer. This is not an issue when running on wgpu as it internally enables bounds checking when compiling WGSL (so all buffer accesses are implicitly conditional). When compiling the shaders to native backends the vello_shaders crate currently does not enable implicit bounds checking, so a buffer overrun is possible. There are a few potential solutions: 1. Have an explicit bounds check in the shader. This is straightforward and consistent with the existing code that reads from clip_inp. The downside is that with bounds checking enabled, this extra check is redundant in the generated code. This is the solution included in this PR. 2. Make sure that the clip_bboxes buffer has a size that is a multiple of clip_leaf's workgroup size. This was the approach taken by piet-gpu on its native HALs. This effectively wastes up to 4080 bytes (255 * 16) to store unused bbox values. 3. Enable Naga's implicit bounds checks when compiling to native. This would make the behavior consistent with the wgpu backend, however it comes at the cost of increased renderer complexity as the native implementation must supply the sizes of each buffer in an implicitly generated buffer binding to every shader stage.
2023-04-22 11:43:51 +10:00
if global_id.x < config.n_clip {
clip_bboxes[global_id.x] = bbox;
}
}