vello/piet-wgsl/shader/clip_leaf.wgsl
Raph Levien ef3ed3c9d7 Checkpoint of clip and gradient work
This is a checkpoint of partly completed work. Much of the GPU side is done, very little of the CPU side.

For clips, the clip_els bindings (binding 6 of draw_leaf) are not added. Clip logic is missing from coarse. The overflow buffer is missing from fine, as is its size calculation in coarse (but it should work as long as the max depth fits within BLEND_STACK_SPLIT).

For gradients, the texture binding is missing (binding 6) is missing from fine, as is the infrastructure in engine to deal with texture resources, and of course porting over the logic to fill it.

The code is not tested, bugs may lurk.
2022-11-10 19:48:36 -08:00

196 lines
5.8 KiB
GLSL

#import config
#import bbox
#import clip
#import drawtag
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<i32>;
@group(0) @binding(2)
var<storage> path_bboxes: array<PathBbox>;
@group(0) @binding(3)
var<storage> reduced: array<Bic>;
@group(0) @binding(4)
var<storage> clip_els: array<ClipEl>;
@group(0) @binding(5)
var<storage, read_write> draw_monoids: array<DrawMonoid>;
@group(0) @binding(6)
var<storage, read_write> clip_bboxes: array<vec4<f32>>;
let WG_SIZE = 256u;
var<workgroup> sh_bic: array<Bic, 510 >;
var<workgroup> sh_stack: array<u32, WG_SIZE>;
var<workgroup> sh_stack_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_link: array<i32, WG_SIZE>;
fn search_link(bic: ptr<function, Bic>, ix: u32) -> i32 {
var j = 0u;
while j < firstTrailingBit(WG_SIZE) {
let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
if ((ix >> j) & 1u) != 0u {
let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
if test.b >= 0u {
*bic = test;
ix -= 1u << j;
}
}
j += 1u;
}
if ix > 0u {
while j > 0u {
j -= 1u;
let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
if test.b == 0u {
*bic = test;
ix -= 1u << j;
}
}
}
if ix > 0u {
return i32(ix) - 1;
} else {
return i32(~(*bic).a);
}
}
fn load_clip_inp(ix: u32) -> i32 {
if ix < config.n_clip {
return clip_inp[ix];
} else {
return i32(0x80000000);
}
}
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
var bic: Bic;
if local_id.x < wg_id.x {
bic = reduced[local_id.x];
}
sh_bic[local_id.x] = bic;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_bic[local_id.x + (1u << i)];
bic = bic_combine(bic, other);
}
workgroupBarrier();
sh_bic[local_id.x] = bic;
}
workgroupBarrier();
let stack_size = sh_bic[0].b;
// TODO: if stack depth > WG_SIZE desired, scan here
// binary search in stack
let sp = WG_SIZE - 1u - local_id.x;
var ix = 0u;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
let probe = ix + ((WG_SIZE / 2u) >> i);
if sp < sh_bic[probe].b {
ix = probe;
}
}
let b = sh_bic[ix].b;
var bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
if sp < b {
let el = clip_els[ix * WG_SIZE + b - sp - 1u];
sh_stack[local_id.x] = el.parent_ix;
bbox = el.bbox;
}
// forward scan of bbox values of prefix stack
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
sh_stack_bbox[local_id.x] = bbox;
workgroupBarrier();
if local_id.x >= (1u << i) {
bbox = bbox_intersect(sh_stack_bbox[local_id.x - (1u << i)], bbox);
}
workgroupBarrier();
}
sh_stack_bbox[local_id.x] = bbox;
// Read input and compute Bic binary tree
let inp = load_clip_inp(global_id.x);
let is_push = inp >= 0;
var bic = Bic(1u - u32(is_push), u32(is_push));
sh_bic[local_id.x] = bic;
if is_push {
let path_bbox = path_bboxes[inp];
bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
} else {
bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
}
var inbase = 0u;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i));
workgroupBarrier();
if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) {
let in_off = inbase + local_id.x * 2u;
sh_bic[outbase + local_id.x] = bic_combine(sh_bic[in_off], sh_bic[in_off + 1u]);
}
inbase = outbase;
}
workgroupBarrier();
// search for predecessor node
bic = Bic();
var link = search_link(&bic, local_id.x);
sh_link[local_id.x] = link;
workgroupBarrier();
let grandparent = select(link - 1, sh_link[link], link >= 0);
var parent: i32;
if link >= 0 {
parent = i32(wg_id.x * WG_SIZE) + link;
} else if link + i32(stack_size) >= 0 {
parent = i32(sh_stack[i32(WG_SIZE) + link]);
} else {
parent = -1;
}
// bbox scan (intersect) across parent links
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
if i != 0u {
sh_link[local_id.x] = link;
}
sh_bbox[local_id.x] = bbox;
workgroupBarrier();
if link >= 0 {
bbox = bbox_intersect(sh_bbox[link], bbox);
link = sh_link[link];
}
workgroupBarrier();
}
if link + i32(stack_size) >= 0 {
bbox = bbox_intersect(sh_stack_bbox[i32(WG_SIZE) + link], bbox);
}
// At this point, bbox is the intersection of bboxes on the path to the root
sh_bbox[local_id.x] = bbox;
workgroupBarrier();
if !is_push && global_id.x < config.n_clip {
// Fix up drawmonoid so path_ix of EndClip matches BeginClip
let path_ix = clip_inp[parent];
draw_monoids[~inp].path_ix = u32(path_ix);
if grandparent >= 0 {
bbox = sh_bbox[grandparent];
} else if grandparent + i32(stack_size) >= 0 {
bbox = sh_stack_bbox[i32(WG_SIZE) + grandparent];
} else {
bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
}
}
clip_bboxes[global_id.x] = bbox
}