mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 20:51:29 +11:00
bc903d1c3b
The potential division by zero in this line led to visible visual artifacts when running against WebGPU in Chrome.
277 lines
10 KiB
WebGPU Shading Language
277 lines
10 KiB
WebGPU Shading Language
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
// Path coarse rasterization for the full implementation.
|
|
|
|
#import config
|
|
#import pathtag
|
|
#import tile
|
|
#import segment
|
|
#import cubic
|
|
#import bump
|
|
|
|
@group(0) @binding(0)
|
|
var<uniform> config: Config;
|
|
|
|
@group(0) @binding(1)
|
|
var<storage> scene: array<u32>;
|
|
|
|
@group(0) @binding(2)
|
|
var<storage> tag_monoids: array<TagMonoid>;
|
|
|
|
@group(0) @binding(3)
|
|
var<storage> cubics: array<Cubic>;
|
|
|
|
@group(0) @binding(4)
|
|
var<storage> paths: array<Path>;
|
|
|
|
// We don't get this from import as it's the atomic version
|
|
struct AtomicTile {
|
|
backdrop: atomic<i32>,
|
|
segments: atomic<u32>,
|
|
}
|
|
|
|
@group(0) @binding(5)
|
|
var<storage, read_write> bump: BumpAllocators;
|
|
|
|
@group(0) @binding(6)
|
|
var<storage, read_write> tiles: array<AtomicTile>;
|
|
|
|
@group(0) @binding(7)
|
|
var<storage, read_write> segments: array<Segment>;
|
|
|
|
struct SubdivResult {
|
|
val: f32,
|
|
a0: f32,
|
|
a2: f32,
|
|
}
|
|
|
|
let D = 0.67;
|
|
fn approx_parabola_integral(x: f32) -> f32 {
|
|
return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
|
|
}
|
|
|
|
let B = 0.39;
|
|
fn approx_parabola_inv_integral(x: f32) -> f32 {
|
|
return x * sqrt(1.0 - B + (B * B + 0.5 * x * x));
|
|
}
|
|
|
|
fn estimate_subdiv(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, sqrt_tol: f32) -> SubdivResult {
|
|
let d01 = p1 - p0;
|
|
let d12 = p2 - p1;
|
|
let dd = d01 - d12;
|
|
let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
|
|
let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9);
|
|
let x0 = dot(d01, dd) * cross_inv;
|
|
let x2 = dot(d12, dd) * cross_inv;
|
|
let scale = abs(cross / (length(dd) * (x2 - x0)));
|
|
|
|
let a0 = approx_parabola_integral(x0);
|
|
let a2 = approx_parabola_integral(x2);
|
|
var val = 0.0;
|
|
if scale < 1e9 {
|
|
let da = abs(a2 - a0);
|
|
let sqrt_scale = sqrt(scale);
|
|
if sign(x0) == sign(x2) {
|
|
val = sqrt_scale;
|
|
} else {
|
|
let xmin = sqrt_tol / sqrt_scale;
|
|
val = sqrt_tol / approx_parabola_integral(xmin);
|
|
}
|
|
val *= da;
|
|
}
|
|
return SubdivResult(val, a0, a2);
|
|
}
|
|
|
|
fn eval_quad(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, t: f32) -> vec2<f32> {
|
|
let mt = 1.0 - t;
|
|
return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
|
|
}
|
|
|
|
fn eval_cubic(p0: vec2<f32>, p1: vec2<f32>, p2: vec2<f32>, p3: vec2<f32>, t: f32) -> vec2<f32> {
|
|
let mt = 1.0 - t;
|
|
return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
|
|
}
|
|
|
|
fn alloc_segment() -> u32 {
|
|
var offset = atomicAdd(&bump.segments, 1u) + 1u;
|
|
if offset + 1u > config.segments_size {
|
|
offset = 0u;
|
|
atomicOr(&bump.failed, STAGE_PATH_COARSE);
|
|
}
|
|
return offset;
|
|
}
|
|
|
|
let MAX_QUADS = 16u;
|
|
|
|
@compute @workgroup_size(256)
|
|
fn main(
|
|
@builtin(global_invocation_id) global_id: vec3<u32>,
|
|
) {
|
|
// Exit early if prior stages failed, as we can't run this stage.
|
|
// We need to check only prior stages, as if this stage has failed in another workgroup,
|
|
// we still want to know this workgroup's memory requirement.
|
|
if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u {
|
|
return;
|
|
}
|
|
let ix = global_id.x;
|
|
let tag_word = scene[config.pathtag_base + (ix >> 2u)];
|
|
let shift = (ix & 3u) * 8u;
|
|
var tag_byte = (tag_word >> shift) & 0xffu;
|
|
|
|
if (tag_byte & PATH_TAG_SEG_TYPE) != 0u {
|
|
// Discussion question: it might actually be cheaper to do the path segment
|
|
// decoding & transform again rather than store the result in a buffer;
|
|
// classic memory vs ALU tradeoff.
|
|
let cubic = cubics[global_id.x];
|
|
let path = paths[cubic.path_ix];
|
|
let is_stroke = (cubic.flags & CUBIC_IS_STROKE) != 0u;
|
|
let bbox = vec4<i32>(path.bbox);
|
|
let p0 = cubic.p0;
|
|
let p1 = cubic.p1;
|
|
let p2 = cubic.p2;
|
|
let p3 = cubic.p3;
|
|
let err_v = 3.0 * (p2 - p1) + p0 - p3;
|
|
let err = dot(err_v, err_v);
|
|
let ACCURACY = 0.25;
|
|
let Q_ACCURACY = ACCURACY * 0.1;
|
|
let REM_ACCURACY = (ACCURACY - Q_ACCURACY);
|
|
let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY;
|
|
var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u);
|
|
n_quads = min(n_quads, MAX_QUADS);
|
|
var keep_params: array<SubdivResult, MAX_QUADS>;
|
|
var val = 0.0;
|
|
var qp0 = p0;
|
|
let step = 1.0 / f32(n_quads);
|
|
for (var i = 0u; i < n_quads; i += 1u) {
|
|
let t = f32(i + 1u) * step;
|
|
let qp2 = eval_cubic(p0, p1, p2, p3, t);
|
|
var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
|
|
qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
|
|
let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
|
|
keep_params[i] = params;
|
|
val += params.val;
|
|
qp0 = qp2;
|
|
}
|
|
let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u);
|
|
var lp0 = p0;
|
|
qp0 = p0;
|
|
let v_step = val / f32(n);
|
|
var n_out = 1u;
|
|
var val_sum = 0.0;
|
|
for (var i = 0u; i < n_quads; i += 1u) {
|
|
let t = f32(i + 1u) * step;
|
|
let qp2 = eval_cubic(p0, p1, p2, p3, t);
|
|
var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step);
|
|
qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
|
|
let params = keep_params[i];
|
|
let u0 = approx_parabola_inv_integral(params.a0);
|
|
let u2 = approx_parabola_inv_integral(params.a2);
|
|
let uscale = 1.0 / (u2 - u0);
|
|
var val_target = f32(n_out) * v_step;
|
|
while n_out == n || val_target < val_sum + params.val {
|
|
var lp1: vec2<f32>;
|
|
if n_out == n {
|
|
lp1 = p3;
|
|
} else {
|
|
let u = (val_target - val_sum) / params.val;
|
|
let a = mix(params.a0, params.a2, u);
|
|
let au = approx_parabola_inv_integral(a);
|
|
let t = (au - u0) * uscale;
|
|
lp1 = eval_quad(qp0, qp1, qp2, t);
|
|
}
|
|
|
|
// Output line segment lp0..lp1
|
|
let xymin = min(lp0, lp1) - cubic.stroke;
|
|
let xymax = max(lp0, lp1) + cubic.stroke;
|
|
let dp = lp1 - lp0;
|
|
let recip_dx = 1.0 / dp.x;
|
|
let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9);
|
|
let SX = 1.0 / f32(TILE_WIDTH);
|
|
let SY = 1.0 / f32(TILE_HEIGHT);
|
|
let c = (cubic.stroke.x + abs(invslope) * (0.5 * f32(TILE_HEIGHT) + cubic.stroke.y)) * SX;
|
|
let b = invslope;
|
|
let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX;
|
|
var x0 = i32(floor(xymin.x * SX));
|
|
var x1 = i32(floor(xymax.x * SX) + 1.0);
|
|
var y0 = i32(floor(xymin.y * SY));
|
|
var y1 = i32(floor(xymax.y * SY) + 1.0);
|
|
x0 = clamp(x0, bbox.x, bbox.z);
|
|
x1 = clamp(x1, bbox.x, bbox.z);
|
|
y0 = clamp(y0, bbox.y, bbox.w);
|
|
y1 = clamp(y1, bbox.y, bbox.w);
|
|
var xc = a + b * f32(y0);
|
|
let stride = bbox.z - bbox.x;
|
|
var base = i32(path.tiles) + (y0 - bbox.y) * stride - bbox.x;
|
|
var xray = i32(floor(lp0.x * SX));
|
|
var last_xray = i32(floor(lp1.x * SX));
|
|
if dp.y < 0.0 {
|
|
let tmp = xray;
|
|
xray = last_xray;
|
|
last_xray = tmp;
|
|
}
|
|
for (var y = y0; y < y1; y += 1) {
|
|
let tile_y0 = f32(y) * f32(TILE_HEIGHT);
|
|
let xbackdrop = max(xray + 1, bbox.x);
|
|
if !is_stroke && xymin.y < tile_y0 && xbackdrop < bbox.z {
|
|
let backdrop = select(-1, 1, dp.y < 0.0);
|
|
let tile_ix = base + xbackdrop;
|
|
atomicAdd(&tiles[tile_ix].backdrop, backdrop);
|
|
}
|
|
var next_xray = last_xray;
|
|
if y + 1 < y1 {
|
|
let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT);
|
|
let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope;
|
|
next_xray = i32(floor(x_edge * SX));
|
|
}
|
|
let min_xray = min(xray, next_xray);
|
|
let max_xray = max(xray, next_xray);
|
|
var xx0 = min(i32(floor(xc - c)), min_xray);
|
|
var xx1 = max(i32(ceil(xc + c)), max_xray + 1);
|
|
xx0 = clamp(xx0, x0, x1);
|
|
xx1 = clamp(xx1, x0, x1);
|
|
var tile_seg: Segment;
|
|
for (var x = xx0; x < xx1; x += 1) {
|
|
let tile_x0 = f32(x) * f32(TILE_WIDTH);
|
|
let tile_ix = base + x;
|
|
// allocate segment, insert linked list
|
|
let seg_ix = alloc_segment();
|
|
let old = atomicExchange(&tiles[tile_ix].segments, seg_ix);
|
|
tile_seg.origin = lp0;
|
|
tile_seg.delta = dp;
|
|
var y_edge = 0.0;
|
|
if !is_stroke {
|
|
y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx);
|
|
if xymin.x < tile_x0 {
|
|
let p = vec2(tile_x0, y_edge);
|
|
if dp.x < 0.0 {
|
|
tile_seg.delta = p - lp0;
|
|
} else {
|
|
tile_seg.origin = p;
|
|
tile_seg.delta = lp1 - p;
|
|
}
|
|
if tile_seg.delta.x == 0.0 {
|
|
tile_seg.delta.x = sign(dp.x) * 1e-9;
|
|
}
|
|
}
|
|
if x <= min_xray || max_xray < x {
|
|
y_edge = 1e9;
|
|
}
|
|
}
|
|
tile_seg.y_edge = y_edge;
|
|
tile_seg.next = old;
|
|
segments[seg_ix] = tile_seg;
|
|
}
|
|
xc += b;
|
|
base += stride;
|
|
xray = next_xray;
|
|
}
|
|
n_out += 1u;
|
|
val_target += v_step;
|
|
lp0 = lp1;
|
|
}
|
|
val_sum += params.val;
|
|
qp0 = qp2;
|
|
}
|
|
}
|
|
}
|