mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-09 12:21:31 +11:00
b103a55301
Adds full support for COLRv1 radial gradients based on the two-point conical gradient algorithm at https://skia.org/docs/dev/design/conical/ Also adds robustness to degenerate cases in gradient encoding: * Radial where p0 == p1 && r0 == r1 renders transparent solid * Empty stops render as transparent solid * Single stop renders as solid
449 lines
16 KiB
WebGPU Shading Language
449 lines
16 KiB
WebGPU Shading Language
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
// The coarse rasterization stage.
|
|
|
|
#import config
|
|
#import bump
|
|
#import drawtag
|
|
#import ptcl
|
|
#import tile
|
|
|
|
@group(0) @binding(0)
|
|
var<uniform> config: Config;
|
|
|
|
@group(0) @binding(1)
|
|
var<storage> scene: array<u32>;
|
|
|
|
@group(0) @binding(2)
|
|
var<storage> draw_monoids: array<DrawMonoid>;
|
|
|
|
// TODO: dedup
|
|
struct BinHeader {
|
|
element_count: u32,
|
|
chunk_offset: u32,
|
|
}
|
|
|
|
@group(0) @binding(3)
|
|
var<storage> bin_headers: array<BinHeader>;
|
|
|
|
@group(0) @binding(4)
|
|
var<storage> info_bin_data: array<u32>;
|
|
|
|
@group(0) @binding(5)
|
|
var<storage> paths: array<Path>;
|
|
|
|
@group(0) @binding(6)
|
|
var<storage> tiles: array<Tile>;
|
|
|
|
@group(0) @binding(7)
|
|
var<storage, read_write> bump: BumpAllocators;
|
|
|
|
@group(0) @binding(8)
|
|
var<storage, read_write> ptcl: array<u32>;
|
|
|
|
|
|
|
|
// Much of this code assumes WG_SIZE == N_TILE. If these diverge, then
|
|
// a fair amount of fixup is needed.
|
|
let WG_SIZE = 256u;
|
|
//let N_SLICE = WG_SIZE / 32u;
|
|
let N_SLICE = 8u;
|
|
|
|
var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
|
|
var<workgroup> sh_part_count: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_part_offsets: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_drawobj_ix: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_tile_stride: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_tile_width: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_tile_x0y0: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
|
|
var<workgroup> sh_tile_base: array<u32, WG_SIZE>;
|
|
|
|
// helper functions for writing ptcl
|
|
|
|
var<private> cmd_offset: u32;
|
|
var<private> cmd_limit: u32;
|
|
|
|
// Make sure there is space for a command of given size, plus a jump if needed
|
|
fn alloc_cmd(size: u32) {
|
|
if cmd_offset + size >= cmd_limit {
|
|
// We might be able to save a little bit of computation here
|
|
// by setting the initial value of the bump allocator.
|
|
let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC;
|
|
var new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT);
|
|
if new_cmd + PTCL_INCREMENT > config.ptcl_size {
|
|
new_cmd = 0u;
|
|
atomicOr(&bump.failed, STAGE_COARSE);
|
|
}
|
|
ptcl[cmd_offset] = CMD_JUMP;
|
|
ptcl[cmd_offset + 1u] = new_cmd;
|
|
cmd_offset = new_cmd;
|
|
cmd_limit = cmd_offset + (PTCL_INCREMENT - PTCL_HEADROOM);
|
|
}
|
|
}
|
|
|
|
fn write_path(tile: Tile, linewidth: f32) -> bool {
|
|
// TODO: take flags
|
|
alloc_cmd(3u);
|
|
if linewidth < 0.0 {
|
|
let even_odd = linewidth < -1.0;
|
|
if tile.segments != 0u {
|
|
let fill = CmdFill(tile.segments, tile.backdrop);
|
|
ptcl[cmd_offset] = CMD_FILL;
|
|
let segments_and_rule = select(fill.tile << 1u, (fill.tile << 1u) | 1u, even_odd);
|
|
ptcl[cmd_offset + 1u] = segments_and_rule;
|
|
ptcl[cmd_offset + 2u] = u32(fill.backdrop);
|
|
cmd_offset += 3u;
|
|
} else {
|
|
if even_odd && (abs(tile.backdrop) & 1) == 0 {
|
|
return false;
|
|
}
|
|
ptcl[cmd_offset] = CMD_SOLID;
|
|
cmd_offset += 1u;
|
|
}
|
|
} else {
|
|
let stroke = CmdStroke(tile.segments, 0.5 * linewidth);
|
|
ptcl[cmd_offset] = CMD_STROKE;
|
|
ptcl[cmd_offset + 1u] = stroke.tile;
|
|
ptcl[cmd_offset + 2u] = bitcast<u32>(stroke.half_width);
|
|
cmd_offset += 3u;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
fn write_color(color: CmdColor) {
|
|
alloc_cmd(2u);
|
|
ptcl[cmd_offset] = CMD_COLOR;
|
|
ptcl[cmd_offset + 1u] = color.rgba_color;
|
|
cmd_offset += 2u;
|
|
}
|
|
|
|
fn write_grad(ty: u32, index: u32, info_offset: u32) {
|
|
alloc_cmd(3u);
|
|
ptcl[cmd_offset] = ty;
|
|
ptcl[cmd_offset + 1u] = index;
|
|
ptcl[cmd_offset + 2u] = info_offset;
|
|
cmd_offset += 3u;
|
|
}
|
|
|
|
fn write_image(info_offset: u32) {
|
|
alloc_cmd(2u);
|
|
ptcl[cmd_offset] = CMD_IMAGE;
|
|
ptcl[cmd_offset + 1u] = info_offset;
|
|
cmd_offset += 2u;
|
|
}
|
|
|
|
fn write_begin_clip() {
|
|
alloc_cmd(1u);
|
|
ptcl[cmd_offset] = CMD_BEGIN_CLIP;
|
|
cmd_offset += 1u;
|
|
}
|
|
|
|
fn write_end_clip(end_clip: CmdEndClip) {
|
|
alloc_cmd(3u);
|
|
ptcl[cmd_offset] = CMD_END_CLIP;
|
|
ptcl[cmd_offset + 1u] = end_clip.blend;
|
|
ptcl[cmd_offset + 2u] = bitcast<u32>(end_clip.alpha);
|
|
cmd_offset += 3u;
|
|
}
|
|
|
|
@compute @workgroup_size(256)
|
|
fn main(
|
|
@builtin(local_invocation_id) local_id: vec3<u32>,
|
|
@builtin(workgroup_id) wg_id: vec3<u32>,
|
|
) {
|
|
// Exit early if prior stages failed, as we can't run this stage.
|
|
// We need to check only prior stages, as if this stage has failed in another workgroup,
|
|
// we still want to know this workgroup's memory requirement.
|
|
#ifdef have_uniform
|
|
if local_id.x == 0u {
|
|
// Reuse sh_part_count to hold failed flag, shmem is tight
|
|
sh_part_count[0] = atomicLoad(&bump.failed);
|
|
}
|
|
let failed = workgroupUniformLoad(&sh_part_count[0]);
|
|
#else
|
|
let failed = atomicLoad(&bump.failed);
|
|
#endif
|
|
if (failed & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) != 0u {
|
|
return;
|
|
}
|
|
let width_in_bins = (config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X;
|
|
let bin_ix = width_in_bins * wg_id.y + wg_id.x;
|
|
let n_partitions = (config.n_drawobj + N_TILE - 1u) / N_TILE;
|
|
|
|
// Coordinates of the top left of this bin, in tiles.
|
|
let bin_tile_x = N_TILE_X * wg_id.x;
|
|
let bin_tile_y = N_TILE_Y * wg_id.y;
|
|
|
|
let tile_x = local_id.x % N_TILE_X;
|
|
let tile_y = local_id.x / N_TILE_X;
|
|
let this_tile_ix = (bin_tile_y + tile_y) * config.width_in_tiles + bin_tile_x + tile_x;
|
|
cmd_offset = this_tile_ix * PTCL_INITIAL_ALLOC;
|
|
cmd_limit = cmd_offset + (PTCL_INITIAL_ALLOC - PTCL_HEADROOM);
|
|
|
|
// clip state
|
|
var clip_zero_depth = 0u;
|
|
var clip_depth = 0u;
|
|
|
|
var partition_ix = 0u;
|
|
var rd_ix = 0u;
|
|
var wr_ix = 0u;
|
|
var part_start_ix = 0u;
|
|
var ready_ix = 0u;
|
|
|
|
// blend state
|
|
var render_blend_depth = 0u;
|
|
var max_blend_depth = 0u;
|
|
|
|
let blend_offset = cmd_offset;
|
|
cmd_offset += 1u;
|
|
|
|
while true {
|
|
for (var i = 0u; i < N_SLICE; i += 1u) {
|
|
atomicStore(&sh_bitmaps[i][local_id.x], 0u);
|
|
}
|
|
|
|
while true {
|
|
if ready_ix == wr_ix && partition_ix < n_partitions {
|
|
part_start_ix = ready_ix;
|
|
var count = 0u;
|
|
if partition_ix + local_id.x < n_partitions {
|
|
let in_ix = (partition_ix + local_id.x) * N_TILE + bin_ix;
|
|
let bin_header = bin_headers[in_ix];
|
|
count = bin_header.element_count;
|
|
sh_part_offsets[local_id.x] = bin_header.chunk_offset;
|
|
}
|
|
// prefix sum the element counts
|
|
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
|
sh_part_count[local_id.x] = count;
|
|
workgroupBarrier();
|
|
if local_id.x >= (1u << i) {
|
|
count += sh_part_count[local_id.x - (1u << i)];
|
|
}
|
|
workgroupBarrier();
|
|
}
|
|
sh_part_count[local_id.x] = part_start_ix + count;
|
|
#ifdef have_uniform
|
|
ready_ix = workgroupUniformLoad(&sh_part_count[WG_SIZE - 1u]);
|
|
#else
|
|
workgroupBarrier();
|
|
ready_ix = sh_part_count[WG_SIZE - 1u];
|
|
#endif
|
|
partition_ix += WG_SIZE;
|
|
}
|
|
// use binary search to find draw object to read
|
|
var ix = rd_ix + local_id.x;
|
|
if ix >= wr_ix && ix < ready_ix {
|
|
var part_ix = 0u;
|
|
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
|
|
let probe = part_ix + ((N_TILE / 2u) >> i);
|
|
if ix >= sh_part_count[probe - 1u] {
|
|
part_ix = probe;
|
|
}
|
|
}
|
|
ix -= select(part_start_ix, sh_part_count[part_ix - 1u], part_ix > 0u);
|
|
let offset = config.bin_data_start + sh_part_offsets[part_ix];
|
|
sh_drawobj_ix[local_id.x] = info_bin_data[offset + ix];
|
|
}
|
|
wr_ix = min(rd_ix + N_TILE, ready_ix);
|
|
if wr_ix - rd_ix >= N_TILE || (wr_ix >= ready_ix && partition_ix >= n_partitions) {
|
|
break;
|
|
}
|
|
}
|
|
// At this point, sh_drawobj_ix[0.. wr_ix - rd_ix] contains merged binning results.
|
|
var tag = DRAWTAG_NOP;
|
|
var drawobj_ix: u32;
|
|
if local_id.x + rd_ix < wr_ix {
|
|
drawobj_ix = sh_drawobj_ix[local_id.x];
|
|
tag = scene[config.drawtag_base + drawobj_ix];
|
|
}
|
|
|
|
var tile_count = 0u;
|
|
// I think this predicate is the same as the last, maybe they can be combined
|
|
if tag != DRAWTAG_NOP {
|
|
let path_ix = draw_monoids[drawobj_ix].path_ix;
|
|
let path = paths[path_ix];
|
|
let stride = path.bbox.z - path.bbox.x;
|
|
sh_tile_stride[local_id.x] = stride;
|
|
let dx = i32(path.bbox.x) - i32(bin_tile_x);
|
|
let dy = i32(path.bbox.y) - i32(bin_tile_y);
|
|
let x0 = clamp(dx, 0, i32(N_TILE_X));
|
|
let y0 = clamp(dy, 0, i32(N_TILE_Y));
|
|
let x1 = clamp(i32(path.bbox.z) - i32(bin_tile_x), 0, i32(N_TILE_X));
|
|
let y1 = clamp(i32(path.bbox.w) - i32(bin_tile_y), 0, i32(N_TILE_Y));
|
|
sh_tile_width[local_id.x] = u32(x1 - x0);
|
|
sh_tile_x0y0[local_id.x] = u32(x0) | u32(y0 << 16u);
|
|
tile_count = u32(x1 - x0) * u32(y1 - y0);
|
|
// base relative to bin
|
|
let base = path.tiles - u32(dy * i32(stride) + dx);
|
|
sh_tile_base[local_id.x] = base;
|
|
// TODO: there's a write_tile_alloc here in the source, not sure what it's supposed to do
|
|
}
|
|
|
|
// Prefix sum of tile counts
|
|
sh_tile_count[local_id.x] = tile_count;
|
|
for (var i = 0u; i < firstTrailingBit(N_TILE); i += 1u) {
|
|
workgroupBarrier();
|
|
if local_id.x >= (1u << i) {
|
|
tile_count += sh_tile_count[local_id.x - (1u << i)];
|
|
}
|
|
workgroupBarrier();
|
|
sh_tile_count[local_id.x] = tile_count;
|
|
}
|
|
workgroupBarrier();
|
|
let total_tile_count = sh_tile_count[N_TILE - 1u];
|
|
// Parallel iteration over all tiles
|
|
for (var ix = local_id.x; ix < total_tile_count; ix += N_TILE) {
|
|
// Binary search to find draw object which contains this tile
|
|
var el_ix = 0u;
|
|
for (var i = 0u; i < firstTrailingBit(N_TILE); i += 1u) {
|
|
let probe = el_ix + ((N_TILE / 2u) >> i);
|
|
if ix >= sh_tile_count[probe - 1u] {
|
|
el_ix = probe;
|
|
}
|
|
}
|
|
drawobj_ix = sh_drawobj_ix[el_ix];
|
|
tag = scene[config.drawtag_base + drawobj_ix];
|
|
let seq_ix = ix - select(0u, sh_tile_count[el_ix - 1u], el_ix > 0u);
|
|
let width = sh_tile_width[el_ix];
|
|
let x0y0 = sh_tile_x0y0[el_ix];
|
|
let x = (x0y0 & 0xffffu) + seq_ix % width;
|
|
let y = (x0y0 >> 16u) + seq_ix / width;
|
|
let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * y + x;
|
|
let tile = tiles[tile_ix];
|
|
let is_clip = (tag & 1u) != 0u;
|
|
var is_blend = false;
|
|
if is_clip {
|
|
let BLEND_CLIP = (128u << 8u) | 3u;
|
|
let scene_offset = draw_monoids[drawobj_ix].scene_offset;
|
|
let dd = config.drawdata_base + scene_offset;
|
|
let blend = scene[dd];
|
|
is_blend = blend != BLEND_CLIP;
|
|
}
|
|
let include_tile = tile.segments != 0u || (tile.backdrop == 0) == is_clip || is_blend;
|
|
if include_tile {
|
|
let el_slice = el_ix / 32u;
|
|
let el_mask = 1u << (el_ix & 31u);
|
|
atomicOr(&sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
|
|
}
|
|
}
|
|
workgroupBarrier();
|
|
// At this point bit drawobj % 32 is set in sh_bitmaps[drawobj / 32][y * N_TILE_X + x]
|
|
// if drawobj touches tile (x, y).
|
|
|
|
// Write per-tile command list for this tile
|
|
var slice_ix = 0u;
|
|
var bitmap = atomicLoad(&sh_bitmaps[0u][local_id.x]);
|
|
while true {
|
|
if bitmap == 0u {
|
|
slice_ix += 1u;
|
|
// potential optimization: make iteration limit dynamic
|
|
if slice_ix == N_SLICE {
|
|
break;
|
|
}
|
|
bitmap = atomicLoad(&sh_bitmaps[slice_ix][local_id.x]);
|
|
if bitmap == 0u {
|
|
continue;
|
|
}
|
|
}
|
|
let el_ix = slice_ix * 32u + firstTrailingBit(bitmap);
|
|
drawobj_ix = sh_drawobj_ix[el_ix];
|
|
// clear LSB of bitmap, using bit magic
|
|
bitmap &= bitmap - 1u;
|
|
let drawtag = scene[config.drawtag_base + drawobj_ix];
|
|
let dm = draw_monoids[drawobj_ix];
|
|
let dd = config.drawdata_base + dm.scene_offset;
|
|
let di = dm.info_offset;
|
|
if clip_zero_depth == 0u {
|
|
let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
|
|
let tile = tiles[tile_ix];
|
|
switch drawtag {
|
|
// DRAWTAG_FILL_COLOR
|
|
case 0x44u: {
|
|
let linewidth = bitcast<f32>(info_bin_data[di]);
|
|
if write_path(tile, linewidth) {
|
|
let rgba_color = scene[dd];
|
|
write_color(CmdColor(rgba_color));
|
|
}
|
|
}
|
|
// DRAWTAG_FILL_LIN_GRADIENT
|
|
case 0x114u: {
|
|
let linewidth = bitcast<f32>(info_bin_data[di]);
|
|
if write_path(tile, linewidth) {
|
|
let index = scene[dd];
|
|
let info_offset = di + 1u;
|
|
write_grad(CMD_LIN_GRAD, index, info_offset);
|
|
}
|
|
}
|
|
// DRAWTAG_FILL_RAD_GRADIENT
|
|
case 0x29cu: {
|
|
let linewidth = bitcast<f32>(info_bin_data[di]);
|
|
if write_path(tile, linewidth) {
|
|
let index = scene[dd];
|
|
let info_offset = di + 1u;
|
|
write_grad(CMD_RAD_GRAD, index, info_offset);
|
|
}
|
|
}
|
|
// DRAWTAG_FILL_IMAGE
|
|
case 0x248u: {
|
|
let linewidth = bitcast<f32>(info_bin_data[di]);
|
|
if write_path(tile, linewidth) {
|
|
write_image(di + 1u);
|
|
}
|
|
}
|
|
// DRAWTAG_BEGIN_CLIP
|
|
case 0x9u: {
|
|
if tile.segments == 0u && tile.backdrop == 0 {
|
|
clip_zero_depth = clip_depth + 1u;
|
|
} else {
|
|
write_begin_clip();
|
|
render_blend_depth += 1u;
|
|
max_blend_depth = max(max_blend_depth, render_blend_depth);
|
|
}
|
|
clip_depth += 1u;
|
|
}
|
|
// DRAWTAG_END_CLIP
|
|
case 0x21u: {
|
|
clip_depth -= 1u;
|
|
write_path(tile, -1.0);
|
|
let blend = scene[dd];
|
|
let alpha = bitcast<f32>(scene[dd + 1u]);
|
|
write_end_clip(CmdEndClip(blend, alpha));
|
|
render_blend_depth -= 1u;
|
|
}
|
|
default: {}
|
|
}
|
|
} else {
|
|
// In "clip zero" state, suppress all drawing
|
|
switch drawtag {
|
|
// DRAWTAG_BEGIN_CLIP
|
|
case 0x9u: {
|
|
clip_depth += 1u;
|
|
}
|
|
// DRAWTAG_END_CLIP
|
|
case 0x21u: {
|
|
if clip_depth == clip_zero_depth {
|
|
clip_zero_depth = 0u;
|
|
}
|
|
clip_depth -= 1u;
|
|
}
|
|
default: {}
|
|
}
|
|
}
|
|
}
|
|
|
|
rd_ix += N_TILE;
|
|
if rd_ix >= ready_ix && partition_ix >= n_partitions {
|
|
break;
|
|
}
|
|
workgroupBarrier();
|
|
}
|
|
if bin_tile_x + tile_x < config.width_in_tiles && bin_tile_y + tile_y < config.height_in_tiles {
|
|
ptcl[cmd_offset] = CMD_END;
|
|
if max_blend_depth > BLEND_STACK_SPLIT {
|
|
let scratch_size = max_blend_depth * TILE_WIDTH * TILE_HEIGHT;
|
|
ptcl[blend_offset] = atomicAdd(&bump.blend, scratch_size);
|
|
}
|
|
}
|
|
}
|