Checkpoint of clip and gradient work

This is a checkpoint of partly completed work. Much of the GPU side is done, very little of the CPU side.

For clips, the clip_els bindings (binding 6 of draw_leaf) are not added. Clip logic is missing from coarse. The overflow buffer is missing from fine, as is its size calculation in coarse (but it should work as long as the max depth fits within BLEND_STACK_SPLIT).

For gradients, the texture binding is missing (binding 6) is missing from fine, as is the infrastructure in engine to deal with texture resources, and of course porting over the logic to fill it.

The code is not tested, bugs may lurk.
This commit is contained in:
Raph Levien 2022-11-10 19:48:36 -08:00
parent 5bd3a3639f
commit ef3ed3c9d7
9 changed files with 441 additions and 2 deletions

View file

@ -2,6 +2,7 @@
"wgsl-analyzer.customImports": {
"bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
"bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl",
"clip": "${workspaceFolder}/piet-wgsl/shader/shared/clip.wgsl",
"config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
"cubic": "${workspaceFolder}/piet-wgsl/shader/shared/cubic.wgsl",
"drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",

View file

@ -0,0 +1,195 @@
#import config
#import bbox
#import clip
#import drawtag
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<i32>;
@group(0) @binding(2)
var<storage> path_bboxes: array<PathBbox>;
@group(0) @binding(3)
var<storage> reduced: array<Bic>;
@group(0) @binding(4)
var<storage> clip_els: array<ClipEl>;
@group(0) @binding(5)
var<storage, read_write> draw_monoids: array<DrawMonoid>;
@group(0) @binding(6)
var<storage, read_write> clip_bboxes: array<vec4<f32>>;
let WG_SIZE = 256u;
var<workgroup> sh_bic: array<Bic, 510 >;
var<workgroup> sh_stack: array<u32, WG_SIZE>;
var<workgroup> sh_stack_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_bbox: array<vec4<f32>, WG_SIZE>;
var<workgroup> sh_link: array<i32, WG_SIZE>;
fn search_link(bic: ptr<function, Bic>, ix: u32) -> i32 {
var j = 0u;
while j < firstTrailingBit(WG_SIZE) {
let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
if ((ix >> j) & 1u) != 0u {
let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
if test.b >= 0u {
*bic = test;
ix -= 1u << j;
}
}
j += 1u;
}
if ix > 0u {
while j > 0u {
j -= 1u;
let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
if test.b == 0u {
*bic = test;
ix -= 1u << j;
}
}
}
if ix > 0u {
return i32(ix) - 1;
} else {
return i32(~(*bic).a);
}
}
fn load_clip_inp(ix: u32) -> i32 {
if ix < config.n_clip {
return clip_inp[ix];
} else {
return i32(0x80000000);
}
}
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
var bic: Bic;
if local_id.x < wg_id.x {
bic = reduced[local_id.x];
}
sh_bic[local_id.x] = bic;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_bic[local_id.x + (1u << i)];
bic = bic_combine(bic, other);
}
workgroupBarrier();
sh_bic[local_id.x] = bic;
}
workgroupBarrier();
let stack_size = sh_bic[0].b;
// TODO: if stack depth > WG_SIZE desired, scan here
// binary search in stack
let sp = WG_SIZE - 1u - local_id.x;
var ix = 0u;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
let probe = ix + ((WG_SIZE / 2u) >> i);
if sp < sh_bic[probe].b {
ix = probe;
}
}
let b = sh_bic[ix].b;
var bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
if sp < b {
let el = clip_els[ix * WG_SIZE + b - sp - 1u];
sh_stack[local_id.x] = el.parent_ix;
bbox = el.bbox;
}
// forward scan of bbox values of prefix stack
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
sh_stack_bbox[local_id.x] = bbox;
workgroupBarrier();
if local_id.x >= (1u << i) {
bbox = bbox_intersect(sh_stack_bbox[local_id.x - (1u << i)], bbox);
}
workgroupBarrier();
}
sh_stack_bbox[local_id.x] = bbox;
// Read input and compute Bic binary tree
let inp = load_clip_inp(global_id.x);
let is_push = inp >= 0;
var bic = Bic(1u - u32(is_push), u32(is_push));
sh_bic[local_id.x] = bic;
if is_push {
let path_bbox = path_bboxes[inp];
bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
} else {
bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
}
var inbase = 0u;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i));
workgroupBarrier();
if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) {
let in_off = inbase + local_id.x * 2u;
sh_bic[outbase + local_id.x] = bic_combine(sh_bic[in_off], sh_bic[in_off + 1u]);
}
inbase = outbase;
}
workgroupBarrier();
// search for predecessor node
bic = Bic();
var link = search_link(&bic, local_id.x);
sh_link[local_id.x] = link;
workgroupBarrier();
let grandparent = select(link - 1, sh_link[link], link >= 0);
var parent: i32;
if link >= 0 {
parent = i32(wg_id.x * WG_SIZE) + link;
} else if link + i32(stack_size) >= 0 {
parent = i32(sh_stack[i32(WG_SIZE) + link]);
} else {
parent = -1;
}
// bbox scan (intersect) across parent links
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
if i != 0u {
sh_link[local_id.x] = link;
}
sh_bbox[local_id.x] = bbox;
workgroupBarrier();
if link >= 0 {
bbox = bbox_intersect(sh_bbox[link], bbox);
link = sh_link[link];
}
workgroupBarrier();
}
if link + i32(stack_size) >= 0 {
bbox = bbox_intersect(sh_stack_bbox[i32(WG_SIZE) + link], bbox);
}
// At this point, bbox is the intersection of bboxes on the path to the root
sh_bbox[local_id.x] = bbox;
workgroupBarrier();
if !is_push && global_id.x < config.n_clip {
// Fix up drawmonoid so path_ix of EndClip matches BeginClip
let path_ix = clip_inp[parent];
draw_monoids[~inp].path_ix = u32(path_ix);
if grandparent >= 0 {
bbox = sh_bbox[grandparent];
} else if grandparent + i32(stack_size) >= 0 {
bbox = sh_stack_bbox[i32(WG_SIZE) + grandparent];
} else {
bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
}
}
clip_bboxes[global_id.x] = bbox
}

View file

@ -0,0 +1,66 @@
#import config
#import bbox
#import clip
@group(0) @binding(0)
var<storage> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<i32>;
@group(0) @binding(2)
var<storage> path_bboxes: array<PathBbox>;
@group(0) @binding(3)
var<storage, read_write> reduced: array<Bic>;
@group(0) @binding(4)
var<storage, read_write> clip_out: array<ClipEl>;
let WG_SIZE = 256u;
var<workgroup> sh_bic: array<Bic, WG_SIZE>;
var<workgroup> sh_parent: array<u32, WG_SIZE>;
var<workgroup> sh_path_ix: array<u32, WG_SIZE>;
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
let inp = clip_inp[global_id.x];
let is_push = inp >= 0;
var bic = Bic(1u - u32(is_push), u32(is_push));
// reverse scan of bicyclic semigroup
sh_bic[local_id.x] = bic;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x + (1u << i) < WG_SIZE {
let other = sh_bic[local_id.x + (1u << i)];
bic = bic_combine(bic, other);
}
workgroupBarrier();
sh_bic[local_id.x] = bic;
}
if local_id.x == 0u {
reduced[wg_id.x] = bic;
}
workgroupBarrier();
let size = sh_bic[0].b;
bic = Bic();
if is_push && bic.a == 0u {
let local_ix = size - bic.b - 1u;
sh_parent[local_ix] = local_id.x;
sh_path_ix[local_ix] = u32(inp);
}
workgroupBarrier();
// TODO: possibly do forward scan here if depth can exceed wg size
if local_id.x < size {
let path_ix = sh_path_ix[local_id.x];
let path_bbox = path_bboxes[path_ix];
let parent_ix = sh_parent[local_id.x] + wg_id.x * WG_SIZE;
let bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
clip_out[global_id.x] = ClipEl(parent_ix, bbox);
}
}

View file

@ -125,7 +125,27 @@ fn write_color(color: CmdColor) {
ptcl[cmd_offset] = CMD_COLOR;
ptcl[cmd_offset + 1u] = color.rgba_color;
cmd_offset += 2u;
}
// Discussion point: these are basically copying from info to ptcl. We
// could just write an info offset and have fine bind that buffer and read
// from it.
fn write_lin_grad(lin: CmdLinGrad) {
alloc_cmd(5u);
ptcl[cmd_offset] = CMD_LIN_GRAD;
ptcl[cmd_offset + 1u] = lin.index;
ptcl[cmd_offset + 2u] = bitcast<u32>(lin.line_x);
ptcl[cmd_offset + 3u] = bitcast<u32>(lin.line_y);
ptcl[cmd_offset + 4u] = bitcast<u32>(lin.line_c);
cmd_offset += 5u;
}
fn write_rad_grad(rad: CmdRadGrad) {
alloc_cmd(12u);
ptcl[cmd_offset] = CMD_RAD_GRAD;
ptcl[cmd_offset + 1u] = rad.index;
cmd_offset += 12u;
}
@compute @workgroup_size(256)
@ -304,14 +324,38 @@ fn main(
if clip_zero_depth == 0u {
let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
let tile = tiles[tile_ix];
let linewidth = bitcast<f32>(info[di]);
write_path(tile, linewidth);
switch drawtag {
// DRAWTAG_FILL_COLOR
case 0x44u: {
let linewidth = bitcast<f32>(info[di]);
let rgba_color = scene[dd];
write_path(tile, linewidth);
write_color(CmdColor(rgba_color));
}
// DRAWTAG_FILL_LIN_GRADIENT
case 0x114u: {
var lin: CmdLinGrad;
lin.index = scene[dd];
lin.line_x = bitcast<f32>(info[di + 1u]);
lin.line_y = bitcast<f32>(info[di + 2u]);
lin.line_c = bitcast<f32>(info[di + 3u]);
write_lin_grad(lin);
}
// DRAWTAG_FILL_RAD_GRADIENT
case 0x2dcu: {
var rad: CmdRadGrad;
rad.index = scene[dd];
let m0 = bitcast<f32>(info[di + 1u]);
let m1 = bitcast<f32>(info[di + 2u]);
let m2 = bitcast<f32>(info[di + 3u]);
let m3 = bitcast<f32>(info[di + 4u]);
rad.matrx = vec4<f32>(m0, m1, m2, m3);
rad.xlat = vec2<f32>(bitcast<f32>(info[di + 5u]), bitcast<f32>(info[di + 6u]));
rad.c1 = vec2<f32>(bitcast<f32>(info[di + 7u]), bitcast<f32>(info[di + 8u]));
rad.ra = bitcast<f32>(info[di + 9u]);
rad.roff = bitcast<f32>(info[di + 10u]);
write_rad_grad(rad);
}
default: {}
}
}

View file

@ -42,9 +42,15 @@ var<storage, read_write> output: array<u32>;
#ifdef full
#import ptcl
let GRADIENT_WIDTH = 512;
let BLEND_STACK_SPLIT = 4u;
@group(0) @binding(4)
var<storage> ptcl: array<u32>;
@group(0) @binding(5)
var gradients: texture_2d<f32>;
fn read_fill(cmd_ix: u32) -> CmdFill {
let tile = ptcl[cmd_ix + 1u];
let backdrop = i32(ptcl[cmd_ix + 2u]);
@ -61,6 +67,33 @@ fn read_color(cmd_ix: u32) -> CmdColor {
let rgba_color = ptcl[cmd_ix + 1u];
return CmdColor(rgba_color);
}
fn read_lin_grad(cmd_ix: u32) -> CmdLinGrad {
let index = ptcl[cmd_ix + 1u];
let line_x = bitcast<f32>(ptcl[cmd_ix + 2u]);
let line_y = bitcast<f32>(ptcl[cmd_ix + 3u]);
let line_c = bitcast<f32>(ptcl[cmd_ix + 4u]);
return CmdLinGrad(index, line_x, line_y, line_c);
}
fn read_rad_grad(cmd_ix: u32) -> CmdRadGrad {
let index = ptcl[cmd_ix + 1u];
let m0 = bitcast<f32>(ptcl[cmd_ix + 2u]);
let m1 = bitcast<f32>(ptcl[cmd_ix + 3u]);
let m2 = bitcast<f32>(ptcl[cmd_ix + 4u]);
let m3 = bitcast<f32>(ptcl[cmd_ix + 5u]);
let matrx = vec4<f32>(m0, m1, m2, m3);
let xlat = vec2<f32>(bitcast<f32>(ptcl[cmd_ix + 6u]), bitcast<f32>(ptcl[cmd_ix + 7u]));
let c1 = vec2<f32>(bitcast<f32>(ptcl[cmd_ix + 8u]), bitcast<f32>(ptcl[cmd_ix + 9u]));
let ra = bitcast<f32>(ptcl[cmd_ix + 10u]);
let roff = bitcast<f32>(ptcl[cmd_ix + 11u]);
return CmdRadGrad(index, matrx, xlat, c1, ra, roff);
}
fn mix_blend_compose(backdrop: vec4<f32>, src: vec4<f32>, mode: u32) -> vec4<f32> {
// TODO: ALL the blend modes. This is just vanilla src-over.
return backdrop * (1.0 - src.a) + src;
}
#endif
let PIXELS_PER_THREAD = 4u;
@ -147,6 +180,8 @@ fn main(
let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
#ifdef full
var rgba: array<vec4<f32>, PIXELS_PER_THREAD>;
var blend_stack: array<array<u32, BLEND_STACK_SPLIT>, PIXELS_PER_THREAD>;
var clip_depth = 0u;
var area: array<f32, PIXELS_PER_THREAD>;
var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
@ -187,6 +222,66 @@ fn main(
}
cmd_ix += 2u;
}
// CMD_LIN_GRAD
case 6u: {
let lin = read_lin_grad(cmd_ix);
let d = lin.line_x * xy.x + lin.line_y * xy.y + lin.line_c;
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let my_d = d + lin.line_x * f32(i);
let x = i32(round(clamp(my_d, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(lin.index)), 0);
let fg_i = fg_rgba * area[i];
rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
}
cmd_ix += 12u;
}
// CMD_RAD_GRAD
case 7u: {
let rad = read_rad_grad(cmd_ix);
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let my_xy = vec2<f32>(xy.x + f32(i), xy.y);
// TODO: can hoist y, but for now stick to piet-gpu
let xy_xformed = rad.matrx.xz * my_xy.x + rad.matrx.yw * my_xy.y - rad.xlat;
let ba = dot(xy_xformed, rad.c1);
let ca = rad.ra * dot(xy_xformed, xy_xformed);
let t = sqrt(ba * ba + ca) - ba - rad.roff;
let x = i32(round(clamp(t, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(rad.index)), 0);
let fg_i = fg_rgba * area[i];
rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
}
cmd_ix += 12u;
}
// CMD_BEGIN_CLIP
case 9u: {
if clip_depth < BLEND_STACK_SPLIT {
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
blend_stack[clip_depth][i] = pack4x8unorm(rgba[i]);
rgba[i] = vec4<f32>(0.0);
}
} else {
// TODO: spill to memory
}
clip_depth += 1u;
cmd_ix += 1u;
}
// CMD_END_CLIP
case 10u: {
let blend = ptcl[cmd_ix + 1u];
clip_depth -= 1u;
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
var bg_rgba: u32;
if clip_depth < BLEND_STACK_SPLIT {
bg_rgba = blend_stack[clip_depth][i];
} else {
// load from memory
}
let bg = unpack4x8unorm(bg_rgba);
let fg = rgba[i] * area[i];
rgba[i] = mix_blend_compose(bg, fg, blend);
}
cmd_ix += 2u;
}
// CMD_JUMP
case 11u: {
cmd_ix = ptcl[cmd_ix + 1u];

View file

@ -0,0 +1,14 @@
struct Bic {
a: u32,
b: u32,
}
fn bic_combine(x: Bic, y: Bic) -> Bic {
let m = min(x.b, y.a);
return Bic(x.a + y.a - m, x.b + y.b - m);
}
struct ClipEl {
parent_ix: u32,
bbox: vec4<f32>,
}

View file

@ -20,6 +20,7 @@ struct Config {
n_drawobj: u32,
n_path: u32,
n_clip: u32,
// offsets within scene buffer (in u32 units)
// Note: this is a difference from piet-gpu, which is in bytes

View file

@ -28,6 +28,10 @@ let CMD_FILL = 1u;
let CMD_STROKE = 2u;
let CMD_SOLID = 3u;
let CMD_COLOR = 5u;
let CMD_LIN_GRAD = 6u;
let CMD_RAD_GRAD = 7u;
let CMD_BEGIN_CLIP = 9u;
let CMD_END_CLIP = 10u;
let CMD_JUMP = 11u;
// The individual PTCL structs are written here, but read/write is by
@ -50,3 +54,19 @@ struct CmdJump {
struct CmdColor {
rgba_color: u32,
}
struct CmdLinGrad {
index: u32,
line_x: f32,
line_y: f32,
line_c: f32,
}
struct CmdRadGrad {
index: u32,
matrx: vec4<f32>,
xlat: vec2<f32>,
c1: vec2<f32>,
ra: f32,
roff: f32,
}

View file

@ -26,6 +26,7 @@ struct Config {
height_in_tiles: u32,
n_drawobj: u32,
n_path: u32,
n_clip: u32,
pathtag_base: u32,
pathdata_base: u32,
drawtag_base: u32,
@ -143,11 +144,13 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
let n_path = data.n_path;
// TODO: calculate for real when we do rectangles
let n_drawobj = n_path;
let n_clip = 0; // TODO: wire up correctly
let config = Config {
width_in_tiles: 64,
height_in_tiles: 64,
n_drawobj,
n_path,
n_clip,
pathtag_base,
pathdata_base,
drawtag_base,