From ef3ed3c9d73e6e220c27c8c59674e545230de68b Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 10 Nov 2022 19:48:36 -0800 Subject: [PATCH] Checkpoint of clip and gradient work This is a checkpoint of partly completed work. Much of the GPU side is done, very little of the CPU side. For clips, the clip_els bindings (binding 6 of draw_leaf) are not added. Clip logic is missing from coarse. The overflow buffer is missing from fine, as is its size calculation in coarse (but it should work as long as the max depth fits within BLEND_STACK_SPLIT). For gradients, the texture binding is missing (binding 6) is missing from fine, as is the infrastructure in engine to deal with texture resources, and of course porting over the logic to fill it. The code is not tested, bugs may lurk. --- .vscode/settings.json | 1 + piet-wgsl/shader/clip_leaf.wgsl | 195 ++++++++++++++++++++++++++++ piet-wgsl/shader/clip_reduce.wgsl | 66 ++++++++++ piet-wgsl/shader/coarse.wgsl | 48 ++++++- piet-wgsl/shader/fine.wgsl | 95 ++++++++++++++ piet-wgsl/shader/shared/clip.wgsl | 14 ++ piet-wgsl/shader/shared/config.wgsl | 1 + piet-wgsl/shader/shared/ptcl.wgsl | 20 +++ piet-wgsl/src/render.rs | 3 + 9 files changed, 441 insertions(+), 2 deletions(-) create mode 100644 piet-wgsl/shader/clip_leaf.wgsl create mode 100644 piet-wgsl/shader/clip_reduce.wgsl create mode 100644 piet-wgsl/shader/shared/clip.wgsl diff --git a/.vscode/settings.json b/.vscode/settings.json index 813fac6..b8fa532 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,6 +2,7 @@ "wgsl-analyzer.customImports": { "bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl", "bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl", + "clip": "${workspaceFolder}/piet-wgsl/shader/shared/clip.wgsl", "config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl", "cubic": "${workspaceFolder}/piet-wgsl/shader/shared/cubic.wgsl", "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl", diff --git a/piet-wgsl/shader/clip_leaf.wgsl b/piet-wgsl/shader/clip_leaf.wgsl new file mode 100644 index 0000000..f294317 --- /dev/null +++ b/piet-wgsl/shader/clip_leaf.wgsl @@ -0,0 +1,195 @@ + +#import config +#import bbox +#import clip +#import drawtag + +@group(0) @binding(0) +var config: Config; + +@group(0) @binding(1) +var clip_inp: array; + +@group(0) @binding(2) +var path_bboxes: array; + +@group(0) @binding(3) +var reduced: array; + +@group(0) @binding(4) +var clip_els: array; + +@group(0) @binding(5) +var draw_monoids: array; + +@group(0) @binding(6) +var clip_bboxes: array>; + +let WG_SIZE = 256u; +var sh_bic: array; +var sh_stack: array; +var sh_stack_bbox: array, WG_SIZE>; +var sh_bbox: array, WG_SIZE>; +var sh_link: array; + +fn search_link(bic: ptr, ix: u32) -> i32 { + var j = 0u; + while j < firstTrailingBit(WG_SIZE) { + let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j)); + if ((ix >> j) & 1u) != 0u { + let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic); + if test.b >= 0u { + *bic = test; + ix -= 1u << j; + } + } + j += 1u; + } + if ix > 0u { + while j > 0u { + j -= 1u; + let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j)); + let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic); + if test.b == 0u { + *bic = test; + ix -= 1u << j; + } + } + } + if ix > 0u { + return i32(ix) - 1; + } else { + return i32(~(*bic).a); + } +} + +fn load_clip_inp(ix: u32) -> i32 { + if ix < config.n_clip { + return clip_inp[ix]; + } else { + return i32(0x80000000); + } +} + +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) wg_id: vec3, +) { + var bic: Bic; + if local_id.x < wg_id.x { + bic = reduced[local_id.x]; + } + sh_bic[local_id.x] = bic; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + workgroupBarrier(); + if local_id.x + (1u << i) < WG_SIZE { + let other = sh_bic[local_id.x + (1u << i)]; + bic = bic_combine(bic, other); + } + workgroupBarrier(); + sh_bic[local_id.x] = bic; + } + workgroupBarrier(); + let stack_size = sh_bic[0].b; + // TODO: if stack depth > WG_SIZE desired, scan here + + // binary search in stack + let sp = WG_SIZE - 1u - local_id.x; + var ix = 0u; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + let probe = ix + ((WG_SIZE / 2u) >> i); + if sp < sh_bic[probe].b { + ix = probe; + } + } + let b = sh_bic[ix].b; + var bbox = vec4(-1e9, -1e9, 1e9, 1e9); + if sp < b { + let el = clip_els[ix * WG_SIZE + b - sp - 1u]; + sh_stack[local_id.x] = el.parent_ix; + bbox = el.bbox; + } + // forward scan of bbox values of prefix stack + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + sh_stack_bbox[local_id.x] = bbox; + workgroupBarrier(); + if local_id.x >= (1u << i) { + bbox = bbox_intersect(sh_stack_bbox[local_id.x - (1u << i)], bbox); + } + workgroupBarrier(); + } + sh_stack_bbox[local_id.x] = bbox; + + // Read input and compute Bic binary tree + let inp = load_clip_inp(global_id.x); + let is_push = inp >= 0; + var bic = Bic(1u - u32(is_push), u32(is_push)); + sh_bic[local_id.x] = bic; + if is_push { + let path_bbox = path_bboxes[inp]; + bbox = vec4(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1); + } else { + bbox = vec4(-1e9, -1e9, 1e9, 1e9); + } + var inbase = 0u; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i)); + workgroupBarrier(); + if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) { + let in_off = inbase + local_id.x * 2u; + sh_bic[outbase + local_id.x] = bic_combine(sh_bic[in_off], sh_bic[in_off + 1u]); + } + inbase = outbase; + } + workgroupBarrier(); + // search for predecessor node + bic = Bic(); + var link = search_link(&bic, local_id.x); + sh_link[local_id.x] = link; + workgroupBarrier(); + let grandparent = select(link - 1, sh_link[link], link >= 0); + var parent: i32; + if link >= 0 { + parent = i32(wg_id.x * WG_SIZE) + link; + } else if link + i32(stack_size) >= 0 { + parent = i32(sh_stack[i32(WG_SIZE) + link]); + } else { + parent = -1; + } + // bbox scan (intersect) across parent links + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + if i != 0u { + sh_link[local_id.x] = link; + } + sh_bbox[local_id.x] = bbox; + workgroupBarrier(); + if link >= 0 { + bbox = bbox_intersect(sh_bbox[link], bbox); + link = sh_link[link]; + } + workgroupBarrier(); + } + if link + i32(stack_size) >= 0 { + bbox = bbox_intersect(sh_stack_bbox[i32(WG_SIZE) + link], bbox); + } + // At this point, bbox is the intersection of bboxes on the path to the root + sh_bbox[local_id.x] = bbox; + workgroupBarrier(); + + if !is_push && global_id.x < config.n_clip { + // Fix up drawmonoid so path_ix of EndClip matches BeginClip + let path_ix = clip_inp[parent]; + draw_monoids[~inp].path_ix = u32(path_ix); + + if grandparent >= 0 { + bbox = sh_bbox[grandparent]; + } else if grandparent + i32(stack_size) >= 0 { + bbox = sh_stack_bbox[i32(WG_SIZE) + grandparent]; + } else { + bbox = vec4(-1e9, -1e9, 1e9, 1e9); + } + } + clip_bboxes[global_id.x] = bbox +} diff --git a/piet-wgsl/shader/clip_reduce.wgsl b/piet-wgsl/shader/clip_reduce.wgsl new file mode 100644 index 0000000..3288b07 --- /dev/null +++ b/piet-wgsl/shader/clip_reduce.wgsl @@ -0,0 +1,66 @@ + +#import config +#import bbox +#import clip + +@group(0) @binding(0) +var config: Config; + +@group(0) @binding(1) +var clip_inp: array; + +@group(0) @binding(2) +var path_bboxes: array; + +@group(0) @binding(3) +var reduced: array; + +@group(0) @binding(4) +var clip_out: array; + +let WG_SIZE = 256u; +var sh_bic: array; +var sh_parent: array; +var sh_path_ix: array; + +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) wg_id: vec3, +) { + let inp = clip_inp[global_id.x]; + let is_push = inp >= 0; + var bic = Bic(1u - u32(is_push), u32(is_push)); + // reverse scan of bicyclic semigroup + sh_bic[local_id.x] = bic; + for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { + workgroupBarrier(); + if local_id.x + (1u << i) < WG_SIZE { + let other = sh_bic[local_id.x + (1u << i)]; + bic = bic_combine(bic, other); + } + workgroupBarrier(); + sh_bic[local_id.x] = bic; + } + if local_id.x == 0u { + reduced[wg_id.x] = bic; + } + workgroupBarrier(); + let size = sh_bic[0].b; + bic = Bic(); + if is_push && bic.a == 0u { + let local_ix = size - bic.b - 1u; + sh_parent[local_ix] = local_id.x; + sh_path_ix[local_ix] = u32(inp); + } + workgroupBarrier(); + // TODO: possibly do forward scan here if depth can exceed wg size + if local_id.x < size { + let path_ix = sh_path_ix[local_id.x]; + let path_bbox = path_bboxes[path_ix]; + let parent_ix = sh_parent[local_id.x] + wg_id.x * WG_SIZE; + let bbox = vec4(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1); + clip_out[global_id.x] = ClipEl(parent_ix, bbox); + } +} diff --git a/piet-wgsl/shader/coarse.wgsl b/piet-wgsl/shader/coarse.wgsl index e1e88d7..5741ec3 100644 --- a/piet-wgsl/shader/coarse.wgsl +++ b/piet-wgsl/shader/coarse.wgsl @@ -125,7 +125,27 @@ fn write_color(color: CmdColor) { ptcl[cmd_offset] = CMD_COLOR; ptcl[cmd_offset + 1u] = color.rgba_color; cmd_offset += 2u; +} +// Discussion point: these are basically copying from info to ptcl. We +// could just write an info offset and have fine bind that buffer and read +// from it. + +fn write_lin_grad(lin: CmdLinGrad) { + alloc_cmd(5u); + ptcl[cmd_offset] = CMD_LIN_GRAD; + ptcl[cmd_offset + 1u] = lin.index; + ptcl[cmd_offset + 2u] = bitcast(lin.line_x); + ptcl[cmd_offset + 3u] = bitcast(lin.line_y); + ptcl[cmd_offset + 4u] = bitcast(lin.line_c); + cmd_offset += 5u; +} + +fn write_rad_grad(rad: CmdRadGrad) { + alloc_cmd(12u); + ptcl[cmd_offset] = CMD_RAD_GRAD; + ptcl[cmd_offset + 1u] = rad.index; + cmd_offset += 12u; } @compute @workgroup_size(256) @@ -304,14 +324,38 @@ fn main( if clip_zero_depth == 0u { let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x; let tile = tiles[tile_ix]; + let linewidth = bitcast(info[di]); + write_path(tile, linewidth); switch drawtag { // DRAWTAG_FILL_COLOR case 0x44u: { - let linewidth = bitcast(info[di]); let rgba_color = scene[dd]; - write_path(tile, linewidth); write_color(CmdColor(rgba_color)); } + // DRAWTAG_FILL_LIN_GRADIENT + case 0x114u: { + var lin: CmdLinGrad; + lin.index = scene[dd]; + lin.line_x = bitcast(info[di + 1u]); + lin.line_y = bitcast(info[di + 2u]); + lin.line_c = bitcast(info[di + 3u]); + write_lin_grad(lin); + } + // DRAWTAG_FILL_RAD_GRADIENT + case 0x2dcu: { + var rad: CmdRadGrad; + rad.index = scene[dd]; + let m0 = bitcast(info[di + 1u]); + let m1 = bitcast(info[di + 2u]); + let m2 = bitcast(info[di + 3u]); + let m3 = bitcast(info[di + 4u]); + rad.matrx = vec4(m0, m1, m2, m3); + rad.xlat = vec2(bitcast(info[di + 5u]), bitcast(info[di + 6u])); + rad.c1 = vec2(bitcast(info[di + 7u]), bitcast(info[di + 8u])); + rad.ra = bitcast(info[di + 9u]); + rad.roff = bitcast(info[di + 10u]); + write_rad_grad(rad); + } default: {} } } diff --git a/piet-wgsl/shader/fine.wgsl b/piet-wgsl/shader/fine.wgsl index c347630..d76edef 100644 --- a/piet-wgsl/shader/fine.wgsl +++ b/piet-wgsl/shader/fine.wgsl @@ -42,9 +42,15 @@ var output: array; #ifdef full #import ptcl +let GRADIENT_WIDTH = 512; +let BLEND_STACK_SPLIT = 4u; + @group(0) @binding(4) var ptcl: array; +@group(0) @binding(5) +var gradients: texture_2d; + fn read_fill(cmd_ix: u32) -> CmdFill { let tile = ptcl[cmd_ix + 1u]; let backdrop = i32(ptcl[cmd_ix + 2u]); @@ -61,6 +67,33 @@ fn read_color(cmd_ix: u32) -> CmdColor { let rgba_color = ptcl[cmd_ix + 1u]; return CmdColor(rgba_color); } + +fn read_lin_grad(cmd_ix: u32) -> CmdLinGrad { + let index = ptcl[cmd_ix + 1u]; + let line_x = bitcast(ptcl[cmd_ix + 2u]); + let line_y = bitcast(ptcl[cmd_ix + 3u]); + let line_c = bitcast(ptcl[cmd_ix + 4u]); + return CmdLinGrad(index, line_x, line_y, line_c); +} + +fn read_rad_grad(cmd_ix: u32) -> CmdRadGrad { + let index = ptcl[cmd_ix + 1u]; + let m0 = bitcast(ptcl[cmd_ix + 2u]); + let m1 = bitcast(ptcl[cmd_ix + 3u]); + let m2 = bitcast(ptcl[cmd_ix + 4u]); + let m3 = bitcast(ptcl[cmd_ix + 5u]); + let matrx = vec4(m0, m1, m2, m3); + let xlat = vec2(bitcast(ptcl[cmd_ix + 6u]), bitcast(ptcl[cmd_ix + 7u])); + let c1 = vec2(bitcast(ptcl[cmd_ix + 8u]), bitcast(ptcl[cmd_ix + 9u])); + let ra = bitcast(ptcl[cmd_ix + 10u]); + let roff = bitcast(ptcl[cmd_ix + 11u]); + return CmdRadGrad(index, matrx, xlat, c1, ra, roff); +} + +fn mix_blend_compose(backdrop: vec4, src: vec4, mode: u32) -> vec4 { + // TODO: ALL the blend modes. This is just vanilla src-over. + return backdrop * (1.0 - src.a) + src; +} #endif let PIXELS_PER_THREAD = 4u; @@ -147,6 +180,8 @@ fn main( let xy = vec2(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y)); #ifdef full var rgba: array, PIXELS_PER_THREAD>; + var blend_stack: array, PIXELS_PER_THREAD>; + var clip_depth = 0u; var area: array; var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC; @@ -187,6 +222,66 @@ fn main( } cmd_ix += 2u; } + // CMD_LIN_GRAD + case 6u: { + let lin = read_lin_grad(cmd_ix); + let d = lin.line_x * xy.x + lin.line_y * xy.y + lin.line_c; + for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { + let my_d = d + lin.line_x * f32(i); + let x = i32(round(clamp(my_d, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1))); + let fg_rgba = textureLoad(gradients, vec2(x, i32(lin.index)), 0); + let fg_i = fg_rgba * area[i]; + rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i; + } + cmd_ix += 12u; + } + // CMD_RAD_GRAD + case 7u: { + let rad = read_rad_grad(cmd_ix); + for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { + let my_xy = vec2(xy.x + f32(i), xy.y); + // TODO: can hoist y, but for now stick to piet-gpu + let xy_xformed = rad.matrx.xz * my_xy.x + rad.matrx.yw * my_xy.y - rad.xlat; + let ba = dot(xy_xformed, rad.c1); + let ca = rad.ra * dot(xy_xformed, xy_xformed); + let t = sqrt(ba * ba + ca) - ba - rad.roff; + let x = i32(round(clamp(t, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1))); + let fg_rgba = textureLoad(gradients, vec2(x, i32(rad.index)), 0); + let fg_i = fg_rgba * area[i]; + rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i; + } + cmd_ix += 12u; + } + // CMD_BEGIN_CLIP + case 9u: { + if clip_depth < BLEND_STACK_SPLIT { + for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { + blend_stack[clip_depth][i] = pack4x8unorm(rgba[i]); + rgba[i] = vec4(0.0); + } + } else { + // TODO: spill to memory + } + clip_depth += 1u; + cmd_ix += 1u; + } + // CMD_END_CLIP + case 10u: { + let blend = ptcl[cmd_ix + 1u]; + clip_depth -= 1u; + for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { + var bg_rgba: u32; + if clip_depth < BLEND_STACK_SPLIT { + bg_rgba = blend_stack[clip_depth][i]; + } else { + // load from memory + } + let bg = unpack4x8unorm(bg_rgba); + let fg = rgba[i] * area[i]; + rgba[i] = mix_blend_compose(bg, fg, blend); + } + cmd_ix += 2u; + } // CMD_JUMP case 11u: { cmd_ix = ptcl[cmd_ix + 1u]; diff --git a/piet-wgsl/shader/shared/clip.wgsl b/piet-wgsl/shader/shared/clip.wgsl new file mode 100644 index 0000000..608e9bf --- /dev/null +++ b/piet-wgsl/shader/shared/clip.wgsl @@ -0,0 +1,14 @@ +struct Bic { + a: u32, + b: u32, +} + +fn bic_combine(x: Bic, y: Bic) -> Bic { + let m = min(x.b, y.a); + return Bic(x.a + y.a - m, x.b + y.b - m); +} + +struct ClipEl { + parent_ix: u32, + bbox: vec4, +} diff --git a/piet-wgsl/shader/shared/config.wgsl b/piet-wgsl/shader/shared/config.wgsl index 73f4054..5db894b 100644 --- a/piet-wgsl/shader/shared/config.wgsl +++ b/piet-wgsl/shader/shared/config.wgsl @@ -20,6 +20,7 @@ struct Config { n_drawobj: u32, n_path: u32, + n_clip: u32, // offsets within scene buffer (in u32 units) // Note: this is a difference from piet-gpu, which is in bytes diff --git a/piet-wgsl/shader/shared/ptcl.wgsl b/piet-wgsl/shader/shared/ptcl.wgsl index 8d6e869..92316cc 100644 --- a/piet-wgsl/shader/shared/ptcl.wgsl +++ b/piet-wgsl/shader/shared/ptcl.wgsl @@ -28,6 +28,10 @@ let CMD_FILL = 1u; let CMD_STROKE = 2u; let CMD_SOLID = 3u; let CMD_COLOR = 5u; +let CMD_LIN_GRAD = 6u; +let CMD_RAD_GRAD = 7u; +let CMD_BEGIN_CLIP = 9u; +let CMD_END_CLIP = 10u; let CMD_JUMP = 11u; // The individual PTCL structs are written here, but read/write is by @@ -50,3 +54,19 @@ struct CmdJump { struct CmdColor { rgba_color: u32, } + +struct CmdLinGrad { + index: u32, + line_x: f32, + line_y: f32, + line_c: f32, +} + +struct CmdRadGrad { + index: u32, + matrx: vec4, + xlat: vec2, + c1: vec2, + ra: f32, + roff: f32, +} diff --git a/piet-wgsl/src/render.rs b/piet-wgsl/src/render.rs index 690e681..7d052cc 100644 --- a/piet-wgsl/src/render.rs +++ b/piet-wgsl/src/render.rs @@ -26,6 +26,7 @@ struct Config { height_in_tiles: u32, n_drawobj: u32, n_path: u32, + n_clip: u32, pathtag_base: u32, pathdata_base: u32, drawtag_base: u32, @@ -143,11 +144,13 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy let n_path = data.n_path; // TODO: calculate for real when we do rectangles let n_drawobj = n_path; + let n_clip = 0; // TODO: wire up correctly let config = Config { width_in_tiles: 64, height_in_tiles: 64, n_drawobj, n_path, + n_clip, pathtag_base, pathdata_base, drawtag_base,