Checkpoint of clip and gradient work

This is a checkpoint of partly completed work. Much of the GPU side is done, very little of the CPU side. For clips, the clip_els bindings (binding 6 of draw_leaf) are not added. Clip logic is missing from coarse. The overflow buffer is missing from fine, as is its size calculation in coarse (but it should work as long as the max depth fits within BLEND_STACK_SPLIT). For gradients, the texture binding is missing (binding 6) is missing from fine, as is the infrastructure in engine to deal with texture resources, and of course porting over the logic to fill it. The code is not tested, bugs may lurk.
2025-01-09 20:31:29 +11:00 · 2022-11-10 19:48:36 -08:00 · 2022-11-10 19:48:36 -08:00 · ef3ed3c9d7
parent 5bd3a3639f
commit ef3ed3c9d7
9 changed files with 441 additions and 2 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -2,6 +2,7 @@
  "wgsl-analyzer.customImports": {
    "bbox": "${workspaceFolder}/piet-wgsl/shader/shared/bbox.wgsl",
    "bump": "${workspaceFolder}/piet-wgsl/shader/shared/bump.wgsl",
+    "clip": "${workspaceFolder}/piet-wgsl/shader/shared/clip.wgsl",
    "config": "${workspaceFolder}/piet-wgsl/shader/shared/config.wgsl",
    "cubic": "${workspaceFolder}/piet-wgsl/shader/shared/cubic.wgsl",
    "drawtag": "${workspaceFolder}/piet-wgsl/shader/shared/drawtag.wgsl",
--- a/piet-wgsl/shader/clip_leaf.wgsl
+++ b/piet-wgsl/shader/clip_leaf.wgsl
@ -0,0 +1,195 @@
+
+#import config
+#import bbox
+#import clip
+#import drawtag
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> clip_inp: array<i32>;
+
+@group(0) @binding(2)
+var<storage> path_bboxes: array<PathBbox>;
+
+@group(0) @binding(3)
+var<storage> reduced: array<Bic>;
+
+@group(0) @binding(4)
+var<storage> clip_els: array<ClipEl>;
+
+@group(0) @binding(5)
+var<storage, read_write> draw_monoids: array<DrawMonoid>;
+
+@group(0) @binding(6)
+var<storage, read_write> clip_bboxes: array<vec4<f32>>;
+
+let WG_SIZE = 256u;
+var<workgroup> sh_bic: array<Bic, 510 >;
+var<workgroup> sh_stack: array<u32, WG_SIZE>;
+var<workgroup> sh_stack_bbox: array<vec4<f32>, WG_SIZE>;
+var<workgroup> sh_bbox: array<vec4<f32>, WG_SIZE>;
+var<workgroup> sh_link: array<i32, WG_SIZE>;
+
+fn search_link(bic: ptr<function, Bic>, ix: u32) -> i32 {
+    var j = 0u;
+    while j < firstTrailingBit(WG_SIZE) {
+        let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
+        if ((ix >> j) & 1u) != 0u {
+            let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
+            if test.b >= 0u {
+                *bic = test;
+                ix -= 1u << j;
+            }
+        }
+        j += 1u;
+    }
+    if ix > 0u {
+        while j > 0u {
+            j -= 1u;
+            let base = 2u * WG_SIZE - (2u << (firstTrailingBit(WG_SIZE) - j));
+            let test = bic_combine(sh_bic[base + (ix >> j) - 1u], *bic);
+            if test.b == 0u {
+                *bic = test;
+                ix -= 1u << j;
+            }
+        }
+    }
+    if ix > 0u {
+        return i32(ix) - 1;
+    } else {
+        return i32(~(*bic).a);
+    }
+}
+
+fn load_clip_inp(ix: u32) -> i32 {
+    if ix < config.n_clip {
+        return clip_inp[ix];
+    } else {
+        return i32(0x80000000);
+    }
+}
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    var bic: Bic;
+    if local_id.x < wg_id.x {
+        bic = reduced[local_id.x];
+    }
+    sh_bic[local_id.x] = bic;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_bic[local_id.x + (1u << i)];
+            bic = bic_combine(bic, other);
+        }
+        workgroupBarrier();
+        sh_bic[local_id.x] = bic;
+    }
+    workgroupBarrier();
+    let stack_size = sh_bic[0].b;
+    // TODO: if stack depth > WG_SIZE desired, scan here
+
+    // binary search in stack
+    let sp = WG_SIZE - 1u - local_id.x;
+    var ix = 0u;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        let probe = ix + ((WG_SIZE / 2u) >> i);
+        if sp < sh_bic[probe].b {
+            ix = probe;
+        }
+    }
+    let b = sh_bic[ix].b;
+    var bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
+    if sp < b {
+        let el = clip_els[ix * WG_SIZE + b - sp - 1u];
+        sh_stack[local_id.x] = el.parent_ix;
+        bbox = el.bbox;
+    }
+    // forward scan of bbox values of prefix stack
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        sh_stack_bbox[local_id.x] = bbox;
+        workgroupBarrier();
+        if local_id.x >= (1u << i) {
+            bbox = bbox_intersect(sh_stack_bbox[local_id.x - (1u << i)], bbox);
+        }
+        workgroupBarrier();
+    }
+    sh_stack_bbox[local_id.x] = bbox;
+
+    // Read input and compute Bic binary tree
+    let inp = load_clip_inp(global_id.x);
+    let is_push = inp >= 0;
+    var bic = Bic(1u - u32(is_push), u32(is_push));
+    sh_bic[local_id.x] = bic;
+    if is_push {
+        let path_bbox = path_bboxes[inp];
+        bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
+    } else {
+        bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
+    }
+    var inbase = 0u;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        let outbase = 2u * WG_SIZE - (1u << (firstTrailingBit(WG_SIZE) - i));
+        workgroupBarrier();
+        if local_id.x < 1u << (firstTrailingBit(WG_SIZE) - 1u - i) {
+            let in_off = inbase + local_id.x * 2u;
+            sh_bic[outbase + local_id.x] = bic_combine(sh_bic[in_off], sh_bic[in_off + 1u]);
+        }
+        inbase = outbase;
+    }
+    workgroupBarrier();
+    // search for predecessor node
+    bic = Bic();
+    var link = search_link(&bic, local_id.x);
+    sh_link[local_id.x] = link;
+    workgroupBarrier();
+    let grandparent = select(link - 1, sh_link[link], link >= 0);
+    var parent: i32;
+    if link >= 0 {
+        parent = i32(wg_id.x * WG_SIZE) + link;
+    } else if link + i32(stack_size) >= 0 {
+        parent = i32(sh_stack[i32(WG_SIZE) + link]);
+    } else {
+        parent = -1;
+    }
+    // bbox scan (intersect) across parent links
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        if i != 0u {
+            sh_link[local_id.x] = link;
+        }
+        sh_bbox[local_id.x] = bbox;
+        workgroupBarrier();
+        if link >= 0 {
+            bbox = bbox_intersect(sh_bbox[link], bbox);
+            link = sh_link[link];
+        }
+        workgroupBarrier();
+    }
+    if link + i32(stack_size) >= 0 {
+        bbox = bbox_intersect(sh_stack_bbox[i32(WG_SIZE) + link], bbox);
+    }
+    // At this point, bbox is the intersection of bboxes on the path to the root
+    sh_bbox[local_id.x] = bbox;
+    workgroupBarrier();
+
+    if !is_push && global_id.x < config.n_clip {
+        // Fix up drawmonoid so path_ix of EndClip matches BeginClip
+        let path_ix = clip_inp[parent];
+        draw_monoids[~inp].path_ix = u32(path_ix);
+
+        if grandparent >= 0 {
+            bbox = sh_bbox[grandparent];
+        } else if grandparent + i32(stack_size) >= 0 {
+            bbox = sh_stack_bbox[i32(WG_SIZE) + grandparent];
+        } else {
+            bbox = vec4<f32>(-1e9, -1e9, 1e9, 1e9);
+        }
+    }
+    clip_bboxes[global_id.x] = bbox
+}
--- a/piet-wgsl/shader/clip_reduce.wgsl
+++ b/piet-wgsl/shader/clip_reduce.wgsl
@ -0,0 +1,66 @@
+
+#import config
+#import bbox
+#import clip
+
+@group(0) @binding(0)
+var<storage> config: Config;
+
+@group(0) @binding(1)
+var<storage> clip_inp: array<i32>;
+
+@group(0) @binding(2)
+var<storage> path_bboxes: array<PathBbox>;
+
+@group(0) @binding(3)
+var<storage, read_write> reduced: array<Bic>;
+
+@group(0) @binding(4)
+var<storage, read_write> clip_out: array<ClipEl>;
+
+let WG_SIZE = 256u;
+var<workgroup> sh_bic: array<Bic, WG_SIZE>;
+var<workgroup> sh_parent: array<u32, WG_SIZE>;
+var<workgroup> sh_path_ix: array<u32, WG_SIZE>;
+
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>, 
+) {
+    let inp = clip_inp[global_id.x];
+    let is_push = inp >= 0;
+    var bic = Bic(1u - u32(is_push), u32(is_push));
+    // reverse scan of bicyclic semigroup
+    sh_bic[local_id.x] = bic;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
+        workgroupBarrier();
+        if local_id.x + (1u << i) < WG_SIZE {
+            let other = sh_bic[local_id.x + (1u << i)];
+            bic = bic_combine(bic, other);
+        }
+        workgroupBarrier();
+        sh_bic[local_id.x] = bic;
+    }
+    if local_id.x == 0u {
+        reduced[wg_id.x] = bic;
+    }
+    workgroupBarrier();
+    let size = sh_bic[0].b;
+    bic = Bic();
+    if is_push && bic.a == 0u {
+        let local_ix = size - bic.b - 1u;
+        sh_parent[local_ix] = local_id.x;
+        sh_path_ix[local_ix] = u32(inp);
+    }
+    workgroupBarrier();
+    // TODO: possibly do forward scan here if depth can exceed wg size
+    if local_id.x < size {
+        let path_ix = sh_path_ix[local_id.x];
+        let path_bbox = path_bboxes[path_ix];
+        let parent_ix = sh_parent[local_id.x] + wg_id.x * WG_SIZE;
+        let bbox = vec4<f32>(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1);
+        clip_out[global_id.x] = ClipEl(parent_ix, bbox);
+    }
+}
--- a/piet-wgsl/shader/coarse.wgsl
+++ b/piet-wgsl/shader/coarse.wgsl
@ -125,7 +125,27 @@ fn write_color(color: CmdColor) {
    ptcl[cmd_offset] = CMD_COLOR;
    ptcl[cmd_offset + 1u] = color.rgba_color;
    cmd_offset += 2u;
+}

+// Discussion point: these are basically copying from info to ptcl. We
+// could just write an info offset and have fine bind that buffer and read
+// from it.
+
+fn write_lin_grad(lin: CmdLinGrad) {
+    alloc_cmd(5u);
+    ptcl[cmd_offset] = CMD_LIN_GRAD;
+    ptcl[cmd_offset + 1u] = lin.index;
+    ptcl[cmd_offset + 2u] = bitcast<u32>(lin.line_x);
+    ptcl[cmd_offset + 3u] = bitcast<u32>(lin.line_y);
+    ptcl[cmd_offset + 4u] = bitcast<u32>(lin.line_c);
+    cmd_offset += 5u;
+}
+
+fn write_rad_grad(rad: CmdRadGrad) {
+    alloc_cmd(12u);
+    ptcl[cmd_offset] = CMD_RAD_GRAD;
+    ptcl[cmd_offset + 1u] = rad.index;
+    cmd_offset += 12u;
 }

@compute @workgroup_size(256)
@ -304,14 +324,38 @@ fn main(
            if clip_zero_depth == 0u {
                let tile_ix = sh_tile_base[el_ix] + sh_tile_stride[el_ix] * tile_y + tile_x;
                let tile = tiles[tile_ix];
+                let linewidth = bitcast<f32>(info[di]);
+                write_path(tile, linewidth);
                switch drawtag {
                    // DRAWTAG_FILL_COLOR
                    case 0x44u: {
-                        let linewidth = bitcast<f32>(info[di]);
                        let rgba_color = scene[dd];
-                        write_path(tile, linewidth);
                        write_color(CmdColor(rgba_color));
                    }
+                    // DRAWTAG_FILL_LIN_GRADIENT
+                    case 0x114u: {
+                        var lin: CmdLinGrad;
+                        lin.index = scene[dd];
+                        lin.line_x = bitcast<f32>(info[di + 1u]);
+                        lin.line_y = bitcast<f32>(info[di + 2u]);
+                        lin.line_c = bitcast<f32>(info[di + 3u]);
+                        write_lin_grad(lin);
+                    }
+                    // DRAWTAG_FILL_RAD_GRADIENT
+                    case 0x2dcu: {
+                        var rad: CmdRadGrad;
+                        rad.index = scene[dd];
+                        let m0 = bitcast<f32>(info[di + 1u]);
+                        let m1 = bitcast<f32>(info[di + 2u]);
+                        let m2 = bitcast<f32>(info[di + 3u]);
+                        let m3 = bitcast<f32>(info[di + 4u]);
+                        rad.matrx = vec4<f32>(m0, m1, m2, m3);
+                        rad.xlat = vec2<f32>(bitcast<f32>(info[di + 5u]), bitcast<f32>(info[di + 6u]));
+                        rad.c1 = vec2<f32>(bitcast<f32>(info[di + 7u]), bitcast<f32>(info[di + 8u]));
+                        rad.ra = bitcast<f32>(info[di + 9u]);
+                        rad.roff = bitcast<f32>(info[di + 10u]);
+                        write_rad_grad(rad);
+                    }
                    default: {}
                }
            }
--- a/piet-wgsl/shader/fine.wgsl
+++ b/piet-wgsl/shader/fine.wgsl
@ -42,9 +42,15 @@ var<storage, read_write> output: array<u32>;
 #ifdef full
 #import ptcl

+let GRADIENT_WIDTH = 512;
+let BLEND_STACK_SPLIT = 4u;
+
@group(0) @binding(4)
 var<storage> ptcl: array<u32>;

+@group(0) @binding(5)
+var gradients: texture_2d<f32>;
+
 fn read_fill(cmd_ix: u32) -> CmdFill {
    let tile = ptcl[cmd_ix + 1u];
    let backdrop = i32(ptcl[cmd_ix + 2u]);
@ -61,6 +67,33 @@ fn read_color(cmd_ix: u32) -> CmdColor {
    let rgba_color = ptcl[cmd_ix + 1u];
    return CmdColor(rgba_color);
 }
+
+fn read_lin_grad(cmd_ix: u32) -> CmdLinGrad {
+    let index = ptcl[cmd_ix + 1u];
+    let line_x = bitcast<f32>(ptcl[cmd_ix + 2u]);
+    let line_y = bitcast<f32>(ptcl[cmd_ix + 3u]);
+    let line_c = bitcast<f32>(ptcl[cmd_ix + 4u]);
+    return CmdLinGrad(index, line_x, line_y, line_c);
+}
+
+fn read_rad_grad(cmd_ix: u32) -> CmdRadGrad {
+    let index = ptcl[cmd_ix + 1u];
+    let m0 = bitcast<f32>(ptcl[cmd_ix + 2u]);
+    let m1 = bitcast<f32>(ptcl[cmd_ix + 3u]);
+    let m2 = bitcast<f32>(ptcl[cmd_ix + 4u]);
+    let m3 = bitcast<f32>(ptcl[cmd_ix + 5u]);
+    let matrx = vec4<f32>(m0, m1, m2, m3);
+    let xlat = vec2<f32>(bitcast<f32>(ptcl[cmd_ix + 6u]), bitcast<f32>(ptcl[cmd_ix + 7u]));
+    let c1 = vec2<f32>(bitcast<f32>(ptcl[cmd_ix + 8u]), bitcast<f32>(ptcl[cmd_ix + 9u]));
+    let ra = bitcast<f32>(ptcl[cmd_ix + 10u]);
+    let roff = bitcast<f32>(ptcl[cmd_ix + 11u]);
+    return CmdRadGrad(index, matrx, xlat, c1, ra, roff);
+}
+
+fn mix_blend_compose(backdrop: vec4<f32>, src: vec4<f32>, mode: u32) -> vec4<f32> {
+    // TODO: ALL the blend modes. This is just vanilla src-over.
+    return backdrop * (1.0 - src.a) + src;
+}
 #endif

 let PIXELS_PER_THREAD = 4u;
@ -147,6 +180,8 @@ fn main(
    let xy = vec2<f32>(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
 #ifdef full
    var rgba: array<vec4<f32>, PIXELS_PER_THREAD>;
+    var blend_stack: array<array<u32, BLEND_STACK_SPLIT>, PIXELS_PER_THREAD>;
+    var clip_depth = 0u;
    var area: array<f32, PIXELS_PER_THREAD>;
    var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;

@ -187,6 +222,66 @@ fn main(
                }
                cmd_ix += 2u;
            }
+            // CMD_LIN_GRAD
+            case 6u: {
+                let lin = read_lin_grad(cmd_ix);
+                let d = lin.line_x * xy.x + lin.line_y * xy.y + lin.line_c;
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    let my_d = d + lin.line_x * f32(i);
+                    let x = i32(round(clamp(my_d, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
+                    let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(lin.index)), 0);
+                    let fg_i = fg_rgba * area[i];
+                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
+                }
+                cmd_ix += 12u;
+            }
+            // CMD_RAD_GRAD
+            case 7u: {
+                let rad = read_rad_grad(cmd_ix);
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    let my_xy = vec2<f32>(xy.x + f32(i), xy.y);
+                    // TODO: can hoist y, but for now stick to piet-gpu
+                    let xy_xformed = rad.matrx.xz * my_xy.x + rad.matrx.yw * my_xy.y - rad.xlat;
+                    let ba = dot(xy_xformed, rad.c1);
+                    let ca = rad.ra * dot(xy_xformed, xy_xformed);
+                    let t = sqrt(ba * ba + ca) - ba - rad.roff;
+                    let x = i32(round(clamp(t, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
+                    let fg_rgba = textureLoad(gradients, vec2<i32>(x, i32(rad.index)), 0);
+                    let fg_i = fg_rgba * area[i];
+                    rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
+                }
+                cmd_ix += 12u;
+            }
+            // CMD_BEGIN_CLIP
+            case 9u: {
+                if clip_depth < BLEND_STACK_SPLIT {
+                    for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                        blend_stack[clip_depth][i] = pack4x8unorm(rgba[i]);
+                        rgba[i] = vec4<f32>(0.0);
+                    }
+                } else {
+                    // TODO: spill to memory
+                }
+                clip_depth += 1u;
+                cmd_ix += 1u;
+            }
+            // CMD_END_CLIP
+            case 10u: {
+                let blend = ptcl[cmd_ix + 1u];
+                clip_depth -= 1u;
+                for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
+                    var bg_rgba: u32;
+                    if clip_depth < BLEND_STACK_SPLIT {
+                        bg_rgba = blend_stack[clip_depth][i];
+                    } else {
+                        // load from memory
+                    }
+                    let bg = unpack4x8unorm(bg_rgba);
+                    let fg = rgba[i] * area[i];
+                    rgba[i] = mix_blend_compose(bg, fg, blend);
+                }
+                cmd_ix += 2u;
+            }
            // CMD_JUMP
            case 11u: {
                cmd_ix = ptcl[cmd_ix + 1u];
--- a/piet-wgsl/shader/shared/clip.wgsl
+++ b/piet-wgsl/shader/shared/clip.wgsl
@ -0,0 +1,14 @@
+struct Bic {
+    a: u32,
+    b: u32,
+}
+
+fn bic_combine(x: Bic, y: Bic) -> Bic {
+    let m = min(x.b, y.a);
+    return Bic(x.a + y.a - m, x.b + y.b - m);
+}
+
+struct ClipEl {
+    parent_ix: u32,
+    bbox: vec4<f32>,
+}
--- a/piet-wgsl/shader/shared/config.wgsl
+++ b/piet-wgsl/shader/shared/config.wgsl
@ -20,6 +20,7 @@ struct Config {

    n_drawobj: u32,
    n_path: u32,
+    n_clip: u32,

    // offsets within scene buffer (in u32 units)
    // Note: this is a difference from piet-gpu, which is in bytes
--- a/piet-wgsl/shader/shared/ptcl.wgsl
+++ b/piet-wgsl/shader/shared/ptcl.wgsl
@ -28,6 +28,10 @@ let CMD_FILL = 1u;
 let CMD_STROKE = 2u;
 let CMD_SOLID = 3u;
 let CMD_COLOR = 5u;
+let CMD_LIN_GRAD = 6u;
+let CMD_RAD_GRAD = 7u;
+let CMD_BEGIN_CLIP = 9u;
+let CMD_END_CLIP = 10u;
 let CMD_JUMP = 11u;

 // The individual PTCL structs are written here, but read/write is by
@ -50,3 +54,19 @@ struct CmdJump {
 struct CmdColor {
    rgba_color: u32,
 }
+
+struct CmdLinGrad {
+    index: u32,
+    line_x: f32,
+    line_y: f32,
+    line_c: f32,
+}
+
+struct CmdRadGrad {
+    index: u32,
+    matrx: vec4<f32>,
+    xlat: vec2<f32>,
+    c1: vec2<f32>,
+    ra: f32,
+    roff: f32,
+}
--- a/piet-wgsl/src/render.rs
+++ b/piet-wgsl/src/render.rs
@ -26,6 +26,7 @@ struct Config {
    height_in_tiles: u32,
    n_drawobj: u32,
    n_path: u32,
+    n_clip: u32,
    pathtag_base: u32,
    pathdata_base: u32,
    drawtag_base: u32,
@ -143,11 +144,13 @@ pub fn render_full(scene: &Scene, shaders: &FullShaders) -> (Recording, BufProxy
    let n_path = data.n_path;
    // TODO: calculate for real when we do rectangles
    let n_drawobj = n_path;
+    let n_clip = 0; // TODO: wire up correctly
    let config = Config {
        width_in_tiles: 64,
        height_in_tiles: 64,
        n_drawobj,
        n_path,
+        n_clip,
        pathtag_base,
        pathdata_base,
        drawtag_base,