vello/piet-gpu/shader/clip_leaf.comp

288 lines
8.5 KiB
GLSL
Raw Normal View History

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The second dispatch of clip stack processing.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
// log2 of the workgroup size. LG_WG_FACTOR is not defined in this file
// (presumably it comes from one of the headers included above).
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
// Number of invocations per workgroup; always a power of two.
#define WG_SIZE (1 << LG_WG_SIZE)
// Number of clip elements handled by one workgroup: one per invocation.
#define PARTITION_SIZE WG_SIZE
layout(local_size_x = WG_SIZE) in;
// Read-only configuration block. This shader reads the *_alloc offsets and
// n_clip from it; the Config type itself is declared outside this file.
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "annotated.h"
// Some of this is cut'n'paste duplication with the reduce pass, and
// arguably should be moved to a common .h file.
// One entry of the clip stack intermediate buffer, as read by load_clip_el.
// (The bicyclic monoid itself is the Bic struct.)
struct ClipEl {
// index of parent node
uint parent_ix;
// bounding box
vec4 bbox;
};
// Element of the bicyclic monoid used to track clip stack depth over a
// sequence of clip elements. A single push is Bic(0, 1) and a single pop is
// Bic(1, 0); see bic_combine for how sequences are concatenated.
struct Bic {
// number of pops not matched by a push within the sequence
uint a;
// number of pushes not matched by a later pop within the sequence
uint b;
};
// Monoid operation: the summary of sequence x followed by sequence y.
// Pushes left open by x are cancelled against pops left open by y.
Bic bic_combine(Bic x, Bic y) {
    uint cancelled = min(x.b, y.a);
    Bic joined;
    joined.a = x.a + y.a - cancelled;
    joined.b = x.b + y.b - cancelled;
    return joined;
}
// Read the bounding box of path `path_ix` from the path bbox buffer (as
// written by pathseg). Records are 6 words apart; the first four words are
// left, top, right, bottom, stored as unsigned integers biased by 32768.
vec4 load_path_bbox(uint path_ix) {
    uint word = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
    vec4 biased = vec4(
        float(memory[word]),
        float(memory[word + 1]),
        float(memory[word + 2]),
        float(memory[word + 3]));
    // Remove the bias to recover signed coordinates.
    return biased - 32768.0;
}
// Intersection of two boxes stored as (x0, y0, x1, y1): the larger of the
// two minimum corners paired with the smaller of the two maximum corners.
vec4 bbox_intersect(vec4 a, vec4 b) {
    vec2 lower = max(a.xy, b.xy);
    vec2 upper = min(a.zw, b.zw);
    return vec4(lower, upper);
}
// Workgroup-shared scratch storage.
// sh_bic is used twice in main: first for a scan over the per-partition Bic
// values (indices 0..WG_SIZE-1), then as a binary reduction tree over this
// partition's elements (leaves at 0..WG_SIZE-1, upper levels after them).
shared Bic sh_bic[WG_SIZE * 2 - 2];
// Parent indices of the clip stack entries materialized from earlier
// partitions; the top of the stack is at index PARTITION_SIZE - 1.
shared uint sh_stack[PARTITION_SIZE];
// Scanned (intersected) bounding boxes for the same stack entries.
shared vec4 sh_stack_bbox[PARTITION_SIZE];
// Per-element link to its predecessor, as returned by search_link.
shared uint sh_link[PARTITION_SIZE];
// Per-element bounding box, intersected along the links during the scan.
shared vec4 sh_bbox[PARTITION_SIZE];
// This is adapted directly from the stack monoid impl.
// Return value is reference within partition if >= 0,
// otherwise reference to stack.
//
// Searches backwards from the calling invocation's element through the
// binary tree stored in sh_bic. `bic` is an accumulator: on entry it holds
// the summary of the elements already consumed (main passes Bic(0, 0)); on
// return it holds the summary of everything the search stepped over.
// The result is a uint that callers reinterpret as a signed int: a
// non-negative value is an element index inside this partition, and
// ~0u - bic.a (i.e. -1 - bic.a as an int) is an offset from the top of the
// stack materialized from earlier partitions.
uint search_link(inout Bic bic) {
uint ix = gl_LocalInvocationID.x;
uint j = 0;
// Upward phase: at tree level j, if bit j of ix is set there is a level-j
// node immediately to the left. Absorb it as long as doing so leaves no
// unmatched push; stop at the first node that would.
while (j < LG_WG_SIZE) {
// Offset of tree level j within sh_bic (level 0 starts at 0).
uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
if (((ix >> j) & 1) != 0) {
Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
if (test.b > 0) {
break;
}
bic = test;
ix -= 1u << j;
}
j++;
}
// ix == 0 means the whole prefix of the partition was absorbed without
// finding an unmatched push, so there is nothing to descend into.
if (ix > 0) {
// Downward phase: descend from the level where the upward phase stopped,
// absorbing each left-hand node that still leaves no unmatched push.
while (j > 0) {
j--;
uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
if (test.b == 0) {
bic = test;
ix -= 1u << j;
}
}
}
// ix is the smallest value such that reduce(ix..th).b == 0
if (ix > 0) {
// The element just before ix is the unmatched push inside this partition.
return ix - 1;
} else {
// No match in this partition: bic.a pops remain unmatched, so refer to
// the stack from earlier partitions, counting down from its top.
return ~0u - bic.a;
}
}
// Read the Bic summary with index `ix` from the clip bic buffer
// (two consecutive words per entry: a, then b).
Bic load_bic(uint ix) {
    uint word = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
    Bic loaded;
    loaded.a = memory[word];
    loaded.b = memory[word + 1];
    return loaded;
}
// Read clip stack entry `ix` from the clip stack buffer. Each entry is five
// words: the parent index followed by the bbox (x0, y0, x1, y1) stored as
// raw float bits.
ClipEl load_clip_el(uint ix) {
    uint word = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
    ClipEl el;
    el.parent_ix = memory[word];
    el.bbox = uintBitsToFloat(uvec4(
        memory[word + 1],
        memory[word + 2],
        memory[word + 3],
        memory[word + 4]));
    return el;
}
// Read the clip input word for element `ix`. Indices at or beyond
// conf.n_clip yield 0x80000000, which has the high bit set and is therefore
// treated like an EndClip tag by the caller.
uint load_path_ix(uint ix) {
    // This is one approach to a partial final block. Another would be
    // to do a memset to the padding in the command queue.
    if (ix >= conf.n_clip) {
        // EndClip tags don't implicate further loads.
        return 0x80000000;
    }
    return memory[(conf.clip_alloc.offset >> 2) + ix];
}
// Write `bbox` as four consecutive words of raw float bits into slot `ix`
// of the clip bbox buffer.
void store_clip_bbox(uint ix, vec4 bbox) {
    uint word = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
    uvec4 bits = floatBitsToUint(bbox);
    memory[word] = bits.x;
    memory[word + 1] = bits.y;
    memory[word + 2] = bits.z;
    memory[word + 3] = bits.w;
}
// Entry point. Each invocation handles one clip element
// (gl_GlobalInvocationID.x). It resolves the element's enclosing clip,
// intersects bounding boxes along the chain of enclosing clips, and writes
// the result with store_clip_bbox. For in-range EndClip elements it also
// rewrites a drawmonoid word so the path index matches the BeginClip.
void main() {
// materialize stack up to the start of this partition. This
// is based on the pure stack monoid, but with two additions.
// First, (this only matters if the stack goes deeper than the
// partition size, which might be unlikely in practice), the
// topmost stack element from each partition is picked, then an
// exclusive scan of those. Also note that if this is skipped,
// a scan is not needed in the reduce stage.
// Second, after the stream compaction, do a scan of the retrieved
// bbox values.
uint th = gl_LocalInvocationID.x;
// Invocation th loads the summary of partition th, but only for partitions
// that precede this workgroup; the rest contribute the identity.
// NOTE(review): th never exceeds WG_SIZE - 1, so at most WG_SIZE preceding
// partitions are taken into account here — confirm this limit is intended.
Bic bic = Bic(0, 0);
if (th < gl_WorkGroupID.x) {
bic = load_bic(th);
}
sh_bic[th] = bic;
// Reverse scan: after the loop, sh_bic[th] combines the summaries of
// partitions th..WG_SIZE-1 (each step folds in the entry 2^i to the right).
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (th + (1u << i) < WG_SIZE) {
Bic other = sh_bic[th + (1u << i)];
bic = bic_combine(bic, other);
}
barrier();
sh_bic[th] = bic;
}
barrier();
// Unmatched pushes across all preceding partitions = depth of the stack at
// the start of this partition.
uint stack_size = sh_bic[0].b;
// TODO: do bbox scan here (to unlock greater stack depth)
// binary search in stack
// sp is this invocation's distance from the top of the stack: the last
// invocation (th == PARTITION_SIZE - 1) takes the top entry.
uint sp = PARTITION_SIZE - 1 - th;
uint ix = 0;
for (uint i = 0; i < LG_WG_SIZE; i++) {
uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
if (sp < sh_bic[probe].b) {
ix = probe;
}
}
// ix is largest value such that sp < sh_bic[ix].b (if any)
uint b = sh_bic[ix].b;
// Sentinel "everything" box; used wherever no real bbox applies so that
// intersecting with it leaves the other operand unchanged.
vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
if (sp < b) {
// maybe store the index here for future use?
ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
sh_stack[th] = el.parent_ix;
bbox = el.bbox;
// other element values here?
}
// forward scan of bbox values of prefix stack
// After the loop, bbox is the intersection of the stack entries held by
// invocations 0..th.
for (uint i = 0; i < LG_WG_SIZE; i++) {
sh_stack_bbox[th] = bbox;
barrier();
if (th >= (1u << i)) {
bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
}
barrier();
}
sh_stack_bbox[th] = bbox;
// Read input and compute bicyclic semigroup binary tree
uint inp = load_path_ix(gl_GlobalInvocationID.x);
// A clear high bit marks a push (BeginClip) whose value is a path index;
// a set high bit marks a pop (EndClip) or padding past n_clip.
bool is_push = int(inp) >= 0;
// push -> Bic(0, 1), pop -> Bic(1, 0)
bic = Bic(1 - uint(is_push), uint(is_push));
// Leaves of the reduction tree. The barriers in the scan loop above
// separate this write from the earlier reads of sh_bic.
sh_bic[th] = bic;
if (is_push) {
bbox = load_path_bbox(inp);
} else {
bbox = vec4(-1e9, -1e9, 1e9, 1e9);
}
// Build the upper tree levels: each level combines adjacent pairs of the
// level below and is stored after it in sh_bic. The single root is never
// built (LG_WG_SIZE - 1 levels), matching the array size of 2 * WG_SIZE - 2.
uint inbase = 0;
for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
barrier();
if (th < (1u << (LG_WG_SIZE - 1 - i))) {
sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
}
inbase = outbase;
}
barrier();
// Search for predecessor node
bic = Bic(0, 0);
uint link = search_link(bic);
// we use N_SEQ > 1 convention here:
// link >= 0 is index within partition
// link < 0 is reference to stack
// We want grandparent bbox for pop nodes, so follow those links.
sh_link[th] = link;
barrier();
uint grandparent;
if (int(link) >= 0) {
grandparent = sh_link[link];
} else {
// One entry deeper in the stack from earlier partitions.
grandparent = link - 1;
}
// Resolve parent
// parent becomes a global clip element index, or ~0u when the reference
// falls below the bottom of the materialized stack.
uint parent;
if (int(link) >= 0) {
parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
} else if (int(link + stack_size) >= 0) {
// link is negative as an int, so this indexes down from the stack top at
// PARTITION_SIZE - 1.
parent = sh_stack[PARTITION_SIZE + link];
} else {
parent = ~0u;
}
// bbox scan along parent links
// Each round intersects this element's bbox with the bbox accumulated at
// its current link target, then adopts that target's link, until the link
// leaves the partition (becomes negative as an int).
for (uint i = 0; i < LG_WG_SIZE; i++) {
// sh_link was already stored for first iteration
if (i != 0) {
sh_link[th] = link;
}
sh_bbox[th] = bbox;
barrier();
if (int(link) >= 0) {
bbox = bbox_intersect(sh_bbox[link], bbox);
link = sh_link[link];
}
barrier();
}
// Continue the intersection into the stack from earlier partitions when
// the remaining link points at a valid stack entry.
if (int(link + stack_size) >= 0) {
bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
}
// At this point, bbox is the reduction of bounding boxes along the tree.
sh_bbox[th] = bbox;
barrier();
uint path_ix = inp;
if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
// Is this load expensive? If so, it's loaded earlier for in-partition
// and is in the ClipEl for cross-partition.
// If not, can probably get rid of it in the stack intermediate buf.
// When parent is ~0u, load_path_ix returns its out-of-range value
// 0x80000000 (assuming n_clip is below ~0u).
path_ix = load_path_ix(parent);
// ~inp strips the EndClip encoding; the result indexes the drawmonoid
// buffer with two words per entry.
uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * ~inp;
// Fix up drawmonoid so path_ix at EndClip matches BeginClip
memory[drawmonoid_out_base] = path_ix;
// An EndClip reports the bbox accumulated at its grandparent link rather
// than the one computed along its own parent link.
if (int(grandparent) >= 0) {
bbox = sh_bbox[grandparent];
} else if (int(grandparent + stack_size) >= 0) {
bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
} else {
bbox = vec4(-1e9, -1e9, 1e9, 1e9);
}
}
// NOTE(review): this store is not guarded by n_clip, so the clip bbox buffer
// presumably has room for a whole number of partitions — confirm.
store_clip_bbox(gl_GlobalInvocationID.x, bbox);
}