2022-02-17 16:25:41 -08:00
|
|
|
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
|
|
|
|
// The second dispatch of clip stack processing.
|
|
|
|
|
|
|
|
#version 450
|
|
|
|
#extension GL_GOOGLE_include_directive : enable
|
|
|
|
|
|
|
|
#include "mem.h"
|
|
|
|
#include "setup.h"
|
|
|
|
|
|
|
|
// log2 of the workgroup size. LG_WG_FACTOR is not defined in this file
// (presumably it comes from one of the includes above).
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
// Number of invocations in one workgroup.
#define WG_SIZE (1 << LG_WG_SIZE)
// Number of clip elements handled per workgroup: one per invocation.
#define PARTITION_SIZE WG_SIZE
|
|
|
|
|
|
|
|
// One-dimensional workgroup of WG_SIZE invocations.
layout(local_size_x = WG_SIZE) in;

// Read-only configuration block. This shader reads buffer offsets
// (the *_alloc.offset fields) and the clip count (n_clip) from it; the
// Config type itself is declared outside this file.
layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};
|
|
|
|
|
|
|
|
// Some of this is cut'n'paste duplication with the reduce pass, and
|
|
|
|
// arguably should be moved to a common .h file.
|
|
|
|
// The bicyclic monoid
|
|
|
|
|
|
|
|
// One clip stack entry, as decoded by load_clip_el().
struct ClipEl {
    // Index of this entry's parent node.
    uint parent_ix;
    // Bounding box, packed as (x0, y0, x1, y1).
    vec4 bbox;
};
|
|
|
|
|
|
|
|
// Element of the bicyclic monoid used to summarize a run of clip
// operations (see bic_combine()).
struct Bic {
    // Count of pops in the run that have no matching push inside it.
    uint a;
    // Count of pushes in the run that have no matching pop inside it.
    uint b;
};
|
|
|
|
|
|
|
|
// Combine two adjacent runs, x followed by y. Pushes left open by x are
// cancelled against the unmatched pops at the start of y; whatever is left
// over on either side carries into the result.
Bic bic_combine(Bic x, Bic y) {
    uint cancelled = min(x.b, y.a);
    Bic combined;
    combined.a = x.a + y.a - cancelled;
    combined.b = x.b + y.b - cancelled;
    return combined;
}
|
|
|
|
|
|
|
|
// Load the bounding box of path `path_ix` from the path bbox buffer (as
// written by pathseg). Each record is 6 words long; only the first four
// (left, top, right, bottom) are read here. The stored values carry a
// +32768 bias, which is removed after conversion to float.
vec4 load_path_bbox(uint path_ix) {
    uint word0 = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    vec4 biased = vec4(
        float(memory[word0]),
        float(memory[word0 + 1]),
        float(memory[word0 + 2]),
        float(memory[word0 + 3]));
    return biased - 32768.0;
}
|
|
|
|
|
|
|
|
// Intersection of two boxes stored as (x0, y0, x1, y1): the larger of the
// two minimum corners paired with the smaller of the two maximum corners.
// No check is made for an empty result.
vec4 bbox_intersect(vec4 a, vec4 b) {
    vec2 lo = max(a.xy, b.xy);
    vec2 hi = min(a.zw, b.zw);
    return vec4(lo, hi);
}
|
|
|
|
|
|
|
|
// Bicyclic monoid scratch. main() first uses entries [0, WG_SIZE) for the
// scan over preceding partitions, then reuses the array as a binary
// reduction tree over this partition's elements (leaves at [0, WG_SIZE),
// higher levels packed after them).
shared Bic sh_bic[2 * WG_SIZE - 2];
// parent_ix of each materialized entry of the stack inherited from
// preceding partitions.
shared uint sh_stack[PARTITION_SIZE];
// Scanned (cumulatively intersected) bboxes of that inherited stack.
shared vec4 sh_stack_bbox[PARTITION_SIZE];
// Per-invocation predecessor link, as returned by search_link().
shared uint sh_link[PARTITION_SIZE];
// Per-invocation bbox used while intersecting along the link chain.
shared vec4 sh_bbox[PARTITION_SIZE];
|
|
|
|
|
|
|
|
// Find the predecessor link of the calling invocation by walking the
// reduction tree held in sh_bic. Adapted directly from the stack monoid
// implementation.
//
// `bic` accumulates the combination of the tree nodes that were skipped
// over; on return it summarizes the range that was consumed.
//
// The result is to be read as a signed int:
//   >= 0  index of the predecessor within this partition
//   <  0  reference into the stack inherited from earlier partitions,
//         encoded as ~0u - bic.a (that is, -1 - a)
uint search_link(inout Bic bic) {
    uint pos = gl_LocalInvocationID.x;
    uint level = 0;

    // Ascend: while the node to the left at this level can be absorbed
    // without leaving an open push, absorb it and step over it.
    for (; level < LG_WG_SIZE; level++) {
        if (((pos >> level) & 1) == 0) {
            continue;
        }
        uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
        Bic merged = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
        if (merged.b > 0) {
            // The open push lies inside this node; descend into it below.
            break;
        }
        bic = merged;
        pos -= 1u << level;
    }

    // Descend: narrow down to the exact element, one level at a time.
    if (pos > 0) {
        while (level > 0) {
            level--;
            uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
            Bic merged = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
            if (merged.b == 0) {
                bic = merged;
                pos -= 1u << level;
            }
        }
    }

    // pos is the smallest value such that reduce(pos..th).b == 0
    return (pos > 0) ? (pos - 1) : (~0u - bic.a);
}
|
|
|
|
|
|
|
|
// Read the ix-th Bic record (two consecutive words: a, then b) from the
// clip bic buffer.
Bic load_bic(uint ix) {
    uint word0 = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
    Bic result;
    result.a = memory[word0];
    result.b = memory[word0 + 1];
    return result;
}
|
|
|
|
|
|
|
|
// Read the ix-th clip stack record from the clip stack buffer. A record
// is 5 words: the parent index followed by the bit patterns of four
// floats (x0, y0, x1, y1).
ClipEl load_clip_el(uint ix) {
    uint word0 = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
    ClipEl el;
    el.parent_ix = memory[word0];
    el.bbox = vec4(
        uintBitsToFloat(memory[word0 + 1]),
        uintBitsToFloat(memory[word0 + 2]),
        uintBitsToFloat(memory[word0 + 3]),
        uintBitsToFloat(memory[word0 + 4]));
    return el;
}
|
|
|
|
|
|
|
|
// Read the ix-th word of the clip input stream. main() treats a value
// with the top bit clear as a push and anything else as a pop.
//
// Indices at or past conf.n_clip are not read from memory; they yield
// 0x80000000 instead. This is one way of handling a partial final block;
// another would be a memset of the padding in the command queue.
uint load_path_ix(uint ix) {
    if (ix >= conf.n_clip) {
        // EndClip tags don't implicate further loads.
        return 0x80000000;
    }
    return memory[(conf.clip_alloc.offset >> 2) + ix];
}
|
|
|
|
|
|
|
|
// Write `bbox` as the ix-th record of the clip bbox buffer: four
// consecutive words holding the raw bit patterns of x, y, z and w.
void store_clip_bbox(uint ix, vec4 bbox) {
    uint word0 = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
    uvec4 bits = floatBitsToUint(bbox);
    memory[word0] = bits.x;
    memory[word0 + 1] = bits.y;
    memory[word0 + 2] = bits.z;
    memory[word0 + 3] = bits.w;
}
|
|
|
|
|
|
|
|
// Entry point: one invocation per clip stream element.
//
// The stack as it stands at the start of this partition is materialized
// first. This is based on the pure stack monoid, with two additions.
//
// First (this only matters if the stack goes deeper than the partition
// size, which might be unlikely in practice), the topmost stack element
// from each partition is picked, then an exclusive scan of those. Also
// note that if this is skipped, a scan is not needed in the reduce stage.
//
// Second, after the stream compaction, a scan of the retrieved bbox
// values is done.
//
// Each invocation then finds its predecessor link inside the partition,
// intersects bboxes along the chain of links, and stores the resulting
// clip bbox. For pop elements it additionally patches the draw monoid
// and reports the grandparent's bbox.
void main() {
    // Effectively unbounded box: the identity for bbox_intersect().
    const vec4 NO_CLIP = vec4(-1e9, -1e9, 1e9, 1e9);

    uint tid = gl_LocalInvocationID.x;
    uint part_ix = gl_WorkGroupID.x;
    uint global_ix = gl_GlobalInvocationID.x;

    // Scan the per-partition summaries of all preceding partitions.
    // Afterwards sh_bic[t] combines summaries t..WG_SIZE-1, where slots
    // at or beyond this partition's index hold the identity.
    Bic bic = Bic(0, 0);
    if (tid < part_ix) {
        bic = load_bic(tid);
    }
    sh_bic[tid] = bic;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        uint stride = 1u << i;
        barrier();
        if (tid + stride < WG_SIZE) {
            bic = bic_combine(bic, sh_bic[tid + stride]);
        }
        barrier();
        sh_bic[tid] = bic;
    }
    barrier();
    // Number of pushes still open when this partition begins.
    uint stack_size = sh_bic[0].b;

    // TODO: do bbox scan here (to unlock greater stack depth)

    // Binary search in the stack: this invocation materializes stack
    // slot sp, counted from the top.
    uint sp = PARTITION_SIZE - 1 - tid;
    uint ix = 0;
    for (uint step = uint(PARTITION_SIZE / 2); step != 0u; step >>= 1) {
        uint probe = ix + step;
        if (sp < sh_bic[probe].b) {
            ix = probe;
        }
    }
    // ix is largest value such that sp < sh_bic[ix].b (if any)
    uint b = sh_bic[ix].b;
    vec4 bbox = NO_CLIP;
    if (sp < b) {
        // maybe store the index here for future use?
        ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
        sh_stack[tid] = el.parent_ix;
        bbox = el.bbox;
        // other element values here?
    }

    // Forward scan of bbox values of the prefix stack.
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        uint stride = 1u << i;
        sh_stack_bbox[tid] = bbox;
        barrier();
        if (tid >= stride) {
            bbox = bbox_intersect(sh_stack_bbox[tid - stride], bbox);
        }
        barrier();
    }
    sh_stack_bbox[tid] = bbox;

    // Read input and compute the bicyclic semigroup binary tree.
    uint inp = load_path_ix(global_ix);
    // Top bit clear means push; anything else is treated as a pop.
    bool is_push = int(inp) >= 0;
    uint push_flag = uint(is_push);
    bic = Bic(1 - push_flag, push_flag);
    sh_bic[tid] = bic;
    bbox = NO_CLIP;
    if (is_push) {
        bbox = load_path_bbox(inp);
    }
    uint src_base = 0;
    for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
        uint dst_base = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
        barrier();
        if (tid < (1u << (LG_WG_SIZE - 1 - i))) {
            Bic left = sh_bic[src_base + tid * 2];
            Bic right = sh_bic[src_base + tid * 2 + 1];
            sh_bic[dst_base + tid] = bic_combine(left, right);
        }
        src_base = dst_base;
    }
    barrier();

    // Search for predecessor node.
    bic = Bic(0, 0);
    uint link = search_link(bic);
    // we use N_SEQ > 1 convention here:
    // link >= 0 is index within partition
    // link < 0 is reference to stack

    // We want grandparent bbox for pop nodes, so follow those links.
    sh_link[tid] = link;
    barrier();

    // Resolve grandparent (same encoding as link) and parent (global
    // element index, or ~0u when there is none).
    uint grandparent;
    uint parent;
    if (int(link) >= 0) {
        grandparent = sh_link[link];
        parent = part_ix * PARTITION_SIZE + link;
    } else {
        grandparent = link - 1;
        if (int(link + stack_size) >= 0) {
            parent = sh_stack[PARTITION_SIZE + link];
        } else {
            parent = ~0u;
        }
    }

    // bbox scan along parent links: every round intersects with the
    // linked element's bbox and then adopts that element's own link.
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        // sh_link was already stored for first iteration
        if (i != 0) {
            sh_link[tid] = link;
        }
        sh_bbox[tid] = bbox;
        barrier();
        if (int(link) >= 0) {
            bbox = bbox_intersect(sh_bbox[link], bbox);
            link = sh_link[link];
        }
        barrier();
    }
    // Continue into the inherited stack when the chain ends there.
    if (int(link + stack_size) >= 0) {
        bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
    }
    // At this point, bbox is the reduction of bounding boxes along the tree.
    sh_bbox[tid] = bbox;
    barrier();

    if (!is_push && global_ix < conf.n_clip) {
        // Is this load expensive? If so, it's loaded earlier for in-partition
        // and is in the ClipEl for cross-partition.
        // If not, can probably get rid of it in the stack intermediate buf.
        uint parent_path_ix = load_path_ix(parent);
        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
        // Fix up drawmonoid so path_ix at EndClip matches BeginClip
        memory[drawmonoid_out_base] = parent_path_ix;

        // A pop reports the bbox in effect at its grandparent.
        if (int(grandparent) >= 0) {
            bbox = sh_bbox[grandparent];
        } else if (int(grandparent + stack_size) >= 0) {
            bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
        } else {
            bbox = NO_CLIP;
        }
    }
    store_clip_bbox(global_ix, bbox);
}
|