vello/piet-gpu/shader/clip_leaf.comp
Raph Levien acb3933d94 Variable size encoding of draw objects
This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
2022-03-14 16:32:08 -07:00

286 lines
8.5 KiB
GLSL

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The second dispatch of clip stack processing.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
// The workgroup has 2^LG_WG_SIZE threads. LG_WG_FACTOR is not defined in
// this file (presumably it comes from one of the includes -- confirm).
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
// A partition is the slice of the clip stream handled by one workgroup,
// one element per thread.
#define PARTITION_SIZE WG_SIZE
layout(local_size_x = WG_SIZE) in;
// Read-only configuration buffer. This shader reads the allocation offsets
// path_bbox_alloc, clip_bic_alloc, clip_stack_alloc, clip_alloc,
// clip_bbox_alloc and drawmonoid_alloc from it, plus the n_clip count.
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
// Some of this is cut'n'paste duplication with the reduce pass, and
// arguably should be moved to a common .h file.
// Clip stack element, followed by the bicyclic monoid (Bic) and its
// combine operation.
// One entry of the clip stack, as read back by load_clip_el().
struct ClipEl {
// index of the parent node
uint parent_ix;
// bounding box, packed as (x0, y0, x1, y1)
vec4 bbox;
};
// Element of the bicyclic monoid. main() encodes a push as Bic(0, 1) and
// a pop as Bic(1, 0), so after combining a range `a` counts the pops left
// unmatched and `b` counts the pushes left unmatched.
struct Bic {
uint a;
uint b;
};
// Combine two bicyclic-monoid elements, with x to the left of y.
// min(x.b, y.a) of x's `b` count cancels against y's `a` count; whatever
// remains on either side is carried into the result.
Bic bic_combine(Bic x, Bic y) {
    uint cancelled = min(x.b, y.a);
    uint a_total = x.a + y.a - cancelled;
    uint b_total = x.b + y.b - cancelled;
    return Bic(a_total, b_total);
}
// Load the bounding box of path `path_ix` from the path bbox buffer (as
// written by pathseg). Each path record is 6 words; the first four are
// left, top, right and bottom, stored as unsigned integers biased by 32768.
vec4 load_path_bbox(uint path_ix) {
    uint rec = 6 * path_ix + (conf.path_bbox_alloc.offset >> 2);
    vec4 biased = vec4(
        float(memory[rec]),
        float(memory[rec + 1]),
        float(memory[rec + 2]),
        float(memory[rec + 3]));
    // Remove the bias from all four components at once.
    return biased - 32768.0;
}
// Intersection of two boxes packed as (x0, y0, x1, y1): the component-wise
// max of the xy corners and the component-wise min of the zw corners.
// The result is not clamped, so disjoint inputs give an inverted box.
vec4 bbox_intersect(vec4 a, vec4 b) {
    vec2 lo = max(a.xy, b.xy);
    vec2 hi = min(a.zw, b.zw);
    return vec4(lo, hi);
}
// Scratch for Bic values: first used for the per-thread scan at the start
// of main(), then for the binary reduction tree that search_link() walks.
shared Bic sh_bic[WG_SIZE * 2 - 2];
// parent_ix of the stack entries materialized from earlier partitions.
shared uint sh_stack[PARTITION_SIZE];
// Running intersection (forward scan) of the bboxes of those stack entries.
shared vec4 sh_stack_bbox[PARTITION_SIZE];
// Per-thread parent link, as returned by search_link().
shared uint sh_link[PARTITION_SIZE];
// Per-thread bbox, used during the scan along parent links.
shared vec4 sh_bbox[PARTITION_SIZE];
// This is adapted directly from the stack monoid impl.
// Walks the binary tree of Bic reductions in sh_bic leftwards from this
// thread's position. On entry `bic` is the reduction accumulated so far
// (main() passes Bic(0, 0)); on return it holds the reduction over the
// range of elements that was consumed.
// The uint return value is meant to be read as a signed int: if >= 0 it
// is an index within the partition, otherwise it is a reference to the
// stack (the bit pattern of -1 - bic.a).
uint search_link(inout Bic bic) {
uint ix = gl_LocalInvocationID.x;
uint j = 0;
// Upward phase: climb tree levels, taking ever larger blocks to the left
// for as long as the combined reduction keeps b == 0.
while (j < LG_WG_SIZE) {
// Offset of level j within sh_bic (level 0 starts at index 0).
uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
if (((ix >> j) & 1) != 0) {
// The level-j node immediately to the left of the current range.
Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
if (test.b > 0) {
// This block contains an unmatched push; refine within it below.
break;
}
bic = test;
ix -= 1u << j;
}
j++;
}
// Downward phase: descend the levels again, accepting a smaller block
// only when the combined reduction still has b == 0.
if (ix > 0) {
while (j > 0) {
j--;
uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
if (test.b == 0) {
bic = test;
ix -= 1u << j;
}
}
}
// ix is the smallest value such that reduce(ix..th).b == 0
// (the range is half-open: element th itself is not included).
if (ix > 0) {
// The element just before that range, as an in-partition index.
return ix - 1;
} else {
// The whole prefix was consumed: refer to the stack instead, skipping
// bic.a entries for the unmatched pops seen on the way.
return ~0u - bic.a;
}
}
// Read entry `ix` of the clip_bic buffer: two consecutive words holding
// the `a` and `b` fields in that order.
Bic load_bic(uint ix) {
    uint word = 2 * ix + (conf.clip_bic_alloc.offset >> 2);
    uint a = memory[word];
    uint b = memory[word + 1];
    return Bic(a, b);
}
// Read entry `ix` of the clip stack buffer. Each entry is 5 words: the
// parent index followed by the bbox (x0, y0, x1, y1) stored as float bits.
ClipEl load_clip_el(uint ix) {
    uint rec = 5 * ix + (conf.clip_stack_alloc.offset >> 2);
    uvec4 raw_bbox = uvec4(
        memory[rec + 1],
        memory[rec + 2],
        memory[rec + 3],
        memory[rec + 4]);
    // Reinterpret all four words as floats in one go.
    return ClipEl(memory[rec], uintBitsToFloat(raw_bbox));
}
// Read element `ix` of the clip input stream, or a stand-in value when
// `ix` lies at or beyond conf.n_clip.
uint load_path_ix(uint ix) {
    // This is one approach to a partial final block. Another would be
    // to do a memset to the padding in the command queue.
    if (ix >= conf.n_clip) {
        // Out of range: report an EndClip tag, since EndClip tags don't
        // implicate further loads.
        return 0x80000000;
    }
    return memory[(conf.clip_alloc.offset >> 2) + ix];
}
// Write `bbox` to entry `ix` of the clip bbox buffer as four consecutive
// words holding the raw float bits of x, y, z and w.
void store_clip_bbox(uint ix, vec4 bbox) {
    uint dst = 4 * ix + (conf.clip_bbox_alloc.offset >> 2);
    uvec4 bits = floatBitsToUint(bbox);
    memory[dst] = bits.x;
    memory[dst + 1] = bits.y;
    memory[dst + 2] = bits.z;
    memory[dst + 3] = bits.w;
}
// Entry point: one invocation per element of the clip stream. For each
// element it computes the intersection of bounding boxes along its chain
// of enclosing clips and stores it to the clip bbox buffer. For EndClip
// elements it also patches the drawmonoid with the path_ix of the matching
// BeginClip and stores the grandparent's bbox instead.
void main() {
// materialize stack up to the start of this partition. This
// is based on the pure stack monoid, but with two additions.
// First, (this only matters if the stack goes deeper than the
// partition size, which might be unlikely in practice), the
// topmost stack element from each partition is picked, then an
// exclusive scan of those. Also note that if this is skipped,
// a scan is not needed in the reduce stage.
// Second, after the stream compaction, do a scan of the retrieved
// bbox values.
uint th = gl_LocalInvocationID.x;
Bic bic = Bic(0, 0);
// Thread th loads the Bic of partition th, but only for partitions that
// precede this workgroup's partition; the rest stay at the identity.
if (th < gl_WorkGroupID.x) {
bic = load_bic(th);
}
sh_bic[th] = bic;
// Reverse (suffix) scan: afterwards sh_bic[k] is the combination of the
// Bic values of threads k..WG_SIZE-1.
for (uint i = 0; i < LG_WG_SIZE; i++) {
// First barrier: the previous round's writes are visible before reading.
barrier();
if (th + (1u << i) < WG_SIZE) {
Bic other = sh_bic[th + (1u << i)];
bic = bic_combine(bic, other);
}
// Second barrier: all reads are done before anyone overwrites its slot.
barrier();
sh_bic[th] = bic;
}
barrier();
// Number of pushes still open at the start of this partition.
uint stack_size = sh_bic[0].b;
// TODO: do bbox scan here (to unlock greater stack depth)
// binary search in stack
// sp is the stack slot this thread materializes; thread PARTITION_SIZE - 1
// gets slot 0, thread 0 gets slot PARTITION_SIZE - 1.
uint sp = PARTITION_SIZE - 1 - th;
uint ix = 0;
for (uint i = 0; i < LG_WG_SIZE; i++) {
uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
if (sp < sh_bic[probe].b) {
ix = probe;
}
}
// ix is largest value such that sp < sh_bic[ix].b (if any)
uint b = sh_bic[ix].b;
// Start from an effectively unbounded box; it is the value kept when
// this thread's slot does not exist (sp >= b).
vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
if (sp < b) {
// maybe store the index here for future use?
// Entry (b - sp - 1) within partition ix's section of the clip stack buffer.
ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
sh_stack[th] = el.parent_ix;
bbox = el.bbox;
// other element values here?
}
// forward scan of bbox values of prefix stack
// (inclusive scan by intersection over increasing thread index)
for (uint i = 0; i < LG_WG_SIZE; i++) {
sh_stack_bbox[th] = bbox;
barrier();
if (th >= (1u << i)) {
bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
}
barrier();
}
sh_stack_bbox[th] = bbox;
// Read input and compute bicyclic semigroup binary tree
uint inp = load_path_ix(gl_GlobalInvocationID.x);
// A value with the high bit clear is a push carrying a path index;
// a value with the high bit set is a pop.
bool is_push = int(inp) >= 0;
// push -> Bic(0, 1), pop -> Bic(1, 0)
bic = Bic(1 - uint(is_push), uint(is_push));
sh_bic[th] = bic;
if (is_push) {
bbox = load_path_bbox(inp);
} else {
bbox = vec4(-1e9, -1e9, 1e9, 1e9);
}
// Build the reduction tree level by level above the leaves stored in
// sh_bic[0..WG_SIZE). Each level is half the size of the one below it and
// is stored at outbase; search_link() computes the same level offsets.
uint inbase = 0;
for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
barrier();
if (th < (1u << (LG_WG_SIZE - 1 - i))) {
sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
}
inbase = outbase;
}
barrier();
// Search for predecessor node
bic = Bic(0, 0);
uint link = search_link(bic);
// we use N_SEQ > 1 convention here:
// link >= 0 is index within partition
// link < 0 is reference to stack
// (link is a uint; the sign tests below reinterpret it as int)
// We want grandparent bbox for pop nodes, so follow those links.
sh_link[th] = link;
barrier();
uint grandparent;
if (int(link) >= 0) {
grandparent = sh_link[link];
} else {
// Already a stack reference: the grandparent is one entry deeper.
grandparent = link - 1;
}
// Resolve parent
uint parent;
if (int(link) >= 0) {
// In-partition link: convert to a global index into the clip stream.
parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
} else if (int(link + stack_size) >= 0) {
// Stack reference within the materialized stack. As a signed value link
// is -1 - k, so this reads the slot of the thread whose sp == k.
parent = sh_stack[PARTITION_SIZE + link];
} else {
// Reference past the bottom of the stack: no parent.
parent = ~0u;
}
// bbox scan along parent links
// Each round intersects with the linked node's bbox and then adopts that
// node's link, so the distance covered doubles every round.
for (uint i = 0; i < LG_WG_SIZE; i++) {
// sh_link was already stored for first iteration
if (i != 0) {
sh_link[th] = link;
}
sh_bbox[th] = bbox;
barrier();
if (int(link) >= 0) {
bbox = bbox_intersect(sh_bbox[link], bbox);
link = sh_link[link];
}
barrier();
}
// If the chain ends in a stack reference within the materialized stack,
// fold in the scanned bbox of that stack entry as well.
if (int(link + stack_size) >= 0) {
bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
}
// At this point, bbox is the reduction of bounding boxes along the tree.
sh_bbox[th] = bbox;
barrier();
uint path_ix = inp;
// Only real pop elements are fixed up; padding past n_clip is skipped.
if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
// Is this load expensive? If so, it's loaded earlier for in-partition
// and is in the ClipEl for cross-partition.
// If not, can probably get rid of it in the stack intermediate buf.
path_ix = load_path_ix(parent);
// NOTE(review): ~inp presumably recovers the draw object index encoded in
// the EndClip value, with 4 words per drawmonoid entry -- confirm against
// the encoder.
uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
// Fix up drawmonoid so path_ix at EndClip matches BeginClip
memory[drawmonoid_out_base] = path_ix;
// A pop takes the bbox of its grandparent, using the same three-way
// resolution as for the parent above.
if (int(grandparent) >= 0) {
bbox = sh_bbox[grandparent];
} else if (int(grandparent + stack_size) >= 0) {
bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
} else {
bbox = vec4(-1e9, -1e9, 1e9, 1e9);
}
}
store_clip_bbox(gl_GlobalInvocationID.x, bbox);
}