vello/piet-gpu/shader/clip_reduce.comp
Raph Levien acb3933d94 Variable size encoding of draw objects
This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid`, track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
2022-03-14 16:32:08 -07:00

147 lines
4.1 KiB
GLSL

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The reduce pass for clip stack processing.
// The primary input is a sequence of path ids representing paths to
// push, with a special value of ~0 to represent pop.
// For each path, the bounding box is loaded from the path bbox
// array (path_bbox_alloc), as written by the pathseg stage.
// Output is a stack monoid reduction for the partition. The Bic
// is stored in the BicBuf, and the stack slice in StackBuf.
// Note: for this shader, only pushes are represented in the stack
// monoid reduction output, so we don't have to worry about the
// interpretation of pops.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
// log2 of the workgroup size, i.e. the workgroup has 128 << LG_WG_FACTOR
// invocations. LG_WG_FACTOR is not defined in this file (presumably it
// comes from setup.h -- confirm).
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
// Number of invocations in one workgroup; also the length of the shared
// scratch arrays declared below.
#define WG_SIZE (1 << LG_WG_SIZE)
// Number of clip inputs handled by one workgroup: one input per invocation.
#define PARTITION_SIZE WG_SIZE
layout(local_size_x = WG_SIZE) in;
// Read-only configuration buffer. This shader reads the `offset` of
// conf.clip_alloc, conf.path_bbox_alloc, conf.clip_bic_alloc and
// conf.clip_stack_alloc from it; each offset is shifted right by 2 before
// being used as an index into `memory`. The Config type is not declared in
// this file (presumably setup.h -- confirm).
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
// The intermediate state for clip processing.
// One element is written per push that is still open at the end of the
// partition; store_clip_el serializes it as 5 consecutive words
// (parent_ix followed by the four bbox components as float bits).
struct ClipEl {
// index of parent node
// (main fills this with the local invocation index of the push plus
// gl_WorkGroupID.x * PARTITION_SIZE)
uint parent_ix;
// bounding box
// (main fills this from load_path_bbox, which builds the vector in
// left, top, right, bottom order)
vec4 bbox;
};
// The bicyclic monoid
// An element summarizes a run of push/pop operations. main encodes a single
// pop as Bic(1, 0) and a single push as Bic(0, 1), so after combining:
//   a = number of pops not matched by an earlier push in the run
//   b = number of pushes not matched by a later pop in the run
// store_bic serializes an element as two consecutive words (a, then b).
struct Bic {
uint a;
uint b;
};
// Combine two bicyclic monoid elements, where `x` summarizes the run that
// comes immediately before the run summarized by `y`.
// The unmatched pushes at the end of `x` (x.b) cancel against the unmatched
// pops at the start of `y` (y.a); whatever is left over on either side is
// carried into the result.
Bic bic_combine(Bic x, Bic y) {
    // Number of push/pop pairs that annihilate across the boundary.
    uint cancelled = min(x.b, y.a);
    Bic merged;
    merged.a = x.a + y.a - cancelled;
    merged.b = x.b + y.b - cancelled;
    return merged;
}
// Workgroup-shared scratch, one slot per invocation.
// sh_bic: working storage for the reverse scan in main; after the scan,
// slot i holds the combination of inputs i..WG_SIZE-1.
shared Bic sh_bic[WG_SIZE];
// sh_parent / sh_path_ix: compacted list of the pushes that remain open at
// the end of the partition. Slot k holds the local invocation index of the
// k-th such push and the input word (path id) it carried, respectively.
shared uint sh_parent[WG_SIZE];
shared uint sh_path_ix[WG_SIZE];
// sh_bbox: only referenced inside the `#if 0` section of main.
shared vec4 sh_bbox[WG_SIZE];
// Fetch the bounding box of path `path_ix` from the path bbox array
// (conf.path_bbox_alloc, as written by pathseg).
// Records are 6 words apart; only the first four words of a record are read
// here. Each word is converted to float and the 32768 bias is removed.
// Returns the box as (left, top, right, bottom).
vec4 load_path_bbox(uint path_ix) {
    uint first = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    vec4 biased = vec4(
        float(memory[first]),
        float(memory[first + 1]),
        float(memory[first + 2]),
        float(memory[first + 3]));
    return biased - 32768.0;
}
// Intersect two boxes whose xy components are the minimum corner and whose
// zw components are the maximum corner: the result takes the larger of the
// two minimum corners and the smaller of the two maximum corners.
// No check is made for an empty result (min corner beyond max corner).
vec4 bbox_intersect(vec4 a, vec4 b) {
    vec2 min_corner = max(a.xy, b.xy);
    vec2 max_corner = min(a.zw, b.zw);
    return vec4(min_corner, max_corner);
}
// Write `bic` into slot `ix` of the Bic output array (conf.clip_bic_alloc).
// Each slot is two consecutive words: a, then b.
void store_bic(uint ix, Bic bic) {
    uint slot = 2 * ix + (conf.clip_bic_alloc.offset >> 2);
    memory[slot] = bic.a;
    memory[slot + 1] = bic.b;
}
// Write `el` into slot `ix` of the clip stack array (conf.clip_stack_alloc).
// Each slot is five consecutive words: parent_ix, then the raw bit patterns
// of bbox.x, bbox.y, bbox.z and bbox.w.
void store_clip_el(uint ix, ClipEl el) {
    uint slot = 5 * ix + (conf.clip_stack_alloc.offset >> 2);
    // Reinterpret all four floats at once; no numeric conversion takes place.
    uvec4 bbox_bits = floatBitsToUint(el.bbox);
    memory[slot] = el.parent_ix;
    memory[slot + 1] = bbox_bits.x;
    memory[slot + 2] = bbox_bits.y;
    memory[slot + 3] = bbox_bits.z;
    memory[slot + 4] = bbox_bits.w;
}
// Entry point. Each invocation handles one word of the clip input stream;
// each workgroup handles one partition of PARTITION_SIZE words.
// Outputs per workgroup:
//   - one Bic (the reduction of the whole partition), via store_bic
//   - one ClipEl per push still open at the end of the partition, via
//     store_clip_el
void main() {
uint th = gl_LocalInvocationID.x;
// One input word per invocation. A value that is non-negative when viewed
// as a signed int is a push of that path id; otherwise (e.g. ~0) it is a pop.
uint inp = memory[(conf.clip_alloc.offset >> 2) + gl_GlobalInvocationID.x];
bool is_push = int(inp) >= 0;
// reverse scan of bicyclic semigroup
// Leaf values: push -> Bic(0, 1), pop -> Bic(1, 0).
Bic bic = Bic(1 - uint(is_push), uint(is_push));
sh_bic[gl_LocalInvocationID.x] = bic;
// Log-step scan: in round i every invocation folds in the value held
// 2^i slots to its right, so after the last round invocation `th` holds
// the combination of inputs th..WG_SIZE-1.
for (uint i = 0; i < LG_WG_SIZE; i++) {
// Make the previous round's writes to sh_bic visible before reading.
barrier();
if (th + (1u << i) < WG_SIZE) {
Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)];
bic = bic_combine(bic, other);
}
// Ensure every invocation has finished reading before any slot is
// overwritten.
barrier();
sh_bic[th] = bic;
}
// Invocation 0 now holds the reduction of the whole partition.
if (th == 0) {
store_bic(gl_WorkGroupID.x, bic);
}
barrier();
// Number of pushes in this partition that no pop in the partition closes.
uint size = sh_bic[0].b;
// Summary of everything strictly after this invocation's input; the last
// invocation has nothing after it and keeps the identity Bic(0, 0).
bic = Bic(0, 0);
if (th + 1 < WG_SIZE) {
bic = sh_bic[th + 1];
}
// A push stays open iff no unmatched pop follows it (bic.a == 0).
// bic.b open pushes follow this one, so it is the (size - bic.b - 1)-th
// open push counting from the start; surviving pushes are thus compacted
// into sh_parent / sh_path_ix in their original order.
if (is_push && bic.a == 0) {
uint local_ix = size - bic.b - 1;
sh_parent[local_ix] = th;
sh_path_ix[local_ix] = inp;
}
barrier();
// Do forward scan of bounding box intersection
// (The scan itself is compiled out below; only the per-path bbox load is
// active. Invocations with th >= size leave bbox and path_ix unset and
// never use them.)
vec4 bbox;
uint path_ix;
if (th < size) {
path_ix = sh_path_ix[th];
bbox = load_path_bbox(path_ix);
}
// Not necessary if depth is bounded by wg size
#if 0
for (uint i = 0; i < LG_WG_SIZE; i++) {
// We gate so we never access uninit data, but it might
// be more efficient to avoid the conditionals.
if (th < size) {
sh_bbox[th] = bbox;
}
barrier();
if (th < size && th >= (1u << i)) {
bbox = bbox_intersect(sh_bbox[th - (1u << i)], bbox);
}
barrier();
}
#endif
// The first `size` invocations each emit one stack element. parent_ix is
// the push's local invocation index rebased by this workgroup's partition
// start; the element is written at this invocation's global index.
if (th < size) {
uint parent_ix = sh_parent[th] + gl_WorkGroupID.x * PARTITION_SIZE;
ClipEl el = ClipEl(parent_ix, bbox);
store_clip_el(gl_GlobalInvocationID.x, el);
}
}