// Mirror of https://github.com/italicsjenga/vello.git (synced 2025-01-10 20:51:29 +11:00)
// Commit d948126c16: Make max workgroup size 256 and respect LG_WG_FACTOR. Because the
// monoid scans only support a height of 2, this will reduce the maximum scene complexity
// we can render. But it also increases compatibility. Supporting larger scans is a TODO.
// 292 lines, 10 KiB, GLSL
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
// Processing of the path stream, after the tag scan.
|
|
|
|
#version 450
|
|
#extension GL_GOOGLE_include_directive : enable
|
|
|
|
#include "mem.h"
|
|
#include "setup.h"
|
|
#include "pathtag.h"
|
|
|
|
// Number of consecutive path tags handled by one invocation. main() unpacks
// them from a single 32-bit tag word, one byte per tag.
#define N_SEQ 4
// log2 of the workgroup size. LG_WG_FACTOR is not defined in this file —
// presumably it comes from one of the included headers (setup.h?); confirm.
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
// Invocations per workgroup.
#define WG_SIZE (1 << LG_WG_SIZE)
// Path tags covered by one workgroup.
#define PARTITION_SIZE (WG_SIZE * N_SEQ)

// One-dimensional workgroup of WG_SIZE invocations.
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
|
|
|
|
// Read-only configuration block at binding 1. The Config type is declared
// outside this file (presumably setup.h); main() reads stream offsets and
// allocation descriptors from it.
layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};
|
|
|
|
// Read-only scene data at binding 2, addressed in 32-bit words. The path tag,
// path segment and line width streams are all read from this array.
layout(binding = 2) readonly buffer SceneBuf {
    uint scene[];
};
|
|
|
|
#include "tile.h"
|
|
#include "pathseg.h"
|
|
|
|
// Read-only per-partition tag monoids at binding 3. main() reads entry
// (workgroup id - 1) as the prefix covering all earlier partitions, so each
// entry is presumably an inclusive scan over partitions — confirm against
// the pass that writes this buffer.
layout(binding = 3) readonly buffer ParentBuf {
    TagMonoid parent[];
};
|
|
|
|
// Element of the bounding-box scan performed in main().
struct Monoid {
    // Bounding box as (x0, y0, x1, y1): xy holds the minimum corner, zw the
    // maximum corner. combine_monoid() treats a box with z <= x and w <= y as
    // empty.
    vec4 bbox;
    // Bitwise combination of FLAG_RESET_BBOX and FLAG_SET_BBOX.
    uint flags;
};
|
|
|
|
// Set on an element that is a path marker: the bbox accumulated before it
// must not flow into what follows. main() stores the path bit directly into
// the flags word, which relies on this value being exactly 1.
#define FLAG_RESET_BBOX 1
// Set on a combined element when something to its left carried
// FLAG_RESET_BBOX. Must stay equal to FLAG_RESET_BBOX << 1 because
// combine_monoid() derives it with that shift.
#define FLAG_SET_BBOX (FLAG_RESET_BBOX << 1)
|
|
|
|
// Combines two adjacent elements of the bbox scan; `a` is the left operand
// and `b` the right one.
//
// Bounding box of the result:
//   - `b.bbox` by default;
//   - `a.bbox` when `a` carries no reset and `b.bbox` is empty;
//   - the union of both boxes when `a` carries no reset, `b` has no
//     FLAG_SET_BBOX, and `a.bbox` is non-empty.
//
// Flags of the result: all of `b`'s flags, plus `a`'s FLAG_SET_BBOX, plus
// FLAG_SET_BBOX whenever `a` has FLAG_RESET_BBOX.
Monoid combine_monoid(Monoid a, Monoid b) {
    bool a_resets = (a.flags & FLAG_RESET_BBOX) != 0;
    bool b_is_set = (b.flags & FLAG_SET_BBOX) != 0;
    bool a_has_extent = a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y;
    bool b_is_empty = b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y;

    Monoid c;
    c.bbox = b.bbox;
    if (!a_resets) {
        // TODO: the empty-`b` case should probably also require that `b` has
        // no FLAG_SET_BBOX.
        if (b_is_empty) {
            c.bbox = a.bbox;
        } else if (!b_is_set && a_has_extent) {
            c.bbox = vec4(min(a.bbox.xy, b.bbox.xy), max(a.bbox.zw, b.bbox.zw));
        }
    }

    // A reset on the left shows up as FLAG_SET_BBOX (RESET << 1) on the result.
    c.flags = (a.flags & FLAG_SET_BBOX) | b.flags | ((a.flags & FLAG_RESET_BBOX) << 1);
    return c;
}
|
|
|
|
// Returns the neutral element of the bbox scan: an all-zero (empty) bounding
// box with no flags set.
Monoid monoid_identity() {
    Monoid id;
    id.bbox = vec4(0.0);
    id.flags = 0u;
    return id;
}
|
|
|
|
// Workgroup-shared storage for the two scans in main(): sh_tag backs the tag
// monoid scan and sh_scratch the bbox monoid scan. The two are never live at
// the same time, so they could share memory; a very smart shader compiler
// might work that out, but many probably won't.
shared TagMonoid sh_tag[WG_SIZE];
shared Monoid sh_scratch[WG_SIZE];
|
|
|
|
// Reads a point stored as two consecutive 32-bit floats (x, then y) starting
// at word index `ix` of the scene buffer.
vec2 read_f32_point(uint ix) {
    return uintBitsToFloat(uvec2(scene[ix], scene[ix + 1]));
}
|
|
|
|
// Reads a point packed into the single scene word at index `ix` as two signed
// 16-bit integers: x in the low half, y in the high half.
vec2 read_i16_point(uint ix) {
    uint word = scene[ix];
    // Put each half in the top 16 bits, then shift right as a signed value so
    // the sign bit is extended.
    ivec2 halves = ivec2(int(word << 16), int(word)) >> 16;
    return vec2(halves);
}
|
|
|
|
// Note: these are 16 bit, which is adequate, but we could use 32 bits.
|
|
|
|
// Round down, add the 32768 bias, and saturate to the unsigned 16-bit range
// [0, 65535].
//
// The lower bound keeps the float-to-uint conversion away from negative
// values. The upper bound keeps the result inside the 16-bit range that the
// bbox encoding uses, so a coordinate far to the right can no longer produce
// a value above 65535.
uint round_down(float x) {
    float floored = floor(x);
    return uint(min(65535.0, max(0.0, floored + 32768.0)));
}
|
|
|
|
// Round up, add the 32768 bias, and saturate to the unsigned 16-bit range
// [0, 65535].
//
// The lower bound matters for correctness: without it, an input below -32768
// yields a negative float, and converting a negative float to uint is
// undefined in GLSL.
uint round_up(float x) {
    float ceiled = ceil(x);
    return uint(max(0.0, min(65535.0, ceiled + 32768.0)));
}
|
|
|
|
// Applies the affine transform `t` (2x2 matrix in `mat`, offset in
// `translate`) to the point `p`.
vec2 pathseg_transform_point(TransformSeg t, vec2 p) {
    return t.mat.xy * p.x + t.mat.zw * p.y + t.translate;
}

// Entry point. Each invocation processes N_SEQ consecutive path tags:
//   1. scans the tag monoid across the workgroup to find where this
//      invocation's data starts in each input stream;
//   2. decodes every path segment, transforms it, raises it to a cubic and
//      writes it out, recording its bounding box;
//   3. scans the bbox monoid across the workgroup and writes per-path
//      bounding boxes, using atomics where a box may be only partly known.
void main() {
    Monoid local[N_SEQ];
    float linewidth[N_SEQ];
    uint save_trans_ix[N_SEQ];

    uint lid = gl_LocalInvocationID.x;
    uint ix = gl_GlobalInvocationID.x * N_SEQ;

    // One 32-bit word holds the tag bytes for this invocation's elements.
    uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)];

    // Scan the tag monoid: in each round every invocation folds in the value
    // held (1 << level) slots to its left.
    TagMonoid local_tm = reduce_tag(tag_word);
    sh_tag[lid] = local_tm;
    for (uint level = 0; level < LG_WG_SIZE; level++) {
        uint lookback = 1u << level;
        barrier();
        if (lid >= lookback) {
            local_tm = combine_tag_monoid(sh_tag[lid - lookback], local_tm);
        }
        // Everyone must have read before anyone overwrites its slot.
        barrier();
        sh_tag[lid] = local_tm;
    }
    barrier();
    // sh_tag now holds the partition-wide inclusive scan of the tag monoid.

    // Exclusive prefix for this invocation: the total of all earlier
    // partitions followed by the inclusive result of the left neighbour.
    TagMonoid tm = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0) {
        tm = parent[gl_WorkGroupID.x - 1];
    }
    if (lid > 0) {
        tm = combine_tag_monoid(tm, sh_tag[lid - 1]);
    }
    // tm is now the full exclusive scan of the tag monoid.

    // Cursors into the scene buffer, in u32 units.
    uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
    uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
    uint save_path_ix = tm.path_ix;
    uint trans_ix = tm.trans_ix;
    TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + trans_ix * TransformSeg_size);
    PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);

    for (uint i = 0; i < N_SEQ; i++) {
        linewidth[i] = uintBitsToFloat(scene[lw_ix]);
        save_trans_ix[i] = trans_ix;
        // With N_SEQ > 4 the tag word would have to be reloaded every fourth
        // element.
        uint tag_byte = tag_word >> (i * 8);
        uint seg_type = tag_byte & 3;

        if (seg_type == 0) {
            // Not a segment: the tag may start a new path and/or advance the
            // transform and line width streams.
            // These shifts need to be kept in sync with setup.h.
            uint is_path = (tag_byte >> 4) & 1;
            uint is_transform = (tag_byte >> 5) & 1;
            uint is_linewidth = (tag_byte >> 6) & 1;

            local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0);
            // Relies on the fact that FLAG_RESET_BBOX == 1.
            local[i].flags = is_path;
            tm.path_ix += is_path;
            trans_ix += is_transform;
            trans_ref.offset += is_transform * TransformSeg_size;
            lw_ix += is_linewidth;
            continue;
        }

        // seg_type: 1 = line, 2 = quad, 3 = cubic.
        // Unpack the control points; only the points the segment type uses
        // are read.
        vec2 p0;
        vec2 p1;
        vec2 p2;
        vec2 p3;
        if ((tag_byte & 8) != 0) {
            // 32 bit encoding: two words per point.
            p0 = read_f32_point(ps_ix);
            p1 = read_f32_point(ps_ix + 2);
            if (seg_type >= 2) {
                p2 = read_f32_point(ps_ix + 4);
                if (seg_type == 3) {
                    p3 = read_f32_point(ps_ix + 6);
                }
            }
        } else {
            // 16 bit encoding: one word per point.
            p0 = read_i16_point(ps_ix);
            p1 = read_i16_point(ps_ix + 1);
            if (seg_type >= 2) {
                p2 = read_i16_point(ps_ix + 2);
                if (seg_type == 3) {
                    p3 = read_i16_point(ps_ix + 3);
                }
            }
        }

        TransformSeg transform = TransformSeg_read(conf.trans_alloc, trans_ref);
        p0 = pathseg_transform_point(transform, p0);
        p1 = pathseg_transform_point(transform, p1);
        vec4 bbox = vec4(min(p0, p1), max(p0, p1));

        // Degree-raise to a cubic while growing the bbox over the transformed
        // control points.
        if (seg_type == 1) {
            // Line: endpoints stay, inner control points sit at the thirds.
            p3 = p1;
            p2 = mix(p3, p0, 1.0 / 3.0);
            p1 = mix(p0, p3, 1.0 / 3.0);
        } else {
            p2 = pathseg_transform_point(transform, p2);
            bbox.xy = min(bbox.xy, p2);
            bbox.zw = max(bbox.zw, p2);
            if (seg_type == 3) {
                p3 = pathseg_transform_point(transform, p3);
                bbox.xy = min(bbox.xy, p3);
                bbox.zw = max(bbox.zw, p3);
            } else {
                // Quad: p2 becomes the end point, p1 is the quad control point.
                p3 = p2;
                p2 = mix(p1, p2, 1.0 / 3.0);
                p1 = mix(p1, p0, 1.0 / 3.0);
            }
        }

        // A non-negative line width selects stroking; the bbox is widened by
        // the stroke extent along each axis.
        bool has_stroke = linewidth[i] >= 0.0;
        vec2 stroke = vec2(0.0, 0.0);
        if (has_stroke) {
            // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
            stroke = 0.5 * linewidth[i] * vec2(length(transform.mat.xz), length(transform.mat.yw));
            bbox += vec4(-stroke, stroke);
        }
        local[i].bbox = bbox;
        local[i].flags = 0;

        // Write path segment to output.
        PathCubic cubic;
        cubic.p0 = p0;
        cubic.p1 = p1;
        cubic.p2 = p2;
        cubic.p3 = p3;
        cubic.path_ix = tm.path_ix;
        // Not needed, TODO remove from struct.
        cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i;
        cubic.stroke = stroke;
        uint fill_mode = uint(has_stroke);
        PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);
        ps_ref.offset += PathSeg_size;

        // Advance the input cursor. Bit 2 of the tag adds one extra point
        // (NOTE(review): presumably an end-of-subpath marker — confirm against
        // the encoder). The mask is 15 for the 32 bit encoding and 0
        // otherwise, which doubles the word count for 32 bit points.
        uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1);
        uint wide_mask = ((tag_byte >> 3) & 1) * 15;
        ps_ix += n_points + (n_points & wide_mask);
    }

    // Partition-wide monoid scan for the bbox monoid. First a sequential
    // inclusive scan over this invocation's own elements. This could be fused
    // with the loop above, but the gain is probably not worth the complexity.
    Monoid agg = local[0];
    for (uint i = 1; i < N_SEQ; i++) {
        agg = combine_monoid(agg, local[i]);
        local[i] = agg;
    }
    // local is now the inclusive scan within this invocation's N_SEQ elements.

    sh_scratch[lid] = agg;
    for (uint level = 0; level < LG_WG_SIZE; level++) {
        uint lookback = 1u << level;
        barrier();
        if (lid >= lookback) {
            agg = combine_monoid(sh_scratch[lid - lookback], agg);
        }
        barrier();
        sh_scratch[lid] = agg;
    }
    // sh_scratch is the partition-wide inclusive scan of the bbox monoid,
    // sampled at the end of each invocation's N_SEQ elements.
    barrier();

    // Each path's output record is 6 words: four bbox words, the line width
    // bits and the transform index.
    uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + save_path_ix * 6;

    // Write bboxes to paths; do atomic min/max if partial.
    Monoid row = monoid_identity();
    if (lid > 0) {
        row = sh_scratch[lid - 1];
    }
    for (uint i = 0; i < N_SEQ; i++) {
        // m is the partition-wide inclusive scan of the bbox monoid.
        Monoid m = combine_monoid(row, local[i]);
        bool is_last_element = (i == N_SEQ - 1) && (lid == WG_SIZE - 1);
        bool ends_path = (m.flags & FLAG_RESET_BBOX) != 0;
        bool bbox_is_set = (m.flags & FLAG_SET_BBOX) != 0;

        // The very last element of the partition always merges atomically.
        bool do_atomic = is_last_element;
        if (ends_path) {
            memory[bbox_out_ix + 4] = floatBitsToUint(linewidth[i]);
            memory[bbox_out_ix + 5] = save_trans_ix[i];
            // Without FLAG_SET_BBOX the box is merged atomically rather than
            // stored; with it the box is stored directly.
            do_atomic = !bbox_is_set;
            if (bbox_is_set) {
                memory[bbox_out_ix] = round_down(m.bbox.x);
                memory[bbox_out_ix + 1] = round_down(m.bbox.y);
                memory[bbox_out_ix + 2] = round_up(m.bbox.z);
                memory[bbox_out_ix + 3] = round_up(m.bbox.w);
                bbox_out_ix += 6;
            }
        }
        if (do_atomic) {
            // An empty box contributes nothing.
            if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) {
                // atomic min/max
                atomicMin(memory[bbox_out_ix], round_down(m.bbox.x));
                atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y));
                atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
                atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
            }
            bbox_out_ix += 6;
        }
    }
}
|