vello/piet-gpu/shader/pathseg.comp

291 lines
10 KiB
GLSL
Raw Normal View History

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Processing of the path stream, after the tag scan.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#include "pathtag.h"
#define N_SEQ 4
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_SEQ)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
#include "tile.h"
#include "pathseg.h"
layout(binding = 3) readonly buffer ParentBuf {
TagMonoid[] parent;
};
struct Monoid {
vec4 bbox;
uint flags;
};
#define FLAG_RESET_BBOX 1
#define FLAG_SET_BBOX 2
Monoid combine_monoid(Monoid a, Monoid b) {
Monoid c;
c.bbox = b.bbox;
// TODO: I think this should be gated on b & SET_BBOX == false also.
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
c.bbox = a.bbox;
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
}
c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
c.flags |= ((a.flags & FLAG_RESET_BBOX) << 1);
return c;
}
Monoid monoid_identity() {
return Monoid(vec4(0.0, 0.0, 0.0, 0.0), 0);
}
// These are not both live at the same time. A very smart shader compiler
// would be able to figure that out, but I suspect many won't.
shared TagMonoid sh_tag[WG_SIZE];
shared Monoid sh_scratch[WG_SIZE];
vec2 read_f32_point(uint ix) {
float x = uintBitsToFloat(scene[ix]);
float y = uintBitsToFloat(scene[ix + 1]);
return vec2(x, y);
}
vec2 read_i16_point(uint ix) {
uint raw = scene[ix];
float x = float(int(raw << 16) >> 16);
float y = float(int(raw) >> 16);
return vec2(x, y);
}
// Note: these are 16 bit, which is adequate, but we could use 32 bits.
// Round down and saturate to minimum integer; add bias
uint round_down(float x) {
return uint(max(0.0, floor(x) + 32768.0));
}
// Round up and saturate to maximum integer; add bias
uint round_up(float x) {
return uint(min(65535.0, ceil(x) + 32768.0));
}
void main() {
Monoid local[N_SEQ];
float linewidth[N_SEQ];
uint save_trans_ix[N_SEQ];
uint ix = gl_GlobalInvocationID.x * N_SEQ;
uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)];
// Scan the tag monoid
TagMonoid local_tm = reduce_tag(tag_word);
sh_tag[gl_LocalInvocationID.x] = local_tm;
2021-12-02 03:34:22 +11:00
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
local_tm = combine_tag_monoid(other, local_tm);
}
barrier();
sh_tag[gl_LocalInvocationID.x] = local_tm;
}
barrier();
// sh_tag is now the partition-wide inclusive scan of the tag monoid.
TagMonoid tm = tag_monoid_identity();
if (gl_WorkGroupID.x > 0) {
tm = parent[gl_WorkGroupID.x - 1];
}
if (gl_LocalInvocationID.x > 0) {
tm = combine_tag_monoid(tm, sh_tag[gl_LocalInvocationID.x - 1]);
}
// tm is now the full exclusive scan of the tag monoid.
// Indices to scene buffer in u32 units.
uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
uint save_path_ix = tm.path_ix;
uint trans_ix = tm.trans_ix;
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + trans_ix * TransformSeg_size);
PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);
for (uint i = 0; i < N_SEQ; i++) {
linewidth[i] = uintBitsToFloat(scene[lw_ix]);
save_trans_ix[i] = trans_ix;
// if N_SEQ > 4, need to load tag_word from local if N_SEQ % 4 == 0
uint tag_byte = tag_word >> (i * 8);
uint seg_type = tag_byte & 3;
if (seg_type != 0) {
// 1 = line, 2 = quad, 3 = cubic
// Unpack path segment from input
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
if ((tag_byte & 8) != 0) {
// 32 bit encoding
p0 = read_f32_point(ps_ix);
p1 = read_f32_point(ps_ix + 2);
if (seg_type >= 2) {
p2 = read_f32_point(ps_ix + 4);
if (seg_type == 3) {
p3 = read_f32_point(ps_ix + 6);
}
}
} else {
// 16 bit encoding
p0 = read_i16_point(ps_ix);
p1 = read_i16_point(ps_ix + 1);
if (seg_type >= 2) {
p2 = read_i16_point(ps_ix + 2);
if (seg_type == 3) {
p3 = read_i16_point(ps_ix + 3);
}
}
}
TransformSeg transform = TransformSeg_read(conf.trans_alloc, trans_ref);
p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate;
p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate;
vec4 bbox = vec4(min(p0, p1), max(p0, p1));
// Degree-raise and compute bbox
if (seg_type >= 2) {
p2 = transform.mat.xy * p2.x + transform.mat.zw * p2.y + transform.translate;
bbox.xy = min(bbox.xy, p2);
bbox.zw = max(bbox.zw, p2);
if (seg_type == 3) {
p3 = transform.mat.xy * p3.x + transform.mat.zw * p3.y + transform.translate;
bbox.xy = min(bbox.xy, p3);
bbox.zw = max(bbox.zw, p3);
} else {
p3 = p2;
p2 = mix(p1, p2, 1.0 / 3.0);
p1 = mix(p1, p0, 1.0 / 3.0);
}
} else {
p3 = p1;
p2 = mix(p3, p0, 1.0 / 3.0);
p1 = mix(p0, p3, 1.0 / 3.0);
}
vec2 stroke = vec2(0.0, 0.0);
if (linewidth[i] >= 0.0) {
// See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
stroke = 0.5 * linewidth[i] * vec2(length(transform.mat.xz), length(transform.mat.yw));
bbox += vec4(-stroke, stroke);
}
local[i].bbox = bbox;
local[i].flags = 0;
// Write path segment to output
PathCubic cubic;
cubic.p0 = p0;
cubic.p1 = p1;
cubic.p2 = p2;
cubic.p3 = p3;
cubic.path_ix = tm.path_ix;
// Not needed, TODO remove from struct
cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i;
cubic.stroke = stroke;
uint fill_mode = uint(linewidth[i] >= 0.0);
PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);
ps_ref.offset += PathSeg_size;
uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1);
uint n_words = n_points + (n_points & (((tag_byte >> 3) & 1) * 15));
ps_ix += n_words;
} else {
local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0);
// These shifts need to be kept in sync with setup.h
uint is_path = (tag_byte >> 4) & 1;
// Relies on the fact that RESET_BBOX == 1
local[i].flags = is_path;
tm.path_ix += is_path;
trans_ix += (tag_byte >> 5) & 1;
trans_ref.offset += ((tag_byte >> 5) & 1) * TransformSeg_size;
lw_ix += (tag_byte >> 6) & 1;
}
}
// Partition-wide monoid scan for bbox monoid
Monoid agg = local[0];
for (uint i = 1; i < N_SEQ; i++) {
// Note: this could be fused with the map above, but probably
// a thin performance gain not worth the complexity.
agg = combine_monoid(agg, local[i]);
local[i] = agg;
}
// local is N_SEQ sub-partition inclusive scan of bbox monoid.
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_monoid(other, agg);
}
barrier();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
// sh_scratch is the partition-wide inclusive scan of the bbox monoid,
// sampled at the end of the N_SEQ sub-partition.
barrier();
uint path_ix = save_path_ix;
uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 6;
// Write bboxes to paths; do atomic min/max if partial
Monoid row = monoid_identity();
if (gl_LocalInvocationID.x > 0) {
row = sh_scratch[gl_LocalInvocationID.x - 1];
}
for (uint i = 0; i < N_SEQ; i++) {
Monoid m = combine_monoid(row, local[i]);
// m is partition-wide inclusive scan of bbox monoid.
bool do_atomic = false;
if (i == N_SEQ - 1 && gl_LocalInvocationID.x == WG_SIZE - 1) {
// last element
do_atomic = true;
}
if ((m.flags & FLAG_RESET_BBOX) != 0) {
memory[bbox_out_ix + 4] = floatBitsToUint(linewidth[i]);
memory[bbox_out_ix + 5] = save_trans_ix[i];
if ((m.flags & FLAG_SET_BBOX) == 0) {
do_atomic = true;
} else {
memory[bbox_out_ix] = round_down(m.bbox.x);
memory[bbox_out_ix + 1] = round_down(m.bbox.y);
memory[bbox_out_ix + 2] = round_up(m.bbox.z);
memory[bbox_out_ix + 3] = round_up(m.bbox.w);
bbox_out_ix += 6;
do_atomic = false;
}
}
if (do_atomic) {
if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) {
// atomic min/max
atomicMin(memory[bbox_out_ix], round_down(m.bbox.x));
atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y));
atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
}
bbox_out_ix += 6;
}
}
}