vello/piet-gpu/shader/pathseg.comp
Raph Levien 178761dcb3 Path stream processing
This patch contains the core of the path stream processing, though some
integration bits are missing. The core logic is tested, though
combinations of path types, transforms, and line widths are not (yet).

Progress towards #119
2021-12-01 07:33:24 -08:00

285 lines
9.9 KiB
GLSL

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Processing of the path stream, after the tag scan.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#include "pathtag.h"
#define N_SEQ 4
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_SEQ)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
#include "tile.h"
#include "pathseg.h"
layout(binding = 3) readonly buffer ParentBuf {
TagMonoid[] parent;
};
struct Monoid {
vec4 bbox;
uint flags;
};
#define FLAG_RESET_BBOX 1
#define FLAG_SET_BBOX 2
Monoid combine_monoid(Monoid a, Monoid b) {
Monoid c;
c.bbox = b.bbox;
// TODO: I think this should be gated on b & SET_BBOX == false also.
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
c.bbox = a.bbox;
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
{
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
}
c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
c.flags |= ((a.flags & FLAG_RESET_BBOX) << 1);
return c;
}
Monoid monoid_identity() {
return Monoid(vec4(0.0, 0.0, 0.0, 0.0), 0);
}
// These are not both live at the same time. A very smart shader compiler
// would be able to figure that out, but I suspect many won't.
shared TagMonoid sh_tag[WG_SIZE];
shared Monoid sh_scratch[WG_SIZE];
vec2 read_f32_point(uint ix) {
float x = uintBitsToFloat(scene[ix]);
float y = uintBitsToFloat(scene[ix + 1]);
return vec2(x, y);
}
vec2 read_i16_point(uint ix) {
uint raw = scene[ix];
float x = float(int(raw << 16) >> 16);
float y = float(int(raw) >> 16);
return vec2(x, y);
}
// Note: these are 16 bit, which is adequate, but we could use 32 bits.
// Round down and saturate to minimum integer; add bias
uint round_down(float x) {
return uint(max(0.0, floor(x) + 32768.0));
}
// Round up and saturate to maximum integer; add bias
uint round_up(float x) {
return uint(min(65535.0, ceil(x) + 32768.0));
}
void main() {
Monoid local[N_SEQ];
uint ix = gl_GlobalInvocationID.x * N_SEQ;
uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)];
// Scan the tag monoid
TagMonoid local_tm = reduce_tag(tag_word);
sh_tag[gl_LocalInvocationID.x] = local_tm;
for (uint i; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
local_tm = combine_tag_monoid(other, local_tm);
}
barrier();
sh_tag[gl_LocalInvocationID.x] = local_tm;
}
barrier();
// sh_tag is now the partition-wide inclusive scan of the tag monoid.
TagMonoid tm = tag_monoid_identity();
if (gl_WorkGroupID.x > 0) {
tm = parent[gl_WorkGroupID.x - 1];
}
if (gl_LocalInvocationID.x > 0) {
tm = combine_tag_monoid(tm, sh_tag[gl_LocalInvocationID.x - 1]);
}
// tm is now the full exclusive scan of the tag monoid.
// Indices to scene buffer in u32 units.
uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
uint save_path_ix = tm.path_ix;
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + tm.trans_ix * TransformSeg_size);
PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);
for (uint i = 0; i < N_SEQ; i++) {
// if N_SEQ > 4, need to load tag_word from local if N_SEQ % 4 == 0
uint tag_byte = tag_word >> (i * 8);
uint seg_type = tag_byte & 3;
if (seg_type != 0) {
// 1 = line, 2 = quad, 3 = cubic
// Unpack path segment from input
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
if ((tag_byte & 8) != 0) {
// 32 bit encoding
p0 = read_f32_point(ps_ix);
p1 = read_f32_point(ps_ix + 2);
if (seg_type >= 2) {
p2 = read_f32_point(ps_ix + 4);
if (seg_type == 3) {
p3 = read_f32_point(ps_ix + 6);
}
}
} else {
// 16 bit encoding
p0 = read_i16_point(ps_ix);
p1 = read_i16_point(ps_ix + 1);
if (seg_type >= 2) {
p2 = read_i16_point(ps_ix + 2);
if (seg_type == 3) {
p3 = read_i16_point(ps_ix + 3);
}
}
}
float linewidth = uintBitsToFloat(scene[lw_ix]);
TransformSeg transform = TransformSeg_read(conf.trans_alloc, trans_ref);
p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate;
p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate;
vec4 bbox = vec4(min(p0, p1), max(p0, p1));
// Degree-raise and compute bbox
if (seg_type >= 2) {
p2 = transform.mat.xy * p2.x + transform.mat.zw * p2.y + transform.translate;
bbox.xy = min(bbox.xy, p2);
bbox.zw = max(bbox.zw, p2);
if (seg_type == 3) {
p3 = transform.mat.xy * p3.x + transform.mat.zw * p3.y + transform.translate;
bbox.xy = min(bbox.xy, p3);
bbox.zw = max(bbox.zw, p3);
} else {
p3 = p2;
p2 = mix(p1, p2, 1.0 / 3.0);
p1 = mix(p1, p0, 1.0 / 3.0);
}
} else {
p3 = p1;
p2 = mix(p3, p0, 1.0 / 3.0);
p1 = mix(p0, p3, 1.0 / 3.0);
}
vec2 stroke = vec2(0.0, 0.0);
if (linewidth >= 0.0) {
// See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
stroke = 0.5 * linewidth * vec2(length(transform.mat.xz), length(transform.mat.yw));
bbox += vec4(-stroke, stroke);
}
local[i].bbox = bbox;
local[i].flags = 0;
// Write path segment to output
PathCubic cubic;
cubic.p0 = p0;
cubic.p1 = p1;
cubic.p2 = p2;
cubic.p3 = p3;
cubic.path_ix = tm.path_ix;
// Not needed, TODO remove from struct
cubic.trans_ix = gl_GlobalInvocationID.x * 4 + i;
cubic.stroke = stroke;
uint fill_mode = uint(linewidth >= 0.0);
PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);
ps_ref.offset += PathSeg_size;
uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1);
uint n_words = n_points + (n_points & (((tag_byte >> 3) & 1) * 15));
ps_ix += n_words;
} else {
local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0);
// These shifts need to be kept in sync with setup.h
uint is_path = (tag_byte >> 4) & 1;
// Relies on the fact that RESET_BBOX == 1
local[i].flags = is_path;
tm.path_ix += is_path;
trans_ref.offset += ((tag_byte >> 5) & 1) * TransformSeg_size;
lw_ix += (tag_byte >> 6) & 1;
}
}
// Partition-wide monoid scan for bbox monoid
Monoid agg = local[0];
for (uint i = 1; i < N_SEQ; i++) {
// Note: this could be fused with the map above, but probably
// a thin performance gain not worth the complexity.
agg = combine_monoid(agg, local[i]);
local[i] = agg;
}
// local is N_SEQ sub-partition inclusive scan of bbox monoid.
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_monoid(other, agg);
}
barrier();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
// sh_scratch is the partition-wide inclusive scan of the bbox monoid,
// sampled at the end of the N_SEQ sub-partition.
barrier();
uint path_ix = save_path_ix;
uint bbox_out_ix = (conf.bbox_alloc.offset >> 2) + path_ix * 4;
// Write bboxes to paths; do atomic min/max if partial
Monoid row = monoid_identity();
if (gl_LocalInvocationID.x > 0) {
row = sh_scratch[gl_LocalInvocationID.x - 1];
}
for (uint i = 0; i < N_SEQ; i++) {
Monoid m = combine_monoid(row, local[i]);
// m is partition-wide inclusive scan of bbox monoid.
bool do_atomic = false;
if (i == N_SEQ - 1 && gl_LocalInvocationID.x == WG_SIZE - 1) {
// last element
do_atomic = true;
}
if ((m.flags & FLAG_RESET_BBOX) != 0) {
if ((m.flags & FLAG_SET_BBOX) == 0) {
do_atomic = true;
} else {
memory[bbox_out_ix] = round_down(m.bbox.x);
memory[bbox_out_ix + 1] = round_down(m.bbox.y);
memory[bbox_out_ix + 2] = round_up(m.bbox.z);
memory[bbox_out_ix + 3] = round_up(m.bbox.w);
bbox_out_ix += 4;
do_atomic = false;
}
}
if (do_atomic) {
if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) {
// atomic min/max
atomicMin(memory[bbox_out_ix], round_down(m.bbox.x));
atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y));
atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
}
bbox_out_ix += 4;
}
}
}