vello/piet-gpu/shader/draw_leaf.comp
Raph Levien 3b67a4e7c1 New clip implementation
This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack.

This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely.

Fixes #88. Also relevant to #119
2022-02-17 17:13:28 -08:00

175 lines
6.9 KiB
GLSL

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The leaf scan pass for draw tag scan implemented as a tree reduction.
// This stage can be fused with its consumer but is separate now.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define N_ROWS 8
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
#include "scene.h"
#include "tile.h"
#include "drawtag.h"
#include "annotated.h"
#define Monoid DrawMonoid
layout(set = 0, binding = 3) readonly buffer ParentBuf {
Monoid[] parent;
};
shared Monoid sh_scratch[WG_SIZE];
void main() {
Monoid local[N_ROWS];
uint ix = gl_GlobalInvocationID.x * N_ROWS;
ElementRef ref = ElementRef(ix * Element_size);
uint tag_word = Element_tag(ref).tag;
Monoid agg = map_tag(tag_word);
local[0] = agg;
for (uint i = 1; i < N_ROWS; i++) {
tag_word = Element_tag(Element_index(ref, i)).tag;
agg = combine_tag_monoid(agg, map_tag(tag_word));
local[i] = agg;
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_tag_monoid(other, agg);
}
barrier();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
barrier();
Monoid row = tag_monoid_identity();
if (gl_WorkGroupID.x > 0) {
row = parent[gl_WorkGroupID.x - 1];
}
if (gl_LocalInvocationID.x > 0) {
row = combine_tag_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
}
uint out_ix = gl_GlobalInvocationID.x * N_ROWS;
uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 2;
uint clip_out_base = conf.clip_alloc.offset >> 2;
AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + out_ix * Annotated_size);
for (uint i = 0; i < N_ROWS; i++) {
Monoid m = row;
if (i > 0) {
m = combine_tag_monoid(m, local[i - 1]);
}
// m now holds exclusive scan of draw monoid
memory[out_base + i * 2] = m.path_ix;
memory[out_base + i * 2 + 1] = m.clip_ix;
// For compatibility, we'll generate an Annotated object, same as old
// pipeline. However, going forward we'll get rid of that, and have
// later stages read scene + bbox etc.
ElementRef this_ref = Element_index(ref, i);
tag_word = Element_tag(this_ref).tag;
if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage ||
tag_word == Element_BeginClip) {
uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * m.path_ix;
float bbox_l = float(memory[bbox_offset]) - 32768.0;
float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
float bbox_r = float(memory[bbox_offset + 2]) - 32768.0;
float bbox_b = float(memory[bbox_offset + 3]) - 32768.0;
vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
float linewidth = uintBitsToFloat(memory[bbox_offset + 4]);
uint fill_mode = uint(linewidth >= 0.0);
vec4 mat;
vec2 translate;
if (linewidth >= 0.0 || tag_word == Element_FillLinGradient) {
uint trans_ix = memory[bbox_offset + 5];
uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
if (tag_word == Element_FillLinGradient) {
translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
}
}
if (linewidth >= 0.0) {
// TODO: need to deal with anisotropic case
linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z));
}
linewidth = max(linewidth, 0.0);
switch (tag_word) {
case Element_FillColor:
FillColor fill = Element_FillColor_read(this_ref);
AnnoColor anno_fill;
anno_fill.bbox = bbox;
anno_fill.linewidth = linewidth;
anno_fill.rgba_color = fill.rgba_color;
Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
break;
case Element_FillLinGradient:
FillLinGradient lin = Element_FillLinGradient_read(this_ref);
AnnoLinGradient anno_lin;
anno_lin.bbox = bbox;
anno_lin.linewidth = linewidth;
anno_lin.index = lin.index;
vec2 p0 = mat.xy * lin.p0.x + mat.zw * lin.p0.y + translate;
vec2 p1 = mat.xy * lin.p1.x + mat.zw * lin.p1.y + translate;
vec2 dxy = p1 - p0;
float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
float line_x = dxy.x * scale;
float line_y = dxy.y * scale;
anno_lin.line_x = line_x;
anno_lin.line_y = line_y;
anno_lin.line_c = -(p0.x * line_x + p0.y * line_y);
Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin);
break;
case Element_FillImage:
FillImage fill_img = Element_FillImage_read(this_ref);
AnnoImage anno_img;
anno_img.bbox = bbox;
anno_img.linewidth = linewidth;
anno_img.index = fill_img.index;
anno_img.offset = fill_img.offset;
Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
break;
case Element_BeginClip:
AnnoBeginClip anno_begin_clip;
anno_begin_clip.bbox = bbox;
anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke
Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip);
break;
}
} else if (tag_word == Element_EndClip) {
AnnoEndClip anno_end_clip;
// The actual bbox will be reconstructed from clip stream output.
anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9);
Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
}
// Generate clip stream.
if (tag_word == Element_BeginClip || tag_word == Element_EndClip) {
uint path_ix = ~(out_ix + i);
if (tag_word == Element_BeginClip) {
path_ix = m.path_ix;
}
memory[clip_out_base + m.clip_ix] = path_ix;
}
out_ref.offset += Annotated_size;
}
}