vello/piet-gpu/shader/kernel4.comp
Raph Levien a60c2dd3c8 Scratch buffer for clip stack
We keep a small window of the clip stack in registers in the fine
rasterization kernel, and when that window is exceeded, spill to global
memory, so the clip stack can be unbounded.
2020-11-22 18:14:09 -08:00

241 lines
9.4 KiB
Plaintext

// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
// in the per-tile command list to an image.
// Right now, this kernel stores the image in a buffer, but a better
// plan is to use a texture. This is because of limited support.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
#define CHUNK 8
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK)
layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in;
// Same concern that this should be readonly as in kernel 3.
layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
};
layout(set = 0, binding = 1) buffer TileBuf {
uint[] tile;
};
layout(set = 0, binding = 2) buffer ClipScratchBuf {
uint[] clip_scratch;
};
layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
#include "ptcl.h"
#include "tile.h"
#define BLEND_STACK_SIZE 4
// Layout of clip_scratch buffer:
// [0] is the alloc bump offset (in units of 32 bit words, initially 0)
// Starting at 1 is a sequence of frames.
// Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.
#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
shared uint sh_clip_alloc;
// Allocate a scratch buffer for clipping. Unlike offsets in the rest of the code,
// it counts 32-bit words.
uint alloc_clip_buf(uint link) {
if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1;
sh_clip_alloc = alloc;
clip_scratch[alloc + CLIP_LINK_OFFSET] = link;
}
barrier();
return sh_clip_alloc;
}
// Calculate coverage based on backdrop + coverage of each line segment
float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
// Probably better to store as float, but conversion is no doubt cheap.
float area[CHUNK];
for (uint k = 0; k < CHUNK; k++) area[k] = float(backdrop);
TileSegRef tile_seg_ref = TileSegRef(tile_ref);
do {
TileSeg seg = TileSeg_read(tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
vec2 start = seg.start - my_xy;
vec2 end = seg.end - my_xy;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / (end.y - start.y);
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[k] += a * (window.x - window.y);
}
area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
for (uint k = 0; k < CHUNK; k++) {
area[k] = min(abs(area[k]), 1.0);
}
return area;
}
void main() {
uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
vec2 xy = vec2(xy_uint);
vec3 rgb[CHUNK];
float mask[CHUNK];
uint blend_stack[BLEND_STACK_SIZE][CHUNK];
uint blend_spill = 0;
uint blend_sp = 0;
uint clip_tos = 0;
for (uint i = 0; i < CHUNK; i++) {
rgb[i] = vec3(0.5);
mask[i] = 1.0;
}
while (true) {
uint tag = Cmd_tag(cmd_ref);
if (tag == Cmd_End) {
break;
}
switch (tag) {
case Cmd_Circle:
CmdCircle circle = Cmd_Circle_read(cmd_ref);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
for (uint i = 0; i < CHUNK; i++) {
float dy = float(i * CHUNK_DY);
float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
// TODO: sRGB
rgb[i] = mix(rgb[i], fg_rgba.rgb, mask[i] * alpha * fg_rgba.a);
}
break;
case Cmd_Stroke:
// Calculate distance field from all the line segments in this tile.
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
TileSeg seg = TileSeg_read(tile_seg_ref);
vec2 line_vec = seg.end - seg.start;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * alpha * fg_rgba.a);
}
break;
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_ref);
float area[CHUNK];
area = computeArea(xy, fill.backdrop, fill.tile_ref);
fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * area[k] * fg_rgba.a);
}
break;
case Cmd_FillMask:
CmdFillMask fill_mask = Cmd_FillMask_read(cmd_ref);
area = computeArea(xy, fill_mask.backdrop, fill_mask.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
mask[k] = mix(mask[k], fill_mask.mask, area[k]);
}
break;
case Cmd_FillMaskInv:
fill_mask = Cmd_FillMask_read(cmd_ref);
area = computeArea(xy, fill_mask.backdrop, fill_mask.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
mask[k] = mix(mask[k], fill_mask.mask, 1.0 - area[k]);
}
break;
case Cmd_BeginClip:
case Cmd_BeginSolidClip:
uint blend_slot = blend_sp % BLEND_STACK_SIZE;
if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
// spill to scratch buffer
clip_tos = alloc_clip_buf(clip_tos);
uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
}
blend_spill++;
}
if (tag == Cmd_BeginClip) {
CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
}
} else {
CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
float solid_alpha = begin_solid_clip.alpha;
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
}
}
blend_sp++;
break;
case Cmd_EndClip:
CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
if (blend_sp == blend_spill) {
uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
}
clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET];
blend_spill--;
}
blend_sp--;
for (uint k = 0; k < CHUNK; k++) {
vec4 rgba = unpackUnorm4x8(blend_stack[blend_slot][k]);
rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
}
break;
case Cmd_Solid:
CmdSolid solid = Cmd_Solid_read(cmd_ref);
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * fg_rgba.a);
}
break;
case Cmd_SolidMask:
CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_ref);
for (uint k = 0; k < CHUNK; k++) {
mask[k] = solid_mask.mask;
}
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
continue;
}
cmd_ref.offset += Cmd_size;
}
// TODO: sRGB
for (uint i = 0; i < CHUNK; i++) {
imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(rgb[i], 1.0));
}
}