vello/piet-gpu/shader/kernel4.comp
Raph Levien d1b9821fa8 Query extensions at runtime
Don't run extensions unless they're available. This includes querying
for descriptor indexing, and running one of two versions of kernel4
depending on whether it's enabled.

Part of the support needed for #78
2021-04-02 19:58:48 -07:00

283 lines
10 KiB
Plaintext

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
// in the per-tile command list to an image.
// Right now, this kernel stores the image in a buffer, but a better
// plan is to use a texture. This is because of limited support.
#version 450
#extension GL_GOOGLE_include_directive : enable
#ifdef ENABLE_IMAGE_INDICES
#extension GL_EXT_nonuniform_qualifier : enable
#endif
#include "mem.h"
#include "setup.h"
#define CHUNK_X 2
#define CHUNK_Y 4
#define CHUNK CHUNK_X * CHUNK_Y
#define CHUNK_DX (TILE_WIDTH_PX / CHUNK_X)
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
#ifdef ENABLE_IMAGE_INDICES
layout(rgba8, set = 0, binding = 3) uniform readonly image2D images[];
#else
layout(rgba8, set = 0, binding = 3) uniform readonly image2D images[1];
#endif
#include "ptcl.h"
#include "tile.h"
#define BLEND_STACK_SIZE 4
// Layout of a clip scratch frame:
// Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.
// Link offset and frame size in 32-bit words.
#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
shared MallocResult sh_clip_alloc;
// Allocate a scratch buffer for clipping.
MallocResult alloc_clip_buf(uint link) {
if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
MallocResult m = malloc(CLIP_BUF_SIZE * 4);
if (!m.failed) {
write_mem(m.alloc, (m.alloc.offset >> 2) + CLIP_LINK_OFFSET, link);
}
sh_clip_alloc = m;
}
barrier();
return sh_clip_alloc;
}
vec3 tosRGB(vec3 rgb) {
bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
vec3 below = vec3(12.92)*rgb;
vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055);
return mix(below, above, cutoff);
}
vec3 fromsRGB(vec3 srgb) {
// Formula from EXT_sRGB.
bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045));
vec3 below = srgb/vec3(12.92);
vec3 above = pow((srgb + vec3(0.055))/vec3(1.055), vec3(2.4));
return mix(below, above, cutoff);
}
// unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color
// space.
vec4 unpacksRGB(uint srgba) {
vec4 color = unpackUnorm4x8(srgba).wzyx;
return vec4(fromsRGB(color.rgb), color.a);
}
// packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent.
uint packsRGB(vec4 rgba) {
rgba = vec4(tosRGB(rgba.rgb), rgba.a);
return packUnorm4x8(rgba.wzyx);
}
uvec2 chunk_offset(uint i) {
return uvec2(i % CHUNK_X * CHUNK_DX, i / CHUNK_X * CHUNK_DY);
}
vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) {
vec4 rgba[CHUNK];
for (uint i = 0; i < CHUNK; i++) {
ivec2 uv = ivec2(xy + chunk_offset(i)) + cmd_img.offset;
#ifdef ENABLE_IMAGE_INDICES
vec4 fg_rgba = imageLoad(images[cmd_img.index], uv);
#else
vec4 fg_rgba = imageLoad(images[0], uv);
#endif
fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
rgba[i] = fg_rgba;
}
return rgba;
}
void main() {
if (mem_error != NO_ERROR) {
return;
}
uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
vec2 xy = vec2(xy_uint);
vec3 rgb[CHUNK];
uint blend_stack[BLEND_STACK_SIZE][CHUNK];
uint blend_spill = 0;
uint blend_sp = 0;
Alloc clip_tos = new_alloc(0, 0);
for (uint i = 0; i < CHUNK; i++) {
rgb[i] = vec3(0.5);
#ifdef ENABLE_IMAGE_INDICES
if (xy_uint.x < 1024 && xy_uint.y < 1024) {
rgb[i] = imageLoad(images[gl_WorkGroupID.x / 64], ivec2(xy_uint + chunk_offset(i))/4).rgb;
}
#else
if (xy_uint.x < 1024 && xy_uint.y < 1024) {
rgb[i] = imageLoad(images[0], ivec2(xy_uint + chunk_offset(i))/4).rgb;
}
#endif
}
float area[CHUNK];
while (true) {
uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
if (tag == Cmd_End) {
break;
}
switch (tag) {
case Cmd_Stroke:
// Calculate distance field from all the line segments in this tile.
CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
vec2 line_vec = seg.vector;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
dpos += vec2(chunk_offset(k));
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
for (uint k = 0; k < CHUNK; k++) {
area[k] = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
}
cmd_ref.offset += 4 + CmdStroke_size;
break;
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
tile_seg_ref = TileSegRef(fill.tile_ref);
// Calculate coverage based on backdrop + coverage of each line segment
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy;
vec2 end = start + seg.vector;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / seg.vector.y;
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[k] += a * (window.x - window.y);
}
area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
for (uint k = 0; k < CHUNK; k++) {
area[k] = min(abs(area[k]), 1.0);
}
cmd_ref.offset += 4 + CmdFill_size;
break;
case Cmd_Solid:
for (uint k = 0; k < CHUNK; k++) {
area[k] = 1.0;
}
cmd_ref.offset += 4;
break;
case Cmd_Alpha:
CmdAlpha alpha = Cmd_Alpha_read(cmd_alloc, cmd_ref);
for (uint k = 0; k < CHUNK; k++) {
area[k] = alpha.alpha;
}
cmd_ref.offset += 4 + CmdAlpha_size;
break;
case Cmd_Color:
CmdColor color = Cmd_Color_read(cmd_alloc, cmd_ref);
vec4 fg = unpacksRGB(color.rgba_color);
for (uint k = 0; k < CHUNK; k++) {
vec4 fg_k = fg * area[k];
rgb[k] = rgb[k] * (1.0 - fg_k.a) + fg_k.rgb;
}
cmd_ref.offset += 4 + CmdColor_size;
break;
case Cmd_Image:
CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
for (uint k = 0; k < CHUNK; k++) {
vec4 fg_k = img[k] * area[k];
rgb[k] = rgb[k] * (1.0 - fg_k.a) + fg_k.rgb;
}
cmd_ref.offset += 4 + CmdImage_size;
break;
case Cmd_BeginClip:
uint blend_slot = blend_sp % BLEND_STACK_SIZE;
if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
// spill to scratch buffer
MallocResult m = alloc_clip_buf(clip_tos.offset);
if (m.failed) {
return;
}
clip_tos = m.alloc;
uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
uvec2 offset = chunk_offset(k);
write_mem(clip_tos, base_ix + offset.x + offset.y * TILE_WIDTH_PX, blend_stack[blend_slot][k]);
}
blend_spill++;
}
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = packsRGB(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
}
blend_sp++;
cmd_ref.offset += 4;
break;
case Cmd_EndClip:
blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
if (blend_sp == blend_spill) {
uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
uvec2 offset = chunk_offset(k);
blend_stack[blend_slot][k] = read_mem(clip_tos, base_ix + offset.x + offset.y * TILE_WIDTH_PX);
}
clip_tos.offset = read_mem(clip_tos, (clip_tos.offset >> 2) + CLIP_LINK_OFFSET);
blend_spill--;
}
blend_sp--;
for (uint k = 0; k < CHUNK; k++) {
vec4 rgba = unpacksRGB(blend_stack[blend_slot][k]);
rgb[k] = mix(rgba.rgb, rgb[k], area[k] * rgba.a);
}
cmd_ref.offset += 4;
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
cmd_alloc.offset = cmd_ref.offset;
continue;
}
}
for (uint i = 0; i < CHUNK; i++) {
imageStore(image, ivec2(xy_uint + chunk_offset(i)), vec4(tosRGB(rgb[i]), 1.0));
}
}