vello/piet-gpu/shader/kernel4.comp
Raph Levien 240f44a228 Implement robust dynamic memory
This is the core logic for robust dynamic memory. There are changes to both shaders and the driver logic.

On the shader side, failure information is more useful and fine grained. In particular, it now reports which stage failed and how much memory would have been required to make that stage succeed.

On the driver side, there is a new RenderDriver abstraction which owns command buffers (and associated query pools) and runs the logic to retry and reallocate buffers when necessary. There's also a fairly significant rework of the logic to produce the config block, as that overlaps the robust memory.

The RenderDriver abstraction may not stay. It was done this way to minimize code disruption, but arguably it should just be combined with Renderer.

Another change: the GLSL length() method on a buffer requires additional infrastructure (at least on Metal, where it needs a binding of its own), so we now pass that in as a field in the config.

This also moves blend memory to its own buffer. This worked out well because coarse rasterization can simply report the size of the blend buffer and it can be reallocated without needing to rerun the pipeline. In the previous state, blend allocations and ptcl writes were interleaved in coarse rasterization, so a failure of the former would require rerunning coarse. This should fix #83 (finally!)

There are a few loose ends. The binaries haven't (yet) been updated (I've been testing using a hand-written test program). Gradients weren't touched so still have a fixed size allocation. And the logic to calculate the new buffer size on allocation failure could be smarter.

Closes #175
2022-07-13 12:34:51 -07:00

302 lines
12 KiB
GLSL

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
// in the per-tile command list to an image.
// Right now, this kernel stores the image in a buffer, but a better
// plan is to use a texture. This is because of limited support.
#version 450
#extension GL_GOOGLE_include_directive : enable
// We can do rendering either in sRGB colorspace (for compatibility)
// or in a linear colorspace, with conversions to sRGB (which will give
// higher quality antialiasing among other things).
#define DO_SRGB_CONVERSION 0
// TODO: the binding of the main buffer can be readonly
#include "mem.h"
#include "setup.h"
#define CHUNK_X 2
#define CHUNK_Y 4
#define CHUNK (CHUNK_X * CHUNK_Y)
#define CHUNK_DX (TILE_WIDTH_PX / CHUNK_X)
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
layout(binding = 1) restrict readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) buffer BlendBuf {
uint blend_mem[];
};
#ifdef GRAY
layout(r8, binding = 3) uniform restrict writeonly image2D image;
#else
layout(rgba8, binding = 3) uniform restrict writeonly image2D image;
#endif
layout(rgba8, binding = 4) uniform restrict readonly image2D image_atlas;
layout(rgba8, binding = 5) uniform restrict readonly image2D gradients;
#include "ptcl.h"
#include "tile.h"
#include "blend.h"
#define MAX_BLEND_STACK 128
mediump vec3 tosRGB(mediump vec3 rgb) {
#if DO_SRGB_CONVERSION
bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308));
mediump vec3 below = vec3(12.92) * rgb;
mediump vec3 above = vec3(1.055) * pow(rgb, vec3(0.41666)) - vec3(0.055);
return mix(below, above, cutoff);
#else
return rgb;
#endif
}
mediump vec3 fromsRGB(mediump vec3 srgb) {
#if DO_SRGB_CONVERSION
// Formula from EXT_sRGB.
bvec3 cutoff = greaterThanEqual(srgb, vec3(0.04045));
mediump vec3 below = srgb / vec3(12.92);
mediump vec3 above = pow((srgb + vec3(0.055)) / vec3(1.055), vec3(2.4));
return mix(below, above, cutoff);
#else
return srgb;
#endif
}
// unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color
// space.
mediump vec4 unpacksRGB(uint srgba) {
mediump vec4 color = unpackUnorm4x8(srgba).wzyx;
return vec4(fromsRGB(color.rgb), color.a);
}
// packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent.
uint packsRGB(mediump vec4 rgba) {
rgba = vec4(tosRGB(rgba.rgb), rgba.a);
return packUnorm4x8(rgba.wzyx);
}
uvec2 chunk_offset(uint i) {
return uvec2(i % CHUNK_X * CHUNK_DX, i / CHUNK_X * CHUNK_DY);
}
mediump vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) {
mediump vec4 rgba[CHUNK];
for (uint i = 0; i < CHUNK; i++) {
ivec2 uv = ivec2(xy + chunk_offset(i)) + cmd_img.offset;
mediump vec4 fg_rgba;
fg_rgba = imageLoad(image_atlas, uv);
fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
rgba[i] = fg_rgba;
}
return rgba;
}
void main() {
uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
uint blend_offset = memory[cmd_ref.offset >> 2];
cmd_ref.offset += 4;
uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x,
gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
vec2 xy = vec2(xy_uint);
mediump vec4 rgba[CHUNK];
uint blend_stack[BLEND_STACK_SPLIT][CHUNK];
for (uint i = 0; i < CHUNK; i++) {
rgba[i] = vec4(0.0);
}
mediump float area[CHUNK];
uint clip_depth = 0;
// Previously we would early-out if there was a memory failure, so we wouldn't try to read corrupt
// tiles. But now we assume this is checked CPU-side before launching fine rasterization.
while (true) {
uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
if (tag == Cmd_End) {
break;
}
switch (tag) {
case Cmd_Stroke:
// Calculate distance field from all the line segments in this tile.
CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
mediump float df[CHUNK];
for (uint k = 0; k < CHUNK; k++)
df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
vec2 line_vec = seg.vector;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
dpos += vec2(chunk_offset(k));
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
for (uint k = 0; k < CHUNK; k++) {
area[k] = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
}
cmd_ref.offset += 4 + CmdStroke_size;
break;
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
for (uint k = 0; k < CHUNK; k++)
area[k] = float(fill.backdrop);
tile_seg_ref = TileSegRef(fill.tile_ref);
// Calculate coverage based on backdrop + coverage of each line segment
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy;
vec2 end = start + seg.vector;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / seg.vector.y;
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[k] += a * (window.x - window.y);
}
area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
for (uint k = 0; k < CHUNK; k++) {
area[k] = min(abs(area[k]), 1.0);
}
cmd_ref.offset += 4 + CmdFill_size;
break;
case Cmd_Solid:
for (uint k = 0; k < CHUNK; k++) {
area[k] = 1.0;
}
cmd_ref.offset += 4;
break;
case Cmd_Alpha:
CmdAlpha alpha = Cmd_Alpha_read(cmd_alloc, cmd_ref);
for (uint k = 0; k < CHUNK; k++) {
area[k] = alpha.alpha;
}
cmd_ref.offset += 4 + CmdAlpha_size;
break;
case Cmd_Color:
CmdColor color = Cmd_Color_read(cmd_alloc, cmd_ref);
mediump vec4 fg = unpacksRGB(color.rgba_color);
for (uint k = 0; k < CHUNK; k++) {
mediump vec4 fg_k = fg * area[k];
rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
}
cmd_ref.offset += 4 + CmdColor_size;
break;
case Cmd_LinGrad:
CmdLinGrad lin = Cmd_LinGrad_read(cmd_alloc, cmd_ref);
float d = lin.line_x * float(xy.x) + lin.line_y * float(xy.y) + lin.line_c;
for (uint k = 0; k < CHUNK; k++) {
vec2 chunk_xy = vec2(chunk_offset(k));
float my_d = d + lin.line_x * chunk_xy.x + lin.line_y * chunk_xy.y;
int x = int(round(clamp(my_d, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(lin.index)));
fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
mediump vec4 fg_k = fg_rgba * area[k];
rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
}
cmd_ref.offset += 4 + CmdLinGrad_size;
break;
case Cmd_RadGrad:
CmdRadGrad rad = Cmd_RadGrad_read(cmd_alloc, cmd_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k));
my_xy = rad.mat.xz * my_xy.x + rad.mat.yw * my_xy.y - rad.xlat;
float ba = dot(my_xy, rad.c1);
float ca = rad.ra * dot(my_xy, my_xy);
float t = sqrt(ba * ba + ca) - ba - rad.roff;
int x = int(round(clamp(t, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(rad.index)));
fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
mediump vec4 fg_k = fg_rgba * area[k];
rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
}
cmd_ref.offset += 4 + CmdRadGrad_size;
break;
case Cmd_Image:
CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
for (uint k = 0; k < CHUNK; k++) {
mediump vec4 fg_k = img[k] * area[k];
rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
}
cmd_ref.offset += 4 + CmdImage_size;
break;
case Cmd_BeginClip:
if (clip_depth < BLEND_STACK_SPLIT) {
for (uint k = 0; k < CHUNK; k++) {
blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k]));
rgba[k] = vec4(0.0);
}
} else {
uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
for (uint k = 0; k < CHUNK; k++) {
blend_mem[base_ix + k] = packsRGB(vec4(rgba[k]));
rgba[k] = vec4(0.0);
}
}
clip_depth++;
cmd_ref.offset += 4;
break;
case Cmd_EndClip:
CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
clip_depth--;
uint base_ix;
if (clip_depth >= BLEND_STACK_SPLIT) {
base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
}
for (uint k = 0; k < CHUNK; k++) {
uint bg_rgba;
if (clip_depth < BLEND_STACK_SPLIT) {
bg_rgba = blend_stack[clip_depth][k];
} else {
bg_rgba = blend_mem[base_ix + k];
}
mediump vec4 bg = unpacksRGB(bg_rgba);
mediump vec4 fg = rgba[k] * area[k];
rgba[k] = mix_blend_compose(bg, fg, end_clip.blend);
}
cmd_ref.offset += 4 + CmdEndClip_size;
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
cmd_alloc.offset = cmd_ref.offset;
break;
}
}
for (uint i = 0; i < CHUNK; i++) {
#ifdef GRAY
// Just store the alpha value; later we can specialize this kernel more to avoid
// computing unneeded RGB colors.
imageStore(image, ivec2(xy_uint + chunk_offset(i)), vec4(rgba[i].a));
#else
imageStore(image, ivec2(xy_uint + chunk_offset(i)), vec4(tosRGB(rgba[i].rgb), rgba[i].a));
#endif
}
}