vello/piet-gpu/shader/path_coarse.comp

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Coarse rasterization of path segments.
// Allocation and initialization of tiles for paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define LG_COARSE_WG 5
#define COARSE_WG (1 << LG_COARSE_WG)
layout(local_size_x = COARSE_WG, local_size_y = 1) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};
#include "pathseg.h"
#include "tile.h"
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
#define ACCURACY 0.25
#define Q_ACCURACY (ACCURACY * 0.1)
#define REM_ACCURACY (ACCURACY - Q_ACCURACY)
#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
#define MAX_QUADS 16
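
// Where MAX_HYPOT2 comes from (a hedged reconstruction of the derivation, not a
// comment in the original source): approximating a cubic by n quadratics leaves a
// maximum error of roughly sqrt(3)/36 * |err_v| / n^3, where
// err_v = 3*(p2 - p1) + p0 - p3 is computed in main below. Requiring that error
// to be at most Q_ACCURACY and squaring both sides gives
//   n^6 >= (err_v.x^2 + err_v.y^2) / (432 * Q_ACCURACY^2),
// which is exactly the ceil(pow(err / MAX_HYPOT2, 1/6)) used for n_quads.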
vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
    float mt = 1.0 - t;
    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
}

vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
    float mt = 1.0 - t;
    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
}

struct SubdivResult {
    float val;
    float a0;
    float a2;
};
/// An approximation to $\int (1 + 4x^2)^{-0.25} \, dx$
///
/// This is used for flattening curves.
#define D 0.67
float approx_parabola_integral(float x) {
    return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
}
/// An approximation to the inverse parabola integral.
#define B 0.39
float approx_parabola_inv_integral(float x) {
    return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
}
SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
    vec2 d01 = p1 - p0;
    vec2 d12 = p2 - p1;
    vec2 dd = d01 - d12;
    float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
    float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
    float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
    float scale = abs(cross / (length(dd) * (x2 - x0)));

    float a0 = approx_parabola_integral(x0);
    float a2 = approx_parabola_integral(x2);
    float val = 0.0;
    if (scale < 1e9) {
        float da = abs(a2 - a0);
        float sqrt_scale = sqrt(scale);
        if (sign(x0) == sign(x2)) {
            val = da * sqrt_scale;
        } else {
            float xmin = sqrt_tol / sqrt_scale;
            val = sqrt_tol * da / approx_parabola_integral(xmin);
        }
    }
    return SubdivResult(val, a0, a2);
}
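
// Hedged reading of estimate_subdiv (my annotation, not original): `val` is the
// subdivision integral over the quadratic, scaled so that
// ceil(val / (2 * sqrt(tolerance))) line segments meet the tolerance; a0 and a2
// cache the integral's endpoint values so the second pass in main can invert it
// via approx_parabola_inv_integral without recomputing.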
// All writes to the output must be gated by mem_ok.
bool mem_ok = true;
void main() {
    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
        return;
    }
    uint element_ix = gl_GlobalInvocationID.x;
    PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);

    PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
    if (element_ix < conf.n_pathseg) {
        tag = PathSeg_tag(conf.pathseg_alloc, ref);
    }
    switch (tag.tag) {
    case PathSeg_Cubic:
        PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);

        vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
        float err = err_v.x * err_v.x + err_v.y * err_v.y;
        // The number of quadratics.
        uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
        n_quads = min(n_quads, MAX_QUADS);
        SubdivResult keep_params[MAX_QUADS];
        // Iterate over quadratics and tote up the estimated number of segments.
        float val = 0.0;
        vec2 qp0 = cubic.p0;
        float step = 1.0 / float(n_quads);
        for (uint i = 0; i < n_quads; i++) {
            float t = float(i + 1) * step;
            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
            keep_params[i] = params;
            val += params.val;
            qp0 = qp2;
        }
        uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);

        bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
        uint path_ix = cubic.path_ix;
        Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
        Alloc path_alloc =
            new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
        ivec4 bbox = ivec4(path.bbox);
        vec2 p0 = cubic.p0;
        qp0 = cubic.p0;
        float v_step = val / float(n);
        int n_out = 1;
        float val_sum = 0.0;
        for (uint i = 0; i < n_quads; i++) {
            float t = float(i + 1) * step;
            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
            SubdivResult params = keep_params[i];
            float u0 = approx_parabola_inv_integral(params.a0);
            float u2 = approx_parabola_inv_integral(params.a2);
            float uscale = 1.0 / (u2 - u0);
            float target = float(n_out) * v_step;
            while (n_out == n || target < val_sum + params.val) {
                vec2 p1;
                if (n_out == n) {
                    p1 = cubic.p3;
                } else {
                    float u = (target - val_sum) / params.val;
                    float a = mix(params.a0, params.a2, u);
                    float au = approx_parabola_inv_integral(a);
                    float t = (au - u0) * uscale;
                    p1 = eval_quad(qp0, qp1, qp2, t);
                }

                // Output line segment

                // Bounding box of element in pixel coordinates.
                float xmin = min(p0.x, p1.x) - cubic.stroke.x;
                float xmax = max(p0.x, p1.x) + cubic.stroke.x;
                float ymin = min(p0.y, p1.y) - cubic.stroke.y;
                float ymax = max(p0.y, p1.y) + cubic.stroke.y;
                float dx = p1.x - p0.x;
                float dy = p1.y - p0.y;
                // Set up for per-scanline coverage formula, below.
                float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
                float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
                float b = invslope; // Note: assumes square tiles, otherwise scale.
                float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
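                // Hedged reading of the three constants above (annotation, not
                // original): with square tiles, xc = a + b*y (in tile units)
                // tracks the segment's x where it crosses the vertical center of
                // tile row y, and c pads that by the x-extent reachable within
                // half a tile height plus the stroke radius, so [xc - c, xc + c]
                // bounds the columns the segment can touch in that row.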
                int x0 = int(floor(xmin * SX));
                int x1 = int(floor(xmax * SX) + 1);
                int y0 = int(floor(ymin * SY));
                int y1 = int(floor(ymax * SY) + 1);
                x0 = clamp(x0, bbox.x, bbox.z);
                y0 = clamp(y0, bbox.y, bbox.w);
                x1 = clamp(x1, bbox.x, bbox.z);
                y1 = clamp(y1, bbox.y, bbox.w);
                float xc = a + b * float(y0);
                int stride = bbox.z - bbox.x;
                int base = (y0 - bbox.y) * stride - bbox.x;
                // TODO: can be tighter, use c to bound width
                uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                // Consider using subgroups to aggregate atomic add.
                uint malloc_size = n_tile_alloc * TileSeg_size;
                uint tile_offset = malloc_stage(malloc_size, conf.mem_size, STAGE_PATH_COARSE);
                if (tile_offset == MALLOC_FAILED) {
                    mem_ok = false;
                }
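                // On failure, execution continues with mem_ok = false: the
                // tile_offset arithmetic stays consistent, but the TileSeg writes
                // below are skipped. malloc_stage records which stage failed and
                // how much memory it needed, so the driver can reallocate the
                // buffer and retry.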
                Alloc tile_alloc = new_alloc(tile_offset, malloc_size, true);
                TileSeg tile_seg;

                int xray = int(floor(p0.x * SX));
                int last_xray = int(floor(p1.x * SX));
                if (p0.y > p1.y) {
                    int tmp = xray;
                    xray = last_xray;
                    last_xray = tmp;
                }
                for (int y = y0; y < y1; y++) {
                    float tile_y0 = float(y * TILE_HEIGHT_PX);
                    int xbackdrop = max(xray + 1, bbox.x);
                    if (!is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
                        int backdrop = p1.y < p0.y ? 1 : -1;
                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                        uint tile_el = tile_ref.offset >> 2;
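                        // Assumes the Tile layout from tile.h: word 0 is the
                        // segment list head, word 1 the backdrop count, so this
                        // accumulates the winding delta for every tile to the
                        // right of the crossing in this row.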
                        atomicAdd(memory[tile_el + 1], backdrop);
                    }
                    // next_xray is the xray for the next scanline; the line segment intersects
                    // all tiles between xray and next_xray.
                    int next_xray = last_xray;
                    if (y < y1 - 1) {
                        float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
                        float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
                        next_xray = int(floor(x_edge * SX));
                    }
                    int min_xray = min(xray, next_xray);
                    int max_xray = max(xray, next_xray);
                    int xx0 = min(int(floor(xc - c)), min_xray);
                    int xx1 = max(int(ceil(xc + c)), max_xray + 1);
                    xx0 = clamp(xx0, x0, x1);
                    xx1 = clamp(xx1, x0, x1);

                    for (int x = xx0; x < xx1; x++) {
                        float tile_x0 = float(x * TILE_WIDTH_PX);
                        TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
                        uint tile_el = tile_ref.offset >> 2;
                        uint old = 0;
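                        // Push onto the tile's segment list: atomically swap the
                        // list head (the tile's first word) with this segment's
                        // offset; `old` becomes the next pointer via
                        // tile_seg.next below.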
                        old = atomicExchange(memory[tile_el], tile_offset);
                        tile_seg.origin = p0;
                        tile_seg.vector = p1 - p0;
                        float y_edge = 0.0;
                        if (!is_stroke) {
                            y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
                            if (min(p0.x, p1.x) < tile_x0) {
                                vec2 p = vec2(tile_x0, y_edge);
                                if (p0.x > p1.x) {
                                    tile_seg.vector = p - p0;
                                } else {
                                    tile_seg.origin = p;
                                    tile_seg.vector = p1 - p;
                                }
                                // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
                                // Nudge zeroes towards the intended sign.
                                if (tile_seg.vector.x == 0) {
                                    tile_seg.vector.x = sign(p1.x - p0.x) * 1e-9;
                                }
                            }
                            if (x <= min_xray || max_xray < x) {
                                // Reject inconsistent intersections.
                                y_edge = 1e9;
                            }
                        }
                        tile_seg.y_edge = y_edge;
                        tile_seg.next.offset = old;
                        if (mem_ok) {
                            TileSeg_write(tile_alloc, TileSegRef(tile_offset), tile_seg);
                        }
                        tile_offset += TileSeg_size;
                    }
                    xc += b;
                    base += stride;
                    xray = next_xray;
                }

                n_out += 1;
                target += v_step;
                p0 = p1;
            }
            val_sum += params.val;
            qp0 = qp2;
        }
        break;
    }
}