mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-11 04:51:32 +11:00
d9d518b248
The compute shaders have a check for the successful completion of their preceding stage. However, consider a shader execution path like the following: void main() { if (mem_error != NO_ERROR) { return; } ... malloc(...); ... barrier(); ... } and a shader execution that fails to allocate memory, thereby setting mem_error to ERR_MALLOC_FAILED in malloc before reaching the barrier. If another shader execution then begins execution, its mem_error check will make it return early and not reach the barrier. All GPU APIs require (dynamically) uniform control flow for barriers, and the above case may lead to GPU hangs in practice. Fix this issue by replacing the early exits with careful checks that don't interrupt barrier control flow. Unfortunately, it's harder to prove the soundness of the new checks, so this change also clears dynamic memory ranges in MEM_DEBUG mode when memory is exhausted. The result is that accessing memory after exhaustion triggers an error. Signed-off-by: Elias Naur <mail@eliasnaur.com>
110 lines
4.1 KiB
Plaintext
110 lines
4.1 KiB
Plaintext
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
// Propagation of tile backdrop for filling.
|
|
//
|
|
// Each thread reads one path element and calculates the number of spanned tiles
|
|
// based on the bounding box.
|
|
// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel.
|
|
// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
|
|
// and propagated from the left to the right (prefix summed).
|
|
//
|
|
// Output state:
|
|
// - Each path element has an array of tiles covering the whole path based on its bounding box
|
|
// - Each tile per path element contains the 'backdrop' and a list of subdivided path segments
|
|
|
|
#version 450
|
|
// Enables the #include directives used below.
#extension GL_GOOGLE_include_directive : enable
|
|
|
|
#include "mem.h"
|
|
#include "setup.h"
|
|
|
|
// Base-2 logarithm of the workgroup size. LG_WG_FACTOR is not defined in this
// file (presumably provided by one of the includes above -- TODO confirm).
#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
|
|
// Number of invocations per workgroup; also the length of the shared arrays below.
#define BACKDROP_WG (1 << LG_BACKDROP_WG)
|
|
|
|
// One-dimensional workgroup of BACKDROP_WG invocations.
layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
|
|
|
|
// Read-only configuration buffer (set 0, binding 1); main() reads its
// n_elements, anno_alloc and tile_alloc members through `conf`.
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
|
|
Config conf;
|
|
};
|
|
|
|
#include "annotated.h"
|
|
#include "tile.h"
|
|
|
|
// Per-invocation tile row count; main() turns it in place into an inclusive
// prefix sum across the workgroup.
shared uint sh_row_count[BACKDROP_WG];
|
|
// Per-invocation Alloc created by main() for the tiles of the invocation's path.
shared Alloc sh_row_alloc[BACKDROP_WG];
|
|
// Per-invocation bounding box width (bbox.z - bbox.x) of the invocation's path.
shared uint sh_row_width[BACKDROP_WG];
|
|
|
|
// Shader entry point.
//
// Phase 1 maps one invocation to one path element and records how many tile
// rows of that element need backdrop propagation. The per-invocation counts
// are then prefix-summed across the workgroup. Phase 2 re-maps one invocation
// to one tile row and accumulates the backdrop values along that row from
// left to right.
void main() {
    uint local_ix = gl_LocalInvocationID.x;
    uint global_ix = gl_GlobalInvocationID.x;
    AnnotatedRef anno_ref = AnnotatedRef(conf.anno_alloc.offset + global_ix * Annotated_size);

    // The error state is sampled once. It is used to gate memory accesses
    // below instead of returning early, so no invocation skips a barrier().
    bool mem_ok = mem_error == NO_ERROR;

    // Phase 1: one invocation per path element.
    uint rows = 0;
    if (global_ix < conf.n_elements) {
        AnnotatedTag tag = Annotated_tag(conf.anno_alloc, anno_ref);
        bool has_path = tag.tag == Annotated_Image
            || tag.tag == Annotated_BeginClip
            || tag.tag == Annotated_Color;
        // Only elements using the non-zero fill mode take part.
        if (has_path && fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
            PathRef path_ref = PathRef(conf.tile_alloc.offset + global_ix * Path_size);
            Path path = Path_read(conf.tile_alloc, path_ref);
            uint bbox_width = path.bbox.z - path.bbox.x;
            uint bbox_height = path.bbox.w - path.bbox.y;

            sh_row_width[local_ix] = bbox_width;
            rows = bbox_height;
            // A path that does not cross a tile top edge has no backdrop.
            // The shortcut is not taken for bbox.y == 0, because such a path
            // may cross the y = 0 top edge and merely be clipped to one row.
            // (It could probably be extended to width == 2 as long as the
            // path does not cross the left edge.)
            if (rows == 1 && path.bbox.y > 0) {
                rows = 0;
            }
            sh_row_alloc[local_ix] = new_alloc(path.tiles.offset, bbox_width * bbox_height * Tile_size, mem_ok);
        }
    }

    // Inclusive prefix sum of the row counts: in round `level` every
    // invocation adds the partial sum held (1 << level) slots to its left.
    sh_row_count[local_ix] = rows;
    for (uint level = 0; level < LG_BACKDROP_WG; level++) {
        uint stride = 1u << level;
        barrier();
        if (local_ix >= stride) {
            rows += sh_row_count[local_ix - stride];
        }
        barrier();
        sh_row_count[local_ix] = rows;
    }
    barrier();

    // Phase 2: one invocation per path element row.
    uint total_rows = sh_row_count[BACKDROP_WG - 1];
    for (uint row = local_ix; row < total_rows; row += BACKDROP_WG) {
        // Binary search over the prefix sums for the element owning this row.
        uint owner = 0;
        for (uint level = 0; level < LG_BACKDROP_WG; level++) {
            uint candidate = owner + ((BACKDROP_WG / 2) >> level);
            if (row >= sh_row_count[candidate - 1]) {
                owner = candidate;
            }
        }

        uint width = sh_row_width[owner];
        if (width == 0 || !mem_ok) {
            continue;
        }

        // Walk the row sequentially: read each tile's backdrop and replace it
        // with the running sum of all backdrops up to and including it.
        Alloc tiles_alloc = sh_row_alloc[owner];
        uint rows_before = owner > 0 ? sh_row_count[owner - 1] : 0;
        uint row_in_path = row - rows_before;
        // Word index of the first tile's backdrop in this row; consecutive
        // tiles are two words apart (presumably Tile_size is 8 bytes with the
        // backdrop in the second word -- TODO confirm against tile.h).
        uint word_ix = (tiles_alloc.offset >> 2) + 1 + row_in_path * 2 * width;
        uint running = read_mem(tiles_alloc, word_ix);
        for (uint x = 1; x < width; x++) {
            word_ix += 2;
            running += read_mem(tiles_alloc, word_ix);
            write_mem(tiles_alloc, word_ix, running);
        }
    }
}
|