mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-25 18:56:35 +11:00
d9d518b248
The compute shaders have a check for the successful completion of their preceding stage. However, consider a shader execution path like the following: void main() { if (mem_error != NO_ERROR) { return; } ... malloc(...); ... barrier(); ... } and shader execution that fails to allocate memory, thereby setting mem_error to ERR_MALLOC_FAILED in malloc before reaching the barrier. If another shader execution then begins execution, its mem_error check will make it return early and not reach the barrier. All GPU APIs require (dynamically) uniform control flow for barriers, and the above case may lead to GPU hangs in practice. Fix this issue by replacing the early exits with careful checks that don't interrupt barrier control flow. Unfortunately, it's harder to prove the soundness of the new checks, so this change also clears dynamic memory ranges in MEM_DEBUG mode when memory is exhausted. The result is that accessing memory after exhaustion triggers an error. Signed-off-by: Elias Naur <mail@eliasnaur.com>
104 lines
3.4 KiB
GLSL
104 lines
3.4 KiB
GLSL
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
// Allocation and initialization of tiles for paths.
|
|
|
|
#version 450
|
|
#extension GL_GOOGLE_include_directive : enable
|
|
|
|
#include "mem.h"
|
|
#include "setup.h"
|
|
|
|
// Base-2 logarithm of the workgroup size. LG_WG_FACTOR is not defined in
// this file (presumably it comes from setup.h -- confirm). main() also uses
// this value as the number of passes in its prefix-sum loop.
#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
|
|
// Number of invocations per workgroup; also the length of sh_tile_count.
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
|
|
|
|
// One-dimensional workgroup of TILE_ALLOC_WG invocations.
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
|
|
|
|
// Read-only buffer at set 0, binding 1. It exposes a single Config value as
// `conf`; main() reads n_elements, anno_alloc, tile_alloc, width_in_tiles
// and height_in_tiles from it. The Config type is declared outside this file.
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
|
|
Config conf;
|
|
};
|
|
|
|
#include "annotated.h"
|
|
#include "tile.h"
|
|
|
|
// Scale factors useful for converting coordinates to tiles: multiplying a
// bbox coordinate by SX (horizontal) or SY (vertical) yields a position in
// tile units, which main() then rounds with floor()/ceil().
// NOTE(review): the _PX suffix suggests the inputs are in pixels -- confirm.
|
|
#define SX (1.0 / float(TILE_WIDTH_PX))
|
|
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
|
|
|
// One slot per invocation in the workgroup. main() first stores each
// invocation's own tile count here, then overwrites the slots in place with
// the running (inclusive) prefix sum of those counts.
shared uint sh_tile_count[TILE_ALLOC_WG];
|
|
// Result of the single malloc() issued by the last invocation of the
// workgroup; every invocation reads it after a barrier().
shared MallocResult sh_tile_alloc;
|
|
|
|
// Shader entry point; each invocation handles the element whose index is its
// global invocation id.
//
// Steps, in order:
//   1. Read the element's tag and, for tags that carry a bbox, convert the
//      bbox to a clamped range of tiles.
//   2. Compute an inclusive prefix sum of the per-invocation tile counts
//      across the workgroup, in shared memory.
//   3. Have the last invocation malloc() room for every tile of the
//      workgroup and share the result.
//   4. Write each in-range element's Path record, pointing at its slice of
//      that allocation.
//   5. Zero the whole allocation cooperatively.
void main() {
    uint local_ix = gl_LocalInvocationID.x;
    uint global_ix = gl_GlobalInvocationID.x;
    bool in_range = global_ix < conf.n_elements;

    PathRef out_path_ref = PathRef(conf.tile_alloc.offset + global_ix * Path_size);
    AnnotatedRef anno_ref = AnnotatedRef(conf.anno_alloc.offset + global_ix * Annotated_size);

    // Invocations past the end of the element list keep the Nop tag.
    uint anno_tag = Annotated_Nop;
    if (in_range) {
        anno_tag = Annotated_tag(conf.anno_alloc, anno_ref).tag;
    }

    // Tile-space bounding box; stays all-zero for tags without a bbox.
    int tx0 = 0, ty0 = 0, tx1 = 0, ty1 = 0;
    bool has_bbox = anno_tag == Annotated_Color
        || anno_tag == Annotated_Image
        || anno_tag == Annotated_BeginClip
        || anno_tag == Annotated_EndClip;
    if (has_bbox) {
        // Note: we take advantage of the fact that fills, strokes, and
        // clips have compatible layout, so one reader serves all four tags.
        AnnoEndClip anno = Annotated_EndClip_read(conf.anno_alloc, anno_ref);
        tx0 = int(floor(anno.bbox.x * SX));
        ty0 = int(floor(anno.bbox.y * SY));
        tx1 = int(ceil(anno.bbox.z * SX));
        ty1 = int(ceil(anno.bbox.w * SY));
    }

    // Restrict the box to the tile grid described by the config.
    int max_tx = int(conf.width_in_tiles);
    int max_ty = int(conf.height_in_tiles);
    tx0 = clamp(tx0, 0, max_tx);
    ty0 = clamp(ty0, 0, max_ty);
    tx1 = clamp(tx1, 0, max_tx);
    ty1 = clamp(ty1, 0, max_ty);

    Path path;
    path.bbox = uvec4(tx0, ty0, tx1, ty1);

    // Don't actually allocate tiles for an end clip, but we do want the
    // path structure (especially bbox) written for it.
    uint my_tiles = 0;
    if (anno_tag != Annotated_EndClip) {
        my_tiles = (tx1 - tx0) * (ty1 - ty0);
    }

    // Inclusive prefix sum of the tile counts over the workgroup. On pass
    // `level`, every invocation at index >= 2^level adds the running sum
    // stored 2^level slots below it. The first barrier of each pass makes
    // the previous writes visible; the second keeps reads and writes of the
    // same pass from overlapping.
    sh_tile_count[local_ix] = my_tiles;
    uint inclusive_sum = my_tiles;
    for (uint level = 0; level < LG_TILE_ALLOC_WG; level++) {
        barrier();
        if (local_ix >= (1 << level)) {
            inclusive_sum += sh_tile_count[local_ix - (1 << level)];
        }
        barrier();
        sh_tile_count[local_ix] = inclusive_sum;
    }

    // The last invocation holds the workgroup total; it alone allocates.
    if (local_ix == TILE_ALLOC_WG - 1) {
        sh_tile_alloc = malloc(inclusive_sum * Tile_size);
    }
    barrier();
    MallocResult wg_alloc = sh_tile_alloc;

    // No barrier() follows this point in main(), so returning early here
    // does not skip a barrier that other invocations would reach.
    if (wg_alloc.failed || mem_error != NO_ERROR) {
        return;
    }

    if (in_range) {
        // Tiles owned by lower-indexed invocations come first in the
        // allocation; invocation 0 starts at the beginning.
        uint first_tile = 0;
        if (local_ix > 0) {
            first_tile = sh_tile_count[local_ix - 1];
        }
        Alloc my_alloc = slice_mem(wg_alloc.alloc, Tile_size * first_tile, Tile_size * my_tiles);
        path.tiles = TileRef(my_alloc.offset);
        Path_write(conf.tile_alloc, out_path_ref, path);
    }

    // Zero out allocated tiles efficiently: invocations write interleaved
    // locations, TILE_ALLOC_WG apart.
    // NOTE(review): the /4 and >>2 suggest write_mem indexes 4-byte words --
    // confirm against mem.h.
    uint word_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
    uint base_word = wg_alloc.alloc.offset >> 2;
    for (uint w = local_ix; w < word_count; w += TILE_ALLOC_WG) {
        // Note: this interleaving is faster than using Tile_write
        // by a significant amount.
        write_mem(wg_alloc.alloc, base_word + w, 0);
    }
}
|