vello/piet-gpu/shader/binning.comp
Elias Naur d9d518b248 avoid non-uniform barrier control flow when exhausting memory
The compute shaders have a check for the successful completion of their
preceding stage. However, consider a shader execution path like the
following:

	void main() {
		if (mem_error != NO_ERROR) {
		    return;
		}
		...
		malloc(...);
		...
		barrier();
		...
	}

and an invocation that fails to allocate memory, thereby setting
mem_error to ERR_MALLOC_FAILED in malloc before reaching the barrier. If
another invocation then begins execution, its mem_error check will
make it return early and never reach the barrier.

All GPU APIs require (dynamically) uniform control flow for barriers,
and the above case may lead to GPU hangs in practice.

Fix this issue by replacing the early exits with careful checks that
don't interrupt barrier control flow.
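
In binning.comp below, for example, the check now happens only after the
final barrier, roughly like this (a sketch, not the exact code in every
shader):

	void main() {
		...
		malloc(...);
		...
		barrier();
		// Safe: no barriers follow, so returning here keeps
		// barrier control flow uniform.
		if (sh_alloc_failed || mem_error != NO_ERROR) {
			return;
		}
		...
	}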

Unfortunately, it's harder to prove the soundness of the new checks, so
this change also clears dynamic memory ranges in MEM_DEBUG mode when
memory is exhausted. The result is that accessing memory after
exhaustion triggers an error.

Signed-off-by: Elias Naur <mail@eliasnaur.com>
2021-04-20 10:15:29 +02:00


// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
// based on the path bounding box to bin the paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};
#include "annotated.h"
#include "bins.h"
// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
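// Per-bin output chunk allocations, plus a workgroup-wide flag for any failed allocation.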
shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;
void main() {
    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;

    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
    if (gl_LocalInvocationID.x == 0) {
        sh_alloc_failed = false;
    }
    barrier();

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref).tag;
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Color:
    case Annotated_Image:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that these drawing elements
        // have the bbox at the same place in their layout.
        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
        x0 = int(floor(clip.bbox.x * SX));
        y0 = int(floor(clip.bbox.y * SY));
        x1 = int(ceil(clip.bbox.z * SX));
        y1 = int(ceil(clip.bbox.w * SY));
        break;
    }

    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
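    // A zero-width bbox also gets zero height, so the coverage loop below runs zero iterations.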
    if (x0 == x1) y1 = y0;
    int x = x0, y = y0;
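    // This invocation's word (slice) and bit within the sliced per-bin bitmaps.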
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
    barrier();

    // Allocate output segments.
    uint element_count = 0;
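    // count[i][bin] accumulates the number of covering elements in slices 0..i; used for output indexing below.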
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
    Alloc chunk_alloc = new_alloc(0, 0, true);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        MallocResult chunk = malloc(element_count * BinInstance_size);
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed) {
            sh_alloc_failed = true;
        }
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
    barrier();
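    // This check runs after the final barrier, so the early exit cannot break barrier uniformity.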
    if (sh_alloc_failed || mem_error != NO_ERROR) {
        return;
    }

    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
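            // Index within the bin's chunk: set bits below ours in this slice, plus totals from earlier slices.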
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + idx * BinInstance_size;
            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}