vello/piet-gpu/shader/coarse.comp
Raph Levien 240f44a228 Implement robust dynamic memory
This is the core logic for robust dynamic memory. There are changes to both shaders and the driver logic.

On the shader side, failure information is more useful and fine grained. In particular, it now reports which stage failed and how much memory would have been required to make that stage succeed.

On the driver side, there is a new RenderDriver abstraction which owns command buffers (and associated query pools) and runs the logic to retry and reallocate buffers when necessary. There's also a fairly significant rework of the logic to produce the config block, as that overlaps the robust memory.

The RenderDriver abstraction may not stay. It was done this way to minimize code disruption, but arguably it should just be combined with Renderer.

Another change: the GLSL length() method on a buffer requires additional infrastructure (at least on Metal, where it needs a binding of its own), so we now pass that in as a field in the config.

This also moves blend memory to its own buffer. This worked out well because coarse rasterization can simply report the size of the blend buffer and it can be reallocated without needing to rerun the pipeline. In the previous state, blend allocations and ptcl writes were interleaved in coarse rasterization, so a failure of the former would require rerunning coarse. This should fix #83 (finally!)

There are a few loose ends. The binaries haven't (yet) been updated (I've been testing using a hand-written test program). Gradients weren't touched so still have a fixed size allocation. And the logic to calculate the new buffer size on allocation failure could be smarter.

Closes #175
2022-07-13 12:34:51 -07:00

481 lines
19 KiB
GLSL

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The coarse rasterizer stage of the pipeline.
//
// As input we have the ordered partitions of paths from the binning phase and
// the annotated tile list of segments and backdrop per path.
//
// Each workgroup operating on one bin by stream compacting
// the elements corresponding to the bin.
//
// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be
// encoded.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
#include "drawtag.h"
#include "bins.h"
#include "tile.h"
#include "ptcl.h"
#include "blend.h"
#define LG_N_PART_READ (7 + LG_WG_FACTOR)
#define N_PART_READ (1 << LG_N_PART_READ)
shared uint sh_elements[N_TILE];
// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
shared Alloc sh_part_elements[N_PART_READ];
shared uint sh_bitmaps[N_SLICE][N_TILE];
shared uint sh_tile_count[N_TILE];
// The width of the tile rect for the element, intersected with this bin
shared uint sh_tile_width[N_TILE];
shared uint sh_tile_x0[N_TILE];
shared uint sh_tile_y0[N_TILE];
// These are set up so base + tile_y * stride + tile_x points to a Tile.
shared uint sh_tile_base[N_TILE];
shared uint sh_tile_stride[N_TILE];
#ifdef MEM_DEBUG
// Store allocs only when MEM_DEBUG to save shared memory traffic.
shared Alloc sh_tile_alloc[N_TILE];
void write_tile_alloc(uint el_ix, Alloc a) {
sh_tile_alloc[el_ix] = a;
}
Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
return sh_tile_alloc[el_ix];
}
#else
void write_tile_alloc(uint el_ix, Alloc a) {
// No-op
}
Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
// All memory.
return new_alloc(0, conf.mem_size, mem_ok);
}
#endif
// The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2
// All writes to the output must be gated by mem_ok.
bool mem_ok = true;
// Perhaps cmd allocations should be a global? This is a style question.
void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset < cmd_limit) {
return;
}
uint new_cmd = malloc_stage(PTCL_INITIAL_ALLOC, conf.mem_size, STAGE_COARSE);
if (new_cmd == MALLOC_FAILED) {
mem_ok = false;
}
if (mem_ok) {
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
}
cmd_alloc = new_alloc(new_cmd, PTCL_INITIAL_ALLOC, true);
cmd_ref = CmdRef(new_cmd);
// Reserve space for the maximum number of commands and a potential jump.
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
}
void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
if (linewidth < 0.0) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
if (mem_ok) {
Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
}
cmd_ref.offset += 4 + CmdFill_size;
} else {
if (mem_ok) {
Cmd_Solid_write(alloc, cmd_ref);
}
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
if (mem_ok) {
Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
}
cmd_ref.offset += 4 + CmdStroke_size;
}
}
void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
uint partition_ix = 0;
uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
uint th_ix = gl_LocalInvocationID.x;
// Coordinates of top left of bin, in tiles.
uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
// Per-tile state
uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
// Reserve space for the maximum number of commands and a potential jump.
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
// The nesting depth of the clip stack
uint clip_depth = 0;
// State for the "clip zero" optimization. If it's nonzero, then we are
// currently in a clip for which the entire tile has an alpha of zero, and
// the value is the depth after the "begin clip" of that element.
uint clip_zero_depth = 0;
// I'm sure we can figure out how to do this with at least one fewer register...
// Items up to rd_ix have been read from sh_elements
uint rd_ix = 0;
// Items up to wr_ix have been written into sh_elements
uint wr_ix = 0;
// Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
uint part_start_ix = 0;
uint ready_ix = 0;
Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size);
cmd_ref.offset += 4;
// Accounting for allocation of blend memory
uint render_blend_depth = 0;
uint max_blend_depth = 0;
uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2;
uint drawtag_start = conf.drawtag_offset >> 2;
uint drawdata_start = conf.drawdata_offset >> 2;
uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
}
// parallel read of input partitions
do {
if (ready_ix == wr_ix && partition_ix < n_partitions) {
part_start_ix = ready_ix;
uint count = 0;
if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = read_mem(conf.bin_alloc, in_ix);
uint offset = read_mem(conf.bin_alloc, in_ix + 1);
sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, true);
}
// prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) {
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = count;
}
barrier();
if (th_ix < N_PART_READ) {
if (th_ix >= (1u << i)) {
count += sh_part_count[th_ix - (1u << i)];
}
}
barrier();
}
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = part_start_ix + count;
}
barrier();
ready_ix = sh_part_count[N_PART_READ - 1];
partition_ix += N_PART_READ;
}
// use binary search to find element to read
uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
if (ix >= sh_part_count[probe - 1]) {
part_ix = probe;
}
}
ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
Alloc bin_alloc = sh_part_elements[part_ix];
BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
sh_elements[th_ix] = inst.element_ix;
}
barrier();
wr_ix = min(rd_ix + N_TILE, ready_ix);
} while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
// We've done the merge and filled the buffer.
// Read one element, compute coverage.
uint tag = Drawtag_Nop;
uint element_ix;
if (th_ix + rd_ix < wr_ix) {
element_ix = sh_elements[th_ix];
tag = scene[drawtag_start + element_ix];
}
// Bounding box of element in pixel coordinates.
uint tile_count;
switch (tag) {
case Drawtag_FillColor:
case Drawtag_FillImage:
case Drawtag_FillLinGradient:
case Drawtag_FillRadGradient:
case Drawtag_BeginClip:
case Drawtag_EndClip:
uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
uint path_ix = memory[drawmonoid_base];
Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
uint stride = path.bbox.z - path.bbox.x;
sh_tile_stride[th_ix] = stride;
int dx = int(path.bbox.x) - int(bin_tile_x);
int dy = int(path.bbox.y) - int(bin_tile_y);
int x0 = clamp(dx, 0, N_TILE_X);
int y0 = clamp(dy, 0, N_TILE_Y);
int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
sh_tile_width[th_ix] = uint(x1 - x0);
sh_tile_x0[th_ix] = x0;
sh_tile_y0[th_ix] = y0;
tile_count = uint(x1 - x0) * uint(y1 - y0);
// base relative to bin
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base;
Alloc path_alloc = new_alloc(path.tiles.offset,
(path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
write_tile_alloc(th_ix, path_alloc);
break;
default:
tile_count = 0;
break;
}
// Prefix sum of sh_tile_count
sh_tile_count[th_ix] = tile_count;
for (uint i = 0; i < LG_N_TILE; i++) {
barrier();
if (th_ix >= (1u << i)) {
tile_count += sh_tile_count[th_ix - (1u << i)];
}
barrier();
sh_tile_count[th_ix] = tile_count;
}
barrier();
uint total_tile_count = sh_tile_count[N_TILE - 1];
for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
// Binary search to find element
uint el_ix = 0;
for (uint i = 0; i < LG_N_TILE; i++) {
uint probe = el_ix + (uint(N_TILE / 2) >> i);
if (ix >= sh_tile_count[probe - 1]) {
el_ix = probe;
}
}
uint element_ix = sh_elements[el_ix];
uint tag = scene[drawtag_start + element_ix];
uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
uint width = sh_tile_width[el_ix];
uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width;
bool include_tile = false;
Tile tile = Tile_read(read_tile_alloc(el_ix, true),
TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
bool is_clip = (tag & 1) != 0;
// Always include the tile if it contains a path segment.
// For draws, include the tile if it is solid.
// For clips, include the tile if it is empty - this way, logic
// below will suppress the drawing of inner elements.
// For blends, include the tile if
// (blend_mode, composition_mode) != (Normal, SrcOver)
bool is_blend = false;
if (is_clip) {
uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
uint scene_offset = memory[drawmonoid_base + 2];
uint dd = drawdata_start + (scene_offset >> 2);
uint blend = scene[dd];
is_blend = (blend != BlendComp_clip);
}
include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
|| is_blend;
if (include_tile) {
uint el_slice = el_ix / 32;
uint el_mask = 1u << (el_ix & 31);
atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
}
}
barrier();
// Output draw objects for this tile. The thread does a sequential walk
// through the draw objects.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
while (true) {
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
if (bitmap == 0) {
continue;
}
}
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[element_ref_ix];
// Clear LSB
bitmap &= bitmap - 1;
uint drawtag = scene[drawtag_start + element_ix];
if (clip_zero_depth == 0) {
Tile tile = Tile_read(read_tile_alloc(element_ref_ix, true),
TileRef(sh_tile_base[element_ref_ix] +
(sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
uint scene_offset = memory[drawmonoid_base + 2];
uint info_offset = memory[drawmonoid_base + 3];
uint dd = drawdata_start + (scene_offset >> 2);
uint di = drawinfo_start + (info_offset >> 2);
switch (drawtag) {
case Drawtag_FillColor:
float linewidth = uintBitsToFloat(memory[di]);
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint rgba = scene[dd];
if (mem_ok) {
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
}
cmd_ref.offset += 4 + CmdColor_size;
break;
case Drawtag_FillLinGradient:
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdLinGrad cmd_lin;
cmd_lin.index = scene[dd];
cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
if (mem_ok) {
Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
}
cmd_ref.offset += 4 + CmdLinGrad_size;
break;
case Drawtag_FillRadGradient:
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdRadGrad cmd_rad;
cmd_rad.index = scene[dd];
// Given that this is basically a memcpy, we might consider
// letting the fine raster read the info itself.
cmd_rad.mat = uintBitsToFloat(uvec4(memory[di + 1], memory[di + 2],
memory[di + 3], memory[di + 4]));
cmd_rad.xlat = uintBitsToFloat(uvec2(memory[di + 5], memory[di + 6]));
cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
if (mem_ok) {
Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
}
cmd_ref.offset += 4 + CmdRadGrad_size;
break;
case Drawtag_FillImage:
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint index = scene[dd];
uint raw1 = scene[dd + 1];
ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
if (mem_ok) {
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
}
cmd_ref.offset += 4 + CmdImage_size;
break;
case Drawtag_BeginClip:
if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1;
} else {
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
if (mem_ok) {
Cmd_BeginClip_write(cmd_alloc, cmd_ref);
}
cmd_ref.offset += 4;
render_blend_depth++;
max_blend_depth = max(max_blend_depth, render_blend_depth);
}
clip_depth++;
break;
case Drawtag_EndClip:
clip_depth--;
write_fill(cmd_alloc, cmd_ref, tile, -1.0);
uint blend = scene[dd];
if (mem_ok) {
Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
}
cmd_ref.offset += 4 + CmdEndClip_size;
render_blend_depth--;
break;
}
} else {
// In "clip zero" state, suppress all drawing
switch (drawtag) {
case Drawtag_BeginClip:
clip_depth++;
break;
case Drawtag_EndClip:
if (clip_depth == clip_zero_depth) {
clip_zero_depth = 0;
}
clip_depth--;
break;
}
}
}
barrier();
rd_ix += N_TILE;
if (rd_ix >= ready_ix && partition_ix >= n_partitions)
break;
}
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
if (mem_ok) {
Cmd_End_write(cmd_alloc, cmd_ref);
}
if (max_blend_depth > BLEND_STACK_SPLIT) {
uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
uint scratch = atomicAdd(blend_offset, scratch_size);
write_mem(scratch_alloc, scratch_alloc.offset >> 2, scratch);
}
}
}