vello/piet-gpu/shader/coarse.comp
Elias Naur 8fab45544e shader: implement clip paths
Expand the the final kernel4 stage to maintain a per-pixel mask.

Introduce two new path elements, FillMask and FillMaskInv, to fill
the mask. FillMask acts like Fill, while FillMaskInv fills the area
outside the path.

SVG clipPaths is then representable by a FillMaskInv(0.0) for every nested
path, preceded by a FillMask(1.0) to clear the mask.

The bounding box for FillMaskInv elements is the entire screen; tightening of
the bounding box is left for future work. Note that a fullscreen bounding
box is not hopelessly inefficient because completely filling a tile with
a mask is just a single CmdSolidMask per tile.

Fixes #30

Signed-off-by: Elias Naur <mail@eliasnaur.com>
2020-10-09 13:20:26 +02:00

336 lines
13 KiB
Plaintext

// The coarse rasterizer stage of the pipeline.
//
// As input we have the ordered partitions of paths from the binning phase and
// the annotated tile list of segments and backdrop per path.
//
// Each workgroup operating on one bin by stream compacting
// the elements corresponding to the bin.
//
// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint n_elements;
uint alloc;
};
layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
#include "tile.h"
#include "ptcl.h"
#define LG_N_PART_READ (7 + LG_WG_FACTOR)
#define N_PART_READ (1 << LG_N_PART_READ)
shared uint sh_elements[N_TILE];
shared float sh_right_edge[N_TILE];
// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
shared uint sh_part_elements[N_PART_READ];
shared uint sh_bitmaps[N_SLICE][N_TILE];
shared uint sh_tile_count[N_TILE];
// The width of the tile rect for the element, intersected with this bin
shared uint sh_tile_width[N_TILE];
shared uint sh_tile_x0[N_TILE];
shared uint sh_tile_y0[N_TILE];
// These are set up so base + tile_y * stride + tile_x points to a Tile.
shared uint sh_tile_base[N_TILE];
shared uint sh_tile_stride[N_TILE];
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset > cmd_limit) {
uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_ref, jump);
cmd_ref = CmdRef(new_cmd);
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
}
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
uint partition_ix = 0;
uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
uint th_ix = gl_LocalInvocationID.x;
// Coordinates of top left of bin, in tiles.
uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x;
CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
// I'm sure we can figure out how to do this with at least one fewer register...
// Items up to rd_ix have been read from sh_elements
uint rd_ix = 0;
// Items up to wr_ix have been written into sh_elements
uint wr_ix = 0;
// Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
uint part_start_ix = 0;
uint ready_ix = 0;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
}
// parallel read of input partitions
do {
if (ready_ix == wr_ix && partition_ix < n_partitions) {
part_start_ix = ready_ix;
uint count = 0;
if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = bins[in_ix];
sh_part_elements[th_ix] = bins[in_ix + 1];
}
// prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) {
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = count;
}
barrier();
if (th_ix < N_PART_READ) {
if (th_ix >= (1 << i)) {
count += sh_part_count[th_ix - (1 << i)];
}
}
barrier();
}
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = part_start_ix + count;
}
barrier();
ready_ix = sh_part_count[N_PART_READ - 1];
partition_ix += N_PART_READ;
}
// use binary search to find element to read
uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + ((N_PART_READ / 2) >> i);
if (ix >= sh_part_count[probe - 1]) {
part_ix = probe;
}
}
ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
sh_elements[th_ix] = inst.element_ix;
sh_right_edge[th_ix] = inst.right_edge;
}
barrier();
wr_ix = min(rd_ix + N_TILE, ready_ix);
} while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
// We've done the merge and filled the buffer.
// Read one element, compute coverage.
uint tag = Annotated_Nop;
uint element_ix;
AnnotatedRef ref;
float right_edge = 0.0;
if (th_ix + rd_ix < wr_ix) {
element_ix = sh_elements[th_ix];
right_edge = sh_right_edge[th_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
}
// Bounding box of element in pixel coordinates.
uint tile_count;
switch (tag) {
case Annotated_Fill:
case Annotated_FillMask:
case Annotated_FillMaskInv:
case Annotated_Stroke:
// Because the only elements we're processing right now are
// paths, we can just use the element index as the path index.
// In future, when we're doing a bunch of stuff, the path index
// should probably be stored in the annotated element.
uint path_ix = element_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
uint stride = path.bbox.z - path.bbox.x;
sh_tile_stride[th_ix] = stride;
int dx = int(path.bbox.x) - int(bin_tile_x);
int dy = int(path.bbox.y) - int(bin_tile_y);
int x0 = clamp(dx, 0, N_TILE_X);
int y0 = clamp(dy, 0, N_TILE_Y);
int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
sh_tile_width[th_ix] = uint(x1 - x0);
sh_tile_x0[th_ix] = x0;
sh_tile_y0[th_ix] = y0;
tile_count = uint(x1 - x0) * uint(y1 - y0);
// base relative to bin
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base;
break;
default:
tile_count = 0;
break;
}
// Prefix sum of sh_tile_count
sh_tile_count[th_ix] = tile_count;
for (uint i = 0; i < LG_N_TILE; i++) {
barrier();
if (th_ix >= (1 << i)) {
tile_count += sh_tile_count[th_ix - (1 << i)];
}
barrier();
sh_tile_count[th_ix] = tile_count;
}
barrier();
uint total_tile_count = sh_tile_count[N_TILE - 1];
for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
// Binary search to find element
uint el_ix = 0;
for (uint i = 0; i < LG_N_TILE; i++) {
uint probe = el_ix + ((N_TILE / 2) >> i);
if (ix >= sh_tile_count[probe - 1]) {
el_ix = probe;
}
}
AnnotatedRef ref = AnnotatedRef(el_ix * Annotated_size);
uint tag = Annotated_tag(ref);
uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
uint width = sh_tile_width[el_ix];
uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width;
Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
// Include the path in the tile if
// - the tile contains at least a segment (tile offset non-zero)
// - the tile is completely covered (backdrop non-zero)
// - the tile is not covered and we're filling everything outside the path (backdrop zero, inverse fills).
bool inside = tile.backdrop != 0;
bool fill = tag != Annotated_FillMaskInv;
if (tile.tile.offset != 0 || inside == fill) {
uint el_slice = el_ix / 32;
uint el_mask = 1 << (el_ix & 31);
atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
}
}
barrier();
// Output non-segment elements for this tile. The thread does a sequential walk
// through the non-segment elements, and for segments, count and backdrop are
// aggregated using bit counting.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
while (true) {
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
if (bitmap == 0) {
continue;
}
}
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[element_ref_ix];
// Clear LSB
bitmap &= bitmap - 1;
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag).
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
switch (tag) {
case Annotated_Fill:
Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
if (tile.tile.offset != 0) {
CmdFill cmd_fill;
cmd_fill.tile_ref = tile.tile.offset;
cmd_fill.backdrop = tile.backdrop;
cmd_fill.rgba_color = fill.rgba_color;
Cmd_Fill_write(cmd_ref, cmd_fill);
} else {
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
}
cmd_ref.offset += Cmd_size;
break;
case Annotated_FillMask:
case Annotated_FillMaskInv:
tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoFillMask fill_mask = Annotated_FillMask_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
if (tile.tile.offset != 0) {
CmdFillMask cmd_fill;
cmd_fill.tile_ref = tile.tile.offset;
cmd_fill.backdrop = tile.backdrop;
cmd_fill.mask = fill_mask.mask;
if (tag == Annotated_FillMask) {
Cmd_FillMask_write(cmd_ref, cmd_fill);
} else {
Cmd_FillMaskInv_write(cmd_ref, cmd_fill);
}
} else {
Cmd_SolidMask_write(cmd_ref, CmdSolidMask(fill_mask.mask));
}
cmd_ref.offset += Cmd_size;
break;
case Annotated_Stroke:
tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.tile_ref = tile.tile.offset;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
break;
}
}
barrier();
rd_ix += N_TILE;
if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
}
Cmd_End_write(cmd_ref);
}