vello/piet-gpu/shader/coarse.comp
Raph Levien 55df3e6cc8 Fix linewidth math
Coarse rasterization wasn't entirely taking line width into account.

Also fix swizzle in matrix (not yet used). And fix missing End command
in ptcl output (hasn't been a problem because buffer was cleared).
2020-05-24 09:43:41 -07:00

397 lines
16 KiB
Plaintext

// The coarse rasterizer stage of the pipeline.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
layout(set = 0, binding = 2) buffer AllocBuf {
uint alloc;
};
layout(set = 0, binding = 3) buffer PtclBuf {
uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
#include "ptcl.h"
#define N_RINGBUF 512
shared uint sh_elements[N_RINGBUF];
shared float sh_right_edge[N_RINGBUF];
shared uint sh_chunk[N_WG];
shared uint sh_chunk_next[N_WG];
shared uint sh_chunk_n[N_WG];
shared uint sh_min_buf;
// Some of these are kept in shared memory to ease register
// pressure, but it could go either way.
shared uint sh_first_el[N_WG];
shared uint sh_selected_n;
shared uint sh_elements_ref;
shared uint sh_bitmaps[N_SLICE][N_TILE];
shared uint sh_backdrop[N_SLICE][N_TILE];
shared uint sh_bd_sign[N_SLICE];
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset > cmd_limit) {
uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_ref, jump);
cmd_ref = CmdRef(new_cmd);
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
}
// Ensure that there is space to encode a segment.
void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref,
inout SegChunkRef first_seg_chunk, inout uint seg_limit)
{
// TODO: Reduce divergence of atomic alloc?
if (chunk_n_segs == 0) {
if (seg_chunk_ref.offset + 40 > seg_limit) {
seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size;
}
first_seg_chunk = seg_chunk_ref;
} else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) {
uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size;
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref)));
seg_chunk_ref.offset = new_chunk_ref;
chunk_n_segs = 0;
}
}
// Accumulate delta to backdrop.
//
// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each
// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1.
int count_backdrop(uint bd_bitmap, uint bd_sign) {
return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign);
}
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
// Top left coordinates of this bin.
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
uint th_ix = gl_LocalInvocationID.x;
uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
// Allocation and management of segment output
SegChunkRef seg_chunk_ref = SegChunkRef(0);
SegChunkRef first_seg_chunk = SegChunkRef(0);
uint seg_limit = 0;
uint chunk_n_segs = 0;
uint wr_ix = 0;
uint rd_ix = 0;
uint first_el;
if (th_ix < N_WG) {
uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC;
sh_chunk[th_ix] = start_chunk;
BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk));
sh_chunk_n[th_ix] = chunk.n;
sh_chunk_next[th_ix] = chunk.next.offset;
sh_first_el[th_ix] = chunk.n > 0 ?
BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
}
if (th_ix < N_SLICE) {
sh_bd_sign[th_ix] = 0;
}
int backdrop = 0;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
sh_backdrop[i][th_ix] = 0;
}
while (wr_ix - rd_ix <= N_TILE) {
// Choose segment with least element.
uint my_min;
if (th_ix < N_WG) {
if (th_ix == 0) {
sh_selected_n = 0;
sh_min_buf = ~0;
}
}
barrier();
// Tempting to do this with subgroups, but atomic should be good enough.
if (th_ix < N_WG) {
my_min = sh_first_el[th_ix];
atomicMin(sh_min_buf, my_min);
}
barrier();
if (th_ix < N_WG) {
if (my_min == sh_min_buf && my_min != ~0) {
sh_elements_ref = sh_chunk[th_ix] + BinChunk_size;
uint selected_n = sh_chunk_n[th_ix];
sh_selected_n = selected_n;
uint next_chunk = sh_chunk_next[th_ix];
if (next_chunk == 0) {
sh_first_el[th_ix] = ~0;
} else {
sh_chunk[th_ix] = next_chunk;
BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk));
sh_chunk_n[th_ix] = chunk.n;
sh_chunk_next[th_ix] = chunk.next.offset;
sh_first_el[th_ix] = BinInstance_read(
BinInstanceRef(next_chunk + BinChunk_size)).element_ix;
}
}
}
barrier();
uint chunk_n = sh_selected_n;
if (chunk_n == 0) {
// All chunks consumed
break;
}
BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
if (th_ix < chunk_n) {
BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix));
uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF;
sh_elements[wr_el_ix] = inst.element_ix;
sh_right_edge[wr_el_ix] = inst.right_edge;
}
wr_ix += chunk_n;
}
barrier();
// We've done the merge and filled the buffer.
// Read one element, compute coverage.
uint tag = Annotated_Nop;
AnnotatedRef ref;
float right_edge = 0.0;
if (th_ix + rd_ix < wr_ix) {
uint rd_el_ix = (rd_ix + th_ix) % N_RINGBUF;
uint element_ix = sh_elements[rd_el_ix];
right_edge = sh_right_edge[rd_el_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
}
// Setup for coverage algorithm.
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
uint my_slice = th_ix / 32;
uint my_mask = 1 << (th_ix & 31);
switch (tag) {
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
float dx = line.p1.x - line.p0.x;
float dy = line.p1.y - line.p0.y;
if (tag == Annotated_FillLine) {
// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
if (dy < 0) {
atomicOr(sh_bd_sign[my_slice], my_mask);
} else {
atomicAnd(sh_bd_sign[my_slice], ~my_mask);
}
}
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
break;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
xmin = fill.bbox.x;
xmax = fill.bbox.z;
ymin = fill.bbox.y;
ymax = fill.bbox.w;
// Just let the clamping to xmin and xmax determine the bounds.
a = 0.0;
b = 0.0;
c = 1e9;
break;
default:
ymin = 0;
ymax = 0;
break;
}
// Draw the coverage area into the bitmasks. This uses an algorithm
// that computes the coverage of a span for given scanline.
// Compute bounding box in tiles and clip to this bin.
int x0 = int(floor((xmin - xy0.x) * SX));
int x1 = int(ceil((xmax - xy0.x) * SX));
int xr = int(ceil((right_edge - xy0.x) * SX));
int y0 = int(floor((ymin - xy0.y) * SY));
int y1 = int(ceil((ymax - xy0.y) * SY));
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
xr = clamp(xr, 0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
float t = a + b * float(y0);
for (uint y = y0; y < y1; y++) {
uint xx0 = clamp(int(floor(t - c)), x0, x1);
uint xx1 = clamp(int(ceil(t + c)), x0, x1);
for (uint x = xx0; x < xx1; x++) {
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
}
if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
// Assign backdrop to all tiles to the right of the ray crossing the
// top edge of this tile, up to the right edge of the fill bbox.
float xray = t - 0.5 * b;
xx0 = max(int(ceil(xray)), 0);
for (uint x = xx0; x < xr; x++) {
atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
}
}
t += b;
}
barrier();
// Output elements for this tile, based on bitmaps.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
uint bd_bitmap = sh_backdrop[0][th_ix];
uint bd_sign = sh_bd_sign[0];
while (true) {
if (bitmap == 0) {
backdrop += count_backdrop(bd_bitmap, bd_sign);
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
bd_bitmap = sh_backdrop[slice_ix][th_ix];
bd_sign = sh_bd_sign[slice_ix];
if (bitmap == 0) {
continue;
}
}
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];
// Bits up to and including the lsb
uint bd_mask = (bitmap - 1) ^ bitmap;
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
// Clear bits that have been consumed.
bd_bitmap &= ~bd_mask;
bitmap &= ~bd_mask;
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag).
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
switch (tag) {
case Annotated_FillLine:
AnnoFillLineSeg fill_line = Annotated_FillLine_read(ref);
// This is basically the same logic as piet-metal, but should be made numerically robust.
vec2 tile_xy = vec2(tile_x * TILE_WIDTH_PX, tile_y * TILE_HEIGHT_PX);
float yEdge = mix(fill_line.p0.y, fill_line.p1.y, (tile_xy.x - fill_line.p0.x) / (fill_line.p1.x - fill_line.p0.x));
if (min(fill_line.p0.x, fill_line.p1.x) < tile_xy.x && yEdge >= tile_xy.y && yEdge < tile_xy.y + TILE_HEIGHT_PX) {
Segment edge_seg;
if (fill_line.p0.x > fill_line.p1.x) {
fill_line.p1 = vec2(tile_xy.x, yEdge);
edge_seg.start = fill_line.p1;
edge_seg.end = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
} else {
fill_line.p0 = vec2(tile_xy.x, yEdge);
edge_seg.start = vec2(tile_xy.x, tile_xy.y + TILE_HEIGHT_PX);
edge_seg.end = fill_line.p0;
}
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), edge_seg);
chunk_n_segs++;
}
Segment fill_seg = Segment(fill_line.p0, fill_line.p1);
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), fill_seg);
chunk_n_segs++;
break;
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
Segment seg = Segment(line.p0, line.p1);
alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
chunk_n_segs++;
break;
case Annotated_Fill:
if (chunk_n_segs > 0) {
AnnoFill fill = Annotated_Fill_read(ref);
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
CmdFill cmd_fill;
cmd_fill.seg_ref = first_seg_chunk.offset;
cmd_fill.backdrop = backdrop;
cmd_fill.rgba_color = fill.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Fill_write(cmd_ref, cmd_fill);
cmd_ref.offset += Cmd_size;
chunk_n_segs = 0;
} else if (backdrop != 0) {
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
cmd_ref.offset += Cmd_size;
}
backdrop = 0;
break;
case Annotated_Stroke:
if (chunk_n_segs > 0) {
AnnoStroke stroke = Annotated_Stroke_read(ref);
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
CmdStroke cmd_stroke;
cmd_stroke.seg_ref = first_seg_chunk.offset;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
chunk_n_segs = 0;
}
break;
}
}
barrier();
rd_ix += N_TILE;
// The second disjunct is there as a strange workaround on Nvidia. If it is
// removed, then the kernel fails with ERROR_DEVICE_LOST.
if (rd_ix >= wr_ix || bin_ix == ~0) break;
}
Cmd_End_write(cmd_ref);
}