vello/piet-gpu/shader/coarse.comp
Raph Levien 868b0320a4 Render strokes
As of this point, it mostly renders stroke outlines for the tiger test
image. Some dropouts occur because the scan in the elements pass doesn't
do lookback yet; others are probably bugs.
2020-05-15 17:38:17 -07:00

// The coarse rasterizer stage of the pipeline.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint[] annotated;
};

layout(set = 0, binding = 1) buffer BinsBuf {
    uint[] bins;
};

layout(set = 0, binding = 2) buffer AllocBuf {
    uint alloc;
};

layout(set = 0, binding = 3) buffer PtclBuf {
    uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
#include "ptcl.h"
#define N_RINGBUF 512
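// The ring buffer holds merged but not-yet-processed elements. It needs to
// be comfortably larger than N_TILE plus the largest bin chunk, since the
// merge loop below does not check for overflow.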
shared uint sh_elements[N_RINGBUF];
shared uint sh_chunk[N_WG];
shared uint sh_chunk_next[N_WG];
shared uint sh_chunk_n[N_WG];
shared uint sh_min_buf;
// Some of these are kept in shared memory to ease register
// pressure, but it could go either way.
shared uint sh_first_el[N_WG];
shared uint sh_selected_n;
shared uint sh_elements_ref;
shared uint sh_bitmaps[N_SLICE][N_TILE];
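// sh_bitmaps[s][t] is a 32-bit slice of tile t's coverage bitmap: bit b of
// slice s is set when the element in ring-buffer slot s * 32 + b touches
// tile t.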
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
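// For example, with TILE_WIDTH_PX = 16 (the actual value lives in setup.h
// and may differ), a point at x = 40 within the bin lands in tile column
// int(40 * SX) = 2.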
// Perhaps cmd_limit should be a global? This is a style question.
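// cmd_limit is kept 2 * Cmd_size short of the end of the allocation: room
// for one more command plus the jump that links to the next allocation.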
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset > cmd_limit) {
        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
        CmdJump jump = CmdJump(new_cmd);
        Cmd_Jump_write(cmd_ref, jump);
        cmd_ref = CmdRef(new_cmd);
        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    }
}
// Ensure that there is space to encode a segment.
void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref,
    inout SegChunkRef first_seg_chunk, inout uint seg_limit)
{
    // TODO: Reduce divergence of atomic alloc?
    if (chunk_n_segs == 0) {
        // The 40 here looks like headroom for the chunk header plus the
        // first segment; expressing it in terms of SegChunk_size and
        // Segment_size would be clearer.
        if (seg_chunk_ref.offset + 40 > seg_limit) {
            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size;
        }
        first_seg_chunk = seg_chunk_ref;
    } else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) {
        // Current chunk is full: link a freshly allocated chunk and continue there.
        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size;
        SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref)));
        seg_chunk_ref.offset = new_chunk_ref;
        chunk_n_segs = 0;
    }
}
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
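// Here the dispatch is 2D over bins: each workgroup handles one bin of
// N_TILE_X x N_TILE_Y tiles, and each invocation owns one tile within it.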
    uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
    // Top left coordinates of this bin.
    vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
    uint th_ix = gl_LocalInvocationID.x;
    uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
    uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;

    // Allocation and management of segment output
    SegChunkRef seg_chunk_ref = SegChunkRef(0);
    SegChunkRef first_seg_chunk = SegChunkRef(0);
    uint seg_limit = 0;
    uint chunk_n_segs = 0;

    // Ring buffer cursors: slots in [rd_ix, wr_ix) hold merged elements
    // that have not yet been processed.
    uint wr_ix = 0;
    uint rd_ix = 0;
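    // Threads 0..N_WG-1 each track one input list: the chunk list written
    // for this bin by the corresponding binning workgroup. sh_first_el
    // caches the head element of each list, or ~0 once it is exhausted.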
    if (th_ix < N_WG) {
        uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC;
        sh_chunk[th_ix] = start_chunk;
        BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk));
        sh_chunk_n[th_ix] = chunk.n;
        sh_chunk_next[th_ix] = chunk.next.offset;
        sh_first_el[th_ix] = chunk.n > 0 ?
            BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
    }
    uint probe = 0; // for debugging
    do {
        // Clear the coverage bitmaps for this round.
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
        }
        // Merge elements from the N_WG input lists into the ring buffer
        // until at least N_TILE of them are available (or input runs dry).
        while (wr_ix - rd_ix <= N_TILE) {
            // Choose the input list with the least head element.
            uint my_min;
            if (th_ix == 0) {
                sh_selected_n = 0;
                // ~1 rather than ~0, so an exhausted input (head == ~0)
                // can never win the min and be selected below.
                sh_min_buf = ~1;
            }
            barrier();
            // Tempting to do this with subgroups, but atomic should be good enough.
            if (th_ix < N_WG) {
                my_min = sh_first_el[th_ix];
                atomicMin(sh_min_buf, my_min);
            }
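            // A subgroup version might look like the following sketch,
            // assuming GL_KHR_shader_subgroup_arithmetic is enabled and
            // N_WG <= gl_SubgroupSize:
            //     uint m = subgroupMin(th_ix < N_WG ? sh_first_el[th_ix] : ~0u);
            //     if (th_ix == 0) sh_min_buf = m;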
            barrier();
            if (th_ix < N_WG) {
                if (sh_first_el[th_ix] == sh_min_buf) {
                    // This thread's list won: publish its current chunk and
                    // advance the cursor to the next chunk in the list.
                    sh_elements_ref = sh_chunk[th_ix] + BinChunk_size;
                    uint selected_n = sh_chunk_n[th_ix];
                    sh_selected_n = selected_n;
                    uint next_chunk = sh_chunk_next[th_ix];
                    if (next_chunk == 0) {
                        sh_first_el[th_ix] = ~0;
                    } else {
                        sh_chunk[th_ix] = next_chunk;
                        BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk));
                        sh_chunk_n[th_ix] = chunk.n;
                        sh_chunk_next[th_ix] = chunk.next.offset;
                        sh_first_el[th_ix] = BinInstance_read(
                            BinInstanceRef(next_chunk + BinChunk_size)).element_ix;
                    }
                }
            }
            barrier();
            uint chunk_n = sh_selected_n;
            if (chunk_n == 0) {
                // All chunks consumed
                break;
            }
            BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
            if (th_ix < chunk_n) {
                uint el = BinInstance_read(BinInstance_index(inst_ref, th_ix)).element_ix;
                sh_elements[(wr_ix + th_ix) % N_RINGBUF] = el;
            }
            wr_ix += chunk_n;
        }
        // Sync so the freshly merged elements are visible to all threads.
        barrier();
        // We've done the merge and filled the buffer.

        // Read one element, compute coverage.
        uint tag = Annotated_Nop;
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            uint element_ix = sh_elements[(rd_ix + th_ix) % N_RINGBUF];
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);
        }
        int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
        switch (tag) {
        case Annotated_Line:
            AnnoLineSeg line = Annotated_Line_read(ref);
            x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x - xy0.x) * SX));
            y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y - xy0.y) * SY));
            x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x - xy0.x) * SX));
            y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y - xy0.y) * SY));
            break;
        case Annotated_Fill:
        case Annotated_Stroke:
            // Note: we take advantage of the fact that fills and strokes
            // have compatible layout.
            AnnoFill fill = Annotated_Fill_read(ref);
            x0 = int(floor((fill.bbox.x - xy0.x) * SX));
            y0 = int(floor((fill.bbox.y - xy0.y) * SY));
            x1 = int(ceil((fill.bbox.z - xy0.x) * SX));
            y1 = int(ceil((fill.bbox.w - xy0.y) * SY));
            break;
        }
        // At this point, we run an iterator over the coverage area,
        // trying to keep divergence low.
        // Right now, it's just a bbox, but we'll get finer with
        // segments.
        x0 = clamp(x0, 0, N_TILE_X);
        x1 = clamp(x1, x0, N_TILE_X);
        y0 = clamp(y0, 0, N_TILE_Y);
        y1 = clamp(y1, y0, N_TILE_Y);

        // This loop draws a rectangle to the coverage bitmasks. For
        // line segments, draw more precisely.
        // Collapse the y range when the x range is empty, so the loop
        // below is skipped entirely.
        if (x0 == x1) y1 = y0;
        int x = x0, y = y0;
        uint my_slice = th_ix / 32;
        uint my_mask = 1 << (th_ix & 31);
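        // Thread th_ix owns ring slot rd_ix + th_ix and sets bit th_ix in
        // each covered tile's bitmap: e.g. th_ix = 37 writes slice 1
        // (37 / 32) with mask 1 << 5 (37 & 31).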
        while (y < y1) {
            atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
            x++;
            if (x == x1) {
                x = x0;
                y++;
            }
        }
        barrier();

        // Output elements for this tile, based on bitmaps.
        uint slice_ix = 0;
        uint bitmap = sh_bitmaps[0][th_ix];
        while (true) {
            if (bitmap == 0) {
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                if (bitmap == 0) {
                    continue;
                }
            }
            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
            uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];

            // At this point, we read the element again from global memory.
            // If that turns out to be expensive, maybe we can pack it into
            // shared memory (or perhaps just the tag).
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);
            switch (tag) {
            case Annotated_Line:
                AnnoLineSeg line = Annotated_Line_read(ref);
                Segment seg = Segment(line.p0, line.p1);
                alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
                chunk_n_segs++;
                break;
            case Annotated_Fill:
                chunk_n_segs = 0;
                break;
            case Annotated_Stroke:
                if (chunk_n_segs > 0) {
                    // Close the open segment chunk, then emit a stroke command
                    // referencing the head of the chunk list.
                    AnnoStroke stroke = Annotated_Stroke_read(ref);
                    SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
                    seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
                    CmdStroke cmd_stroke;
                    cmd_stroke.seg_ref = first_seg_chunk.offset;
                    cmd_stroke.half_width = 0.5 * stroke.linewidth;
                    cmd_stroke.rgba_color = stroke.rgba_color;
                    alloc_cmd(cmd_ref, cmd_limit);
                    Cmd_Stroke_write(cmd_ref, cmd_stroke);
                    cmd_ref.offset += Cmd_size;
                    chunk_n_segs = 0;
                }
                break;
            }
            // Clear the LSB so the next findLSB yields the next set bit,
            // e.g. 0b0110 -> 0b0100; each element is visited exactly once.
            bitmap &= bitmap - 1;
        }
        // Sync before the next round clears and refills sh_bitmaps.
        barrier();
        rd_ix += N_TILE;
    } while (wr_ix > rd_ix);
    // Terminate this tile's command list for the fine rasterizer.
    Cmd_End_write(cmd_ref);
}