mirror of
https://github.com/italicsjenga/vello.git
synced 2024-10-17 23:11:30 +11:00
076e6d600d
Write the right_edge to the binning output. More work on encoding the fill/stroke distinction and plumbing that through the pipeline. This is a bit unsatisfying because of the code duplication; having an extra fill/stroke bool might be better, but I want to avoid making the structs bigger (this could be solved by better packing in the struct encoding). Fills are plumbed through to the last stage. Backdrop is WIP.
322 lines
12 KiB
Plaintext
322 lines
12 KiB
Plaintext
// The coarse rasterizer stage of the pipeline.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "setup.h"

// Workgroups are one-dimensional, N_TILE invocations wide; main() maps each
// invocation to one tile.
layout(local_size_x = N_TILE, local_size_y = 1) in;

// Storage buffers. The includes further down come after these declarations,
// so keep this order.
layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint annotated[];
};

layout(set = 0, binding = 1) buffer BinsBuf {
    uint bins[];
};

// Running allocation offset; advanced with atomicAdd whenever a new command
// or segment region is claimed.
layout(set = 0, binding = 2) buffer AllocBuf {
    uint alloc;
};

layout(set = 0, binding = 3) buffer PtclBuf {
    uint ptcl[];
};

#include "annotated.h"
#include "bins.h"
#include "ptcl.h"

// Capacity of the element ring buffers below; indices into them are taken
// modulo this value.
#define N_RINGBUF 512

// Ring buffers filled by the merge step in main().
shared uint sh_elements[N_RINGBUF];
shared float sh_right_edge[N_RINGBUF];
// Per-input-list merge state: current chunk, its successor, and its count.
shared uint sh_chunk[N_WG];
shared uint sh_chunk_next[N_WG];
shared uint sh_chunk_n[N_WG];
// Scratch for the atomicMin that picks the list with the least first element.
shared uint sh_min_buf;
// Some of these are kept in shared memory to ease register
// pressure, but it could go either way.
shared uint sh_first_el[N_WG];
shared uint sh_selected_n;
shared uint sh_elements_ref;

// One bit per element of the current batch, one word column per tile.
shared uint sh_bitmaps[N_SLICE][N_TILE];
shared uint sh_backdrop[N_SLICE][N_TILE];

// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
|
|
|
// Make sure a command can be written at cmd_ref.
//
// If cmd_ref has moved past cmd_limit, a new region of PTCL_INITIAL_ALLOC is
// claimed from `alloc`, a jump to that region is written at the current
// position, and both cmd_ref and cmd_limit are re-pointed at the new region.
// Otherwise nothing changes.
//
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset <= cmd_limit) {
        // Current region still has room.
        return;
    }
    uint region = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
    Cmd_Jump_write(cmd_ref, CmdJump(region));
    cmd_ref = CmdRef(region);
    // The limit sits two command slots short of the region's end
    // (presumably one for a command and one for the next jump).
    cmd_limit = region + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
|
|
|
|
// Ensure that there is space to encode a segment.
//
// chunk_n_segs == 0 means a new chunk list is being started: a fresh region
// is claimed if the current position is too close to seg_limit, and
// first_seg_chunk is set to the chunk's position. Otherwise, if the next
// segment slot would lie past seg_limit, the current chunk header is written
// with a link to a freshly claimed region, and writing continues there with
// chunk_n_segs reset to 0.
void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref,
    inout SegChunkRef first_seg_chunk, inout uint seg_limit)
{
    // TODO: Reduce divergence of atomic alloc?
    bool starting_list = chunk_n_segs == 0;
    if (starting_list) {
        // NOTE(review): 40 is a hard-coded headroom; presumably derived from
        // SegChunk_size and Segment_size -- confirm against ptcl.h.
        bool needs_region = seg_chunk_ref.offset + 40 > seg_limit;
        if (needs_region) {
            uint region = atomicAdd(alloc, SEG_CHUNK_ALLOC);
            seg_chunk_ref.offset = region;
            seg_limit = region + SEG_CHUNK_ALLOC - Segment_size;
        }
        first_seg_chunk = seg_chunk_ref;
        return;
    }

    uint next_slot = seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs;
    if (next_slot > seg_limit) {
        uint region = atomicAdd(alloc, SEG_CHUNK_ALLOC);
        seg_limit = region + SEG_CHUNK_ALLOC - Segment_size;
        // Close the full chunk and chain it to the new region.
        SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(region)));
        seg_chunk_ref.offset = region;
        chunk_n_segs = 0;
    }
}
|
|
|
|
// Entry point. Each workgroup handles one bin (bin_ix, derived from the
// workgroup ID) and each invocation owns one tile of it, with a command list
// that starts at tile_ix * PTCL_INITIAL_ALLOC.
//
// Every pass of the outer loop does three things:
//   1. Merge: repeatedly pick, via atomicMin, the input chunk list whose first
//      element index is smallest, and copy that chunk's instances into the
//      shared ring buffer until more than N_TILE elements are pending or all
//      lists are exhausted.
//   2. Coverage: each invocation takes one pending element, computes its
//      bounding box and per-scanline span, and sets its bit in sh_bitmaps for
//      every tile it touches.
//   3. Output: each invocation walks the bits set for its own tile in
//      increasing order and emits segments and fill/stroke commands.
void main() {
    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
    // Top left coordinates of this bin.
    vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
    uint th_ix = gl_LocalInvocationID.x;

    uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
    uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;

    // Allocation and management of segment output
    SegChunkRef seg_chunk_ref = SegChunkRef(0);
    SegChunkRef first_seg_chunk = SegChunkRef(0);
    uint seg_limit = 0;
    uint chunk_n_segs = 0;

    // Write / read cursors into the ring buffer. They only ever change by
    // values read from shared memory after a barrier, so they stay equal
    // across the workgroup.
    uint wr_ix = 0;
    uint rd_ix = 0;

    // The first N_WG invocations each load the head of one input chunk list.
    if (th_ix < N_WG) {
        uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC;
        sh_chunk[th_ix] = start_chunk;
        BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk));
        sh_chunk_n[th_ix] = chunk.n;
        sh_chunk_next[th_ix] = chunk.next.offset;
        // ~0 marks an exhausted list so it never wins the atomicMin below.
        sh_first_el[th_ix] = chunk.n > 0 ?
            BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
    }

    while (true) {
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
            sh_backdrop[i][th_ix] = 0;
        }

        while (wr_ix - rd_ix <= N_TILE) {
            // Choose segment with least element.
            uint my_min;
            if (th_ix < N_WG) {
                if (th_ix == 0) {
                    sh_selected_n = 0;
                    sh_min_buf = ~0;
                }
            }
            barrier();
            // Tempting to do this with subgroups, but atomic should be good enough.
            if (th_ix < N_WG) {
                my_min = sh_first_el[th_ix];
                atomicMin(sh_min_buf, my_min);
            }
            barrier();
            if (th_ix < N_WG) {
                if (my_min == sh_min_buf && my_min != ~0) {
                    // This list won: publish its current chunk and advance it.
                    sh_elements_ref = sh_chunk[th_ix] + BinChunk_size;
                    uint selected_n = sh_chunk_n[th_ix];
                    sh_selected_n = selected_n;
                    uint next_chunk = sh_chunk_next[th_ix];
                    if (next_chunk == 0) {
                        sh_first_el[th_ix] = ~0;
                    } else {
                        sh_chunk[th_ix] = next_chunk;
                        BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk));
                        sh_chunk_n[th_ix] = chunk.n;
                        sh_chunk_next[th_ix] = chunk.next.offset;
                        sh_first_el[th_ix] = BinInstance_read(
                            BinInstanceRef(next_chunk + BinChunk_size)).element_ix;
                    }
                }
            }
            barrier();
            uint chunk_n = sh_selected_n;
            if (chunk_n == 0) {
                // All chunks consumed
                break;
            }
            BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
            if (th_ix < chunk_n) {
                BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix));
                uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF;
                sh_elements[wr_el_ix] = inst.element_ix;
                sh_right_edge[wr_el_ix] = inst.right_edge;
            }
            wr_ix += chunk_n;
            // Every invocation must have read sh_selected_n before invocation 0
            // resets it at the top of the next iteration. Without this barrier
            // a slow invocation could observe the reset value (0), break out on
            // its own, and leave the workgroup's barriers non-uniform.
            barrier();
        }
        barrier();

        // We've done the merge and filled the buffer.

        // Read one element, compute coverage.
        uint tag = Annotated_Nop;
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            uint element_ix = sh_elements[(rd_ix + th_ix) % N_RINGBUF];
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);
        }

        // Setup for coverage algorithm. Everything is zero-initialized so the
        // no-element path below works with defined values (and an empty box).
        float a = 0.0, b = 0.0, c = 0.0;
        // Bounding box of element in pixel coordinates.
        float xmin = 0.0, xmax = 0.0, ymin = 0.0, ymax = 0.0;
        switch (tag) {
        case Annotated_FillLine:
        case Annotated_StrokeLine:
            AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
            xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
            xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
            ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
            ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
            float dx = line.p1.x - line.p0.x;
            float dy = line.p1.y - line.p0.y;
            // Set up for per-scanline coverage formula, below.
            float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
            c = abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y) * SX;
            b = invslope; // Note: assumes square tiles, otherwise scale.
            a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
            break;
        case Annotated_Fill:
        case Annotated_Stroke:
            // Note: we take advantage of the fact that fills and strokes
            // have compatible layout.
            AnnoFill fill = Annotated_Fill_read(ref);
            xmin = fill.bbox.x;
            xmax = fill.bbox.z;
            ymin = fill.bbox.y;
            ymax = fill.bbox.w;
            // Just let the clamping to xmin and xmax determine the bounds.
            a = 0.0;
            b = 0.0;
            c = 1e9;
            break;
        default:
            // No element for this invocation: ymin == ymax == 0 gives an
            // empty row range, so no bits are set.
            break;
        }

        // Draw the coverage area into the bitmasks. This uses an algorithm
        // that computes the coverage of a span for given scanline.

        // Compute bounding box in tiles and clip to this bin.
        int x0 = int(floor((xmin - xy0.x) * SX));
        int x1 = int(ceil((xmax - xy0.x) * SX));
        int y0 = int(floor((ymin - xy0.y) * SY));
        int y1 = int(ceil((ymax - xy0.y) * SY));
        x0 = clamp(x0, 0, N_TILE_X);
        x1 = clamp(x1, x0, N_TILE_X);
        y0 = clamp(y0, 0, N_TILE_Y);
        y1 = clamp(y1, y0, N_TILE_Y);
        // This invocation's element is bit (th_ix & 31) of word th_ix / 32.
        uint my_slice = th_ix / 32;
        uint my_mask = 1 << (th_ix & 31);
        float t = a + b * float(y0);
        for (uint y = y0; y < y1; y++) {
            uint xx0 = clamp(int(floor(t - c)), x0, x1);
            uint xx1 = clamp(int(ceil(t + c)), x0, x1);
            for (uint x = xx0; x < xx1; x++) {
                atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
            }
            t += b;
        }
        barrier();

        // Output elements for this tile, based on bitmaps.
        uint slice_ix = 0;
        uint bitmap = sh_bitmaps[0][th_ix];
        while (true) {
            if (bitmap == 0) {
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                if (bitmap == 0) {
                    continue;
                }
            }
            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
            uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];

            // At this point, we read the element again from global memory.
            // If that turns out to be expensive, maybe we can pack it into
            // shared memory (or perhaps just the tag).
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);

            switch (tag) {
            case Annotated_FillLine:
            case Annotated_StrokeLine:
                AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
                Segment seg = Segment(line.p0, line.p1);
                alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
                chunk_n_segs++;
                break;
            case Annotated_Fill:
                // Only emit a command if segments were accumulated for this tile.
                if (chunk_n_segs > 0) {
                    AnnoFill fill = Annotated_Fill_read(ref);
                    // Terminate the chunk list (next == 0) and step past it.
                    SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
                    seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
                    CmdFill cmd_fill;
                    cmd_fill.seg_ref = first_seg_chunk.offset;
                    cmd_fill.rgba_color = fill.rgba_color;
                    alloc_cmd(cmd_ref, cmd_limit);
                    Cmd_Fill_write(cmd_ref, cmd_fill);
                    cmd_ref.offset += Cmd_size;
                    chunk_n_segs = 0;
                }
                break;
            case Annotated_Stroke:
                // Only emit a command if segments were accumulated for this tile.
                if (chunk_n_segs > 0) {
                    AnnoStroke stroke = Annotated_Stroke_read(ref);
                    // Terminate the chunk list (next == 0) and step past it.
                    SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
                    seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
                    CmdStroke cmd_stroke;
                    cmd_stroke.seg_ref = first_seg_chunk.offset;
                    cmd_stroke.half_width = 0.5 * stroke.linewidth;
                    cmd_stroke.rgba_color = stroke.rgba_color;
                    alloc_cmd(cmd_ref, cmd_limit);
                    Cmd_Stroke_write(cmd_ref, cmd_stroke);
                    cmd_ref.offset += Cmd_size;
                    chunk_n_segs = 0;
                }
                break;
            }

            // clear LSB
            bitmap &= bitmap - 1;
        }
        barrier();

        rd_ix += N_TILE;
        // The second disjunct is there as a strange workaround on Nvidia. If it is
        // removed, then the kernel fails with ERROR_DEVICE_LOST.
        // NOTE(review): possibly related to the sh_selected_n race that the
        // barrier at the end of the merge loop closes; re-test before removing.
        if (rd_ix >= wr_ix || bin_ix == ~0) break;
    }
}
|