vello/piet-gpu/shader/path_coarse.comp
Raph Levien af0a1af8e1 Make fills work
The backdrop propagation is slow but it does work.
2020-06-05 22:40:44 -07:00

189 lines
6.5 KiB
Plaintext

// Coarse rasterization of path segments.
// Allocation and initialization of tiles for paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
// Workgroup size. LG_COARSE_WG is the base-2 logarithm of the size, so
// COARSE_WG is 32 invocations along x. main() relies on the power-of-two
// size: its prefix sum and its binary search both run LG_COARSE_WG steps.
#define LG_COARSE_WG 5
#define COARSE_WG (1 << LG_COARSE_WG)
layout(local_size_x = COARSE_WG, local_size_y = 1) in;
// Path segment storage, exposed as raw 32-bit words. main() addresses it with
// PathSegRef offsets (element index * PathSeg_size).
// NOTE(review): presumably read through the helpers in pathseg.h - confirm.
layout(set = 0, binding = 0) buffer PathSegBuf {
uint[] pathseg;
};
// Element counts and the allocation cursor.
layout(set = 0, binding = 1) buffer AllocBuf {
// Not referenced directly by main() in this file.
uint n_paths;
// Number of path segments; invocations whose global index is at or beyond
// this value keep the PathSeg_Nop tag.
uint n_pathseg;
// Allocation cursor. main() advances it with one atomicAdd per workgroup,
// by TileSeg_size for every tile slot the workgroup reserves.
uint alloc;
};
// Tile storage, exposed as raw 32-bit words. main() updates it with
// atomicExchange (new head of a tile's segment list) and atomicAdd (backdrop).
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
};
#include "pathseg.h"
#include "tile.h"
// Scale factors for converting pixel coordinates to tile coordinates:
// multiply an x pixel coordinate by SX (1 / tile width) and a y pixel
// coordinate by SY (1 / tile height).
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
// Workgroup-shared state, one slot per invocation (indexed by local x id).
// Each invocation publishes the setup for its own segment here so that, in
// the output loop of main(), any invocation can process any segment's tiles.

// Tile slots reserved per segment; main() converts this in place into an
// inclusive prefix sum across the workgroup.
shared uint sh_tile_count[COARSE_WG];
// Width in tiles of the segment's clamped bounding box (x1 - x0).
shared uint sh_width[COARSE_WG];
// Tile slots reserved per tile row: min(width, 1 + ceil(2 * c)).
shared uint sh_draw_width[COARSE_WG];
// Segment tag (PathSeg_Nop for invocations past n_pathseg).
shared uint sh_tag[COARSE_WG];
// Segment endpoints, as read from the path segment.
shared vec2 sh_p0[COARSE_WG];
shared vec2 sh_p1[COARSE_WG];
// Leftmost tile column of the segment's clamped bounding box.
shared int sh_x0[COARSE_WG];
// The z component of the owning path's bbox, used as the upper x limit for
// backdrop updates.
shared int sh_bbox_x1[COARSE_WG];
// First tile row of the segment's clamped bounding box.
shared int sh_y0[COARSE_WG];
// Coefficients of the per-row coverage formula: for tile row y the covered
// span is centered at a + b * y with half-width c (all in tile units).
shared float sh_a[COARSE_WG];
shared float sh_b[COARSE_WG];
shared float sh_c[COARSE_WG];
// Base offset and row stride (in tiles) used to address the owning path's
// tiles from absolute tile coordinates.
shared uint sh_base[COARSE_WG];
shared uint sh_stride[COARSE_WG];
// Value returned by the workgroup's single atomicAdd on `alloc`: the start of
// the tile segment storage reserved for this workgroup.
shared uint sh_alloc_start;
// Coarse rasterization entry point: one invocation per path segment.
//
// Phase 1 (per segment): read the segment, compute its bounding box in tile
//   coordinates clamped to the owning path's bbox, and set up the per-row
//   coverage formula (center = a + b * y, half-width = c, in tile units).
// Phase 2 (per workgroup): inclusive prefix sum of the per-segment tile slot
//   counts, then a single atomicAdd on `alloc` to reserve TileSeg storage.
// Phase 3 (per tile slot): the reserved slots are distributed evenly over the
//   invocations; each one locates its owning segment by binary search and,
//   if the tile is actually touched, pushes a TileSeg onto that tile's list.
//
// Only PathSeg_FillLine and PathSeg_StrokeLine are rasterized. Any other tag
// (including PathSeg_Nop for invocations past n_pathseg) reserves zero tile
// slots; previously such invocations fell through with uninitialized bounds,
// coverage width and path index.
void main() {
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathSegRef ref = PathSegRef(element_ix * PathSeg_size);

    // Invocations past the end of the segment list behave as no-ops.
    uint tag = PathSeg_Nop;
    if (element_ix < n_pathseg) {
        tag = PathSeg_tag(ref);
    }
    sh_tag[th_ix] = tag;

    // Stays zero unless this invocation owns a line segment. Slots of
    // zero-count segments are never selected by the binary search below, so
    // their other shared entries do not need to be written.
    uint tile_count = 0;
    if (tag == PathSeg_FillLine || tag == PathSeg_StrokeLine) {
        // Fill lines are read through the stroke-line reader as well.
        PathStrokeLine line = PathSeg_StrokeLine_read(ref);
        sh_p0[th_ix] = line.p0;
        sh_p1[th_ix] = line.p1;

        // Bounding box of element in pixel coordinates.
        float xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
        float xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
        float ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
        float ymax = max(line.p0.y, line.p1.y) + line.stroke.y;

        // Set up for per-scanline coverage formula, below.
        float dx = line.p1.x - line.p0.x;
        float dy = line.p1.y - line.p0.y;
        // Near-horizontal lines get a huge slope so the span covers the row.
        float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
        float c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
        float b = invslope; // Note: assumes square tiles, otherwise scale.
        float a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
        sh_a[th_ix] = a;
        sh_b[th_ix] = b;
        sh_c[th_ix] = c;

        // Bounding box in tile coordinates, clamped to the path's bbox.
        int x0 = int(floor((xmin) * SX));
        int x1 = int(ceil((xmax) * SX));
        int y0 = int(floor((ymin) * SY));
        int y1 = int(ceil((ymax) * SY));

        uint path_ix = line.path_ix;
        Path path = Path_read(PathRef(path_ix * Path_size));
        ivec4 bbox = ivec4(path.bbox);
        x0 = clamp(x0, bbox.x, bbox.z);
        y0 = clamp(y0, bbox.y, bbox.w);
        x1 = clamp(x1, bbox.x, bbox.z);
        y1 = clamp(y1, bbox.y, bbox.w);
        sh_x0[th_ix] = x0;
        sh_bbox_x1[th_ix] = bbox.z;
        // TODO: can get rid of this (fold into base), with care (also need to update `a`)
        sh_y0[th_ix] = y0;

        // Base is biased by the bbox origin so tiles can be addressed with
        // absolute tile coordinates (y * stride + x).
        int stride = bbox.z - bbox.x;
        sh_stride[th_ix] = stride;
        sh_base[th_ix] = path.tiles.offset - (bbox.y * stride + bbox.x) * Tile_size;

        uint width = uint(x1 - x0);
        sh_width[th_ix] = width;
        // A row never needs more slots than the coverage span is wide.
        uint draw_width = min(width, uint(1.0 + ceil(2.0 * c)));
        sh_draw_width[th_ix] = draw_width;
        tile_count = draw_width * uint(y1 - y0);
    }
    sh_tile_count[th_ix] = tile_count;

    // Inclusive prefix sum of tile counts over the workgroup; each step adds
    // the partial sum from the slot (1 << i) positions to the left.
    for (uint i = 0; i < LG_COARSE_WG; i++) {
        barrier();
        if (th_ix >= (1 << i)) {
            tile_count += sh_tile_count[th_ix - (1 << i)];
        }
        barrier();
        sh_tile_count[th_ix] = tile_count;
    }

    // The last invocation holds the workgroup total and reserves the storage.
    if (th_ix == COARSE_WG - 1) {
        sh_alloc_start = atomicAdd(alloc, tile_count * TileSeg_size);
    }
    barrier();
    uint alloc_start = sh_alloc_start;
    uint total_tile_count = sh_tile_count[COARSE_WG - 1];

    // Distribute the reserved tile slots evenly over the invocations.
    for (uint ix = th_ix; ix < total_tile_count; ix += COARSE_WG) {
        // Binary search to find element
        uint el_ix = 0;
        for (uint i = 0; i < LG_COARSE_WG; i++) {
            uint probe = el_ix + ((COARSE_WG / 2) >> i);
            if (ix >= sh_tile_count[probe - 1]) {
                el_ix = probe;
            }
        }
        // Index of this slot within the owning segment's slots.
        uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
        uint draw_width = sh_draw_width[el_ix];
        int x0 = sh_x0[el_ix];
        int x1 = x0 + int(sh_width[el_ix]);
        // Slots are laid out row by row, draw_width slots per tile row.
        int dx = int(seq_ix % draw_width);
        uint y = sh_y0[el_ix] + seq_ix / draw_width;

        // Span of tiles covered in row y: [t - c, t + c], clamped to the bbox.
        float b = sh_b[el_ix];
        float t = sh_a[el_ix] + b * float(y);
        float c = sh_c[el_ix];
        int xx0 = clamp(int(floor(t - c)), x0, x1);
        int xx1 = clamp(int(ceil(t + c)), x0, x1);
        int x = xx0 + dx;

        // Slots beyond the end of the row's span stay reserved but unused.
        if (x < xx1) {
            uint tile_offset = alloc_start + ix * TileSeg_size;
            // NOTE(review): the >> 2 presumably converts a byte offset to a
            // word index into tile[] - confirm against tile.h.
            uint tile_el = (sh_base[el_ix] + uint(y * sh_stride[el_ix] + x) * Tile_size) >> 2;
            // Push onto the tile's list: the new segment becomes the head
            // and links to the previous head.
            uint old = atomicExchange(tile[tile_el], tile_offset);
            TileSeg tile_seg;
            vec2 p0 = sh_p0[el_ix];
            vec2 p1 = sh_p1[el_ix];
            float y_edge = 0.0;
            if (sh_tag[el_ix] == PathSeg_FillLine) {
                // Pixel coordinates of the tile's origin corner.
                vec2 tile_xy = vec2(x * TILE_WIDTH_PX, y * TILE_HEIGHT_PX);
                // Once per row (first slot), if the segment starts at or
                // above the row's top edge, bump the backdrop of the tile at
                // column xray by +1 or -1 according to the line's direction.
                if (dx == 0 && min(p0.y, p1.y) <= tile_xy.y) {
                    // TODO: need a little more work to make sure this triggers even
                    // when line is to the left of bbox.
                    int xray = max(int(ceil(t - 0.5 * b)), x0);
                    if (xray < sh_bbox_x1[el_ix]) {
                        int backdrop = p1.y < p0.y ? 1 : -1;
                        // NOTE(review): assumes each tile is two words with
                        // the backdrop in the second - confirm in tile.h.
                        atomicAdd(tile[tile_el + 1 + 2 * (xray - x)], backdrop);
                    }
                }
                // y at which the line crosses the tile's x = tile_xy.x edge.
                y_edge = mix(p0.y, p1.y, (tile_xy.x - p0.x) / (p1.x - p0.x));
                if (min(p0.x, p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) {
                    // The crossing lies within this tile's rows: clip the
                    // endpoint that is beyond the edge onto the edge.
                    if (p0.x > p1.x) {
                        p1 = vec2(tile_xy.x, y_edge);
                    } else {
                        p0 = vec2(tile_xy.x, y_edge);
                    }
                } else {
                    // Sentinel: no crossing of that edge within this tile.
                    y_edge = 1e9;
                }
            }
            tile_seg.start = p0;
            tile_seg.end = p1;
            tile_seg.y_edge = y_edge;
            tile_seg.next.offset = old;
            TileSeg_write(TileSegRef(tile_offset), tile_seg);
        }
    }
}