mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
More parallel path coarse raster
Use a fancier load-balancing algorithm for coarse rendering of paths. It seems to work and is an improvement in some cases.
This commit is contained in:
parent
877da4a98e
commit
e5dd9ae01e
|
@ -7,9 +7,10 @@
|
||||||
|
|
||||||
#include "setup.h"
|
#include "setup.h"
|
||||||
|
|
||||||
#define TILE_ALLOC_WG 32
|
#define LG_COARSE_WG 5
|
||||||
|
#define COARSE_WG (1 << LG_COARSE_WG)
|
||||||
|
|
||||||
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
|
layout(local_size_x = COARSE_WG, local_size_y = 1) in;
|
||||||
|
|
||||||
layout(set = 0, binding = 0) buffer PathSegBuf {
|
layout(set = 0, binding = 0) buffer PathSegBuf {
|
||||||
uint[] pathseg;
|
uint[] pathseg;
|
||||||
|
@ -32,7 +33,22 @@ layout(set = 0, binding = 2) buffer TileBuf {
|
||||||
#define SX (1.0 / float(TILE_WIDTH_PX))
|
#define SX (1.0 / float(TILE_WIDTH_PX))
|
||||||
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
||||||
|
|
||||||
|
shared uint sh_tile_count[COARSE_WG];
|
||||||
|
shared uint sh_width[COARSE_WG];
|
||||||
|
shared uint sh_draw_width[COARSE_WG];
|
||||||
|
shared vec2 sh_p0[COARSE_WG];
|
||||||
|
shared vec2 sh_p1[COARSE_WG];
|
||||||
|
shared int sh_x0[COARSE_WG];
|
||||||
|
shared int sh_y0[COARSE_WG];
|
||||||
|
shared float sh_a[COARSE_WG];
|
||||||
|
shared float sh_b[COARSE_WG];
|
||||||
|
shared float sh_c[COARSE_WG];
|
||||||
|
shared uint sh_base[COARSE_WG];
|
||||||
|
shared uint sh_stride[COARSE_WG];
|
||||||
|
shared uint sh_alloc_start;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
|
uint th_ix = gl_LocalInvocationID.x;
|
||||||
uint element_ix = gl_GlobalInvocationID.x;
|
uint element_ix = gl_GlobalInvocationID.x;
|
||||||
PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
|
PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
|
||||||
|
|
||||||
|
@ -49,6 +65,8 @@ void main() {
|
||||||
case PathSeg_FillLine:
|
case PathSeg_FillLine:
|
||||||
case PathSeg_StrokeLine:
|
case PathSeg_StrokeLine:
|
||||||
line = PathSeg_StrokeLine_read(ref);
|
line = PathSeg_StrokeLine_read(ref);
|
||||||
|
sh_p0[th_ix] = line.p0;
|
||||||
|
sh_p1[th_ix] = line.p1;
|
||||||
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
|
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
|
||||||
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
|
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
|
||||||
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
|
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
|
||||||
|
@ -60,6 +78,9 @@ void main() {
|
||||||
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
|
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
|
||||||
b = invslope; // Note: assumes square tiles, otherwise scale.
|
b = invslope; // Note: assumes square tiles, otherwise scale.
|
||||||
a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
|
a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
|
||||||
|
sh_a[th_ix] = a;
|
||||||
|
sh_b[th_ix] = b;
|
||||||
|
sh_c[th_ix] = c;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
int x0 = int(floor((xmin) * SX));
|
int x0 = int(floor((xmin) * SX));
|
||||||
|
@ -74,34 +95,68 @@ void main() {
|
||||||
y0 = clamp(y0, bbox.y, bbox.w);
|
y0 = clamp(y0, bbox.y, bbox.w);
|
||||||
x1 = clamp(x1, bbox.x, bbox.z);
|
x1 = clamp(x1, bbox.x, bbox.z);
|
||||||
y1 = clamp(y1, bbox.y, bbox.w);
|
y1 = clamp(y1, bbox.y, bbox.w);
|
||||||
float t = a + b * float(y0);
|
sh_x0[th_ix] = x0;
|
||||||
|
// TODO: can get rid of this (fold into base), with care (also need to update `a`)
|
||||||
|
sh_y0[th_ix] = y0;
|
||||||
int stride = bbox.z - bbox.x;
|
int stride = bbox.z - bbox.x;
|
||||||
int base = (y0 - bbox.y) * stride - bbox.x;
|
sh_stride[th_ix] = stride;
|
||||||
// TODO: can be tighter, use c to bound width
|
sh_base[th_ix] = path.tiles.offset - (bbox.y * stride + bbox.x) * Tile_size;
|
||||||
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
|
uint width = uint(x1 - x0);
|
||||||
// Consider using subgroups to aggregate atomic add.
|
sh_width[th_ix] = width;
|
||||||
uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
|
uint draw_width = min(width, uint(1.0 + ceil(2.0 * c)));
|
||||||
TileSeg tile_seg;
|
sh_draw_width[th_ix] = draw_width;
|
||||||
tile_seg.start = line.p0;
|
uint tile_count = draw_width * uint(y1 - y0);
|
||||||
tile_seg.end = line.p1;
|
|
||||||
for (int y = y0; y < y1; y++) {
|
sh_tile_count[th_ix] = tile_count;
|
||||||
|
for (uint i = 0; i < LG_COARSE_WG; i++) {
|
||||||
|
barrier();
|
||||||
|
if (th_ix >= (1 << i)) {
|
||||||
|
tile_count += sh_tile_count[th_ix - (1 << i)];
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
sh_tile_count[th_ix] = tile_count;
|
||||||
|
}
|
||||||
|
if (th_ix == COARSE_WG - 1) {
|
||||||
|
sh_alloc_start = atomicAdd(alloc, tile_count * TileSeg_size);
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
uint alloc_start = sh_alloc_start;
|
||||||
|
uint total_tile_count = sh_tile_count[COARSE_WG - 1];
|
||||||
|
|
||||||
|
for (uint ix = th_ix; ix < total_tile_count; ix += COARSE_WG) {
|
||||||
|
// Binary search to find element
|
||||||
|
uint el_ix = 0;
|
||||||
|
for (uint i = 0; i < LG_COARSE_WG; i++) {
|
||||||
|
uint probe = el_ix + ((COARSE_WG / 2) >> i);
|
||||||
|
if (ix >= sh_tile_count[probe - 1]) {
|
||||||
|
el_ix = probe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
|
||||||
|
uint draw_width = sh_draw_width[el_ix];
|
||||||
|
int x0 = sh_x0[el_ix];
|
||||||
|
int x1 = x0 + int(sh_width[el_ix]);
|
||||||
|
int dx = int(seq_ix % draw_width);
|
||||||
|
uint y = sh_y0[el_ix] + seq_ix / draw_width;
|
||||||
|
float t = sh_a[el_ix] + sh_b[el_ix] * float(y);
|
||||||
|
float c = sh_c[el_ix];
|
||||||
int xx0 = clamp(int(floor(t - c)), x0, x1);
|
int xx0 = clamp(int(floor(t - c)), x0, x1);
|
||||||
int xx1 = clamp(int(ceil(t + c)), x0, x1);
|
int xx1 = clamp(int(ceil(t + c)), x0, x1);
|
||||||
for (int x = xx0; x < xx1; x++) {
|
int x = xx0 + dx;
|
||||||
TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
|
if (x < xx1) {
|
||||||
uint tile_el = tile_ref.offset >> 2;
|
uint tile_offset = alloc_start + ix * TileSeg_size;
|
||||||
|
uint tile_el = (sh_base[el_ix] + uint(y * sh_stride[el_ix] + x) * Tile_size) >> 2;
|
||||||
uint old;
|
uint old;
|
||||||
uint actual;
|
uint actual;
|
||||||
do {
|
do {
|
||||||
old = tile[tile_el];
|
old = tile[tile_el];
|
||||||
actual = atomicCompSwap(tile[tile_el], old, tile_offset);
|
actual = atomicCompSwap(tile[tile_el], old, tile_offset);
|
||||||
} while (actual != old);
|
} while (actual != old);
|
||||||
|
TileSeg tile_seg;
|
||||||
|
tile_seg.start = sh_p0[el_ix];
|
||||||
|
tile_seg.end = sh_p1[el_ix];
|
||||||
tile_seg.next.offset = old;
|
tile_seg.next.offset = old;
|
||||||
TileSeg_write(TileSegRef(tile_offset), tile_seg);
|
TileSeg_write(TileSegRef(tile_offset), tile_seg);
|
||||||
tile_offset += TileSeg_size;
|
|
||||||
}
|
}
|
||||||
// TODO for fills: backdrop
|
|
||||||
t += b;
|
|
||||||
base += stride;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
|
@ -83,9 +83,11 @@ void main() {
|
||||||
barrier();
|
barrier();
|
||||||
uint alloc_start = sh_tile_alloc;
|
uint alloc_start = sh_tile_alloc;
|
||||||
|
|
||||||
|
if (element_ix < n_elements) {
|
||||||
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
|
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
|
||||||
path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
|
path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
|
||||||
Path_write(path_ref, path);
|
Path_write(path_ref, path);
|
||||||
|
}
|
||||||
|
|
||||||
// Zero out allocated tiles efficiently
|
// Zero out allocated tiles efficiently
|
||||||
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
|
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
|
||||||
|
|
Binary file not shown.
Loading…
Reference in a new issue