vello/piet-gpu/shader/tile_alloc.comp
Raph Levien 63ba45c774 Fix performance issues
Use larger workgroup for tile initialization (utilization was poor).
Provide correct element count to coarse rasterizer.
2020-06-03 15:32:58 -07:00

// Allocation and initialization of tiles for paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
#define LG_TILE_ALLOC_WG 8
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
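// Annotated elements (fills, strokes, ...) produced by earlier stages of the
// pipeline; only the tag and bounding box are read here.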
layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint[] annotated;
};
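// Global counts and allocation state shared across stages. Only n_elements
// (the number of annotated elements) and alloc (a bump-allocator cursor, in
// bytes, into the tile buffer) are used by this shader.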
layout(set = 0, binding = 1) buffer AllocBuf {
    uint n_elements;
    uint n_pathseg;
    uint alloc;
};
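// Raw tile storage; per-path tile ranges are allocated from this buffer and
// zero-initialized below.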
layout(set = 0, binding = 2) buffer TileBuf {
    uint[] tile;
};
#include "annotated.h"
#include "tile.h"
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
shared uint sh_tile_count[TILE_ALLOC_WG];
shared uint sh_tile_alloc;
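// Each invocation processes one element: compute its bounding box in tiles,
// prefix-sum the per-element tile counts across the workgroup, allocate
// storage for all of the workgroup's tiles with a single atomicAdd, write
// the Path header, then cooperatively zero the allocated tiles.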
void main() {
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathRef path_ref = PathRef(element_ix * Path_size);
    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);

    uint tag = Annotated_Nop;
    if (element_ix < n_elements) {
        tag = Annotated_tag(ref);
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
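    // Compute the element's bounding box in tile coordinates, rounding
    // outward so that partially covered tiles are included.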
    switch (tag) {
    case Annotated_Fill:
    case Annotated_Stroke:
        // Note: we take advantage of the fact that fills and strokes
        // have compatible layout.
        AnnoFill fill = Annotated_Fill_read(ref);
        x0 = int(floor(fill.bbox.x * SX));
        y0 = int(floor(fill.bbox.y * SY));
        x1 = int(ceil(fill.bbox.z * SX));
        y1 = int(ceil(fill.bbox.w * SY));
        break;
    }
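    // Clamp the tile bbox to the bounds of the target's tile grid.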
    x0 = clamp(x0, 0, WIDTH_IN_TILES);
    y0 = clamp(y0, 0, HEIGHT_IN_TILES);
    x1 = clamp(x1, 0, WIDTH_IN_TILES);
    y1 = clamp(y1, 0, HEIGHT_IN_TILES);

    Path path;
    path.bbox = uvec4(x0, y0, x1, y1);
    uint tile_count = (x1 - x0) * (y1 - y0);
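    // Inclusive prefix sum (Hillis-Steele scan) of tile counts in shared
    // memory; afterwards sh_tile_count[i] is the total tile count of
    // invocations 0..i in this workgroup.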
    sh_tile_count[th_ix] = tile_count;
    // Prefix sum of sh_tile_count
    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
        barrier();
        if (th_ix >= (1 << i)) {
            tile_count += sh_tile_count[th_ix - (1 << i)];
        }
        barrier();
        sh_tile_count[th_ix] = tile_count;
    }
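    // After the scan, the last invocation's tile_count is the workgroup
    // total; it bump-allocates that many tiles from the tile buffer.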
    if (th_ix == TILE_ALLOC_WG - 1) {
        sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
    }
    barrier();
    uint alloc_start = sh_tile_alloc;
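    // Convert the inclusive sum to an exclusive offset for this element and
    // record where its tiles start in the Path header.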
    uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
    path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
    Path_write(path_ref, path);

    // Zero out allocated tiles efficiently: total_count is the number of
    // 32-bit words just allocated, start_ix the word offset of the
    // allocation within the tile buffer.
    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
    uint start_ix = alloc_start >> 2;
    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
        // Note: this interleaving is faster than using Tile_write
        // by a significant amount; consecutive invocations store to
        // consecutive words.
        tile[start_ix + i] = 0;
    }
}