mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Optimize tile allocation
Use a parallel scheme to zero out tiles.
This commit is contained in:
parent
70a9c17e23
commit
ff8cee059c
|
@ -210,7 +210,7 @@ fn main() -> Result<(), Error> {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
let mut data: Vec<u32> = Default::default();
|
let mut data: Vec<u32> = Default::default();
|
||||||
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
|
device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
|
||||||
piet_gpu::dump_k1_data(&data);
|
piet_gpu::dump_k1_data(&data);
|
||||||
//trace_ptcl(&data);
|
//trace_ptcl(&data);
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -5,7 +5,8 @@
|
||||||
|
|
||||||
#include "setup.h"
|
#include "setup.h"
|
||||||
|
|
||||||
#define TILE_ALLOC_WG 32
|
#define LG_TILE_ALLOC_WG 5
|
||||||
|
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
|
||||||
|
|
||||||
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
|
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
|
||||||
|
|
||||||
|
@ -30,7 +31,11 @@ layout(set = 0, binding = 2) buffer TileBuf {
|
||||||
#define SX (1.0 / float(TILE_WIDTH_PX))
|
#define SX (1.0 / float(TILE_WIDTH_PX))
|
||||||
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
||||||
|
|
||||||
|
shared uint sh_tile_count[TILE_ALLOC_WG];
|
||||||
|
shared uint sh_tile_alloc;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
|
uint th_ix = gl_LocalInvocationID.x;
|
||||||
uint element_ix = gl_GlobalInvocationID.x;
|
uint element_ix = gl_GlobalInvocationID.x;
|
||||||
PathRef path_ref = PathRef(element_ix * Path_size);
|
PathRef path_ref = PathRef(element_ix * Path_size);
|
||||||
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
|
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
|
||||||
|
@ -59,15 +64,35 @@ void main() {
|
||||||
|
|
||||||
Path path;
|
Path path;
|
||||||
path.bbox = uvec4(x0, y0, x1, y1);
|
path.bbox = uvec4(x0, y0, x1, y1);
|
||||||
uint n_tiles = (x1 - x0) * (y1 - y0);
|
uint tile_count = (x1 - x0) * (y1 - y0);
|
||||||
path.tiles = TileRef(0);
|
uint n_tiles = tile_count;
|
||||||
if (n_tiles > 0) {
|
|
||||||
path.tiles.offset = atomicAdd(alloc, n_tiles * Tile_size);
|
sh_tile_count[th_ix] = tile_count;
|
||||||
Tile init_tile = Tile(TileSegRef(0), 0);
|
// Prefix sum of sh_tile_count
|
||||||
// TODO: improve load balancing
|
for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
|
||||||
for (uint i = 0; i < n_tiles; i++) {
|
barrier();
|
||||||
Tile_write(Tile_index(path.tiles, i), init_tile);
|
if (th_ix >= (1 << i)) {
|
||||||
|
tile_count += sh_tile_count[th_ix - (1 << i)];
|
||||||
}
|
}
|
||||||
|
barrier();
|
||||||
|
sh_tile_count[th_ix] = tile_count;
|
||||||
}
|
}
|
||||||
|
if (th_ix == TILE_ALLOC_WG - 1) {
|
||||||
|
sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
uint alloc_start = sh_tile_alloc;
|
||||||
|
|
||||||
|
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
|
||||||
|
path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
|
||||||
Path_write(path_ref, path);
|
Path_write(path_ref, path);
|
||||||
|
|
||||||
|
// Zero out allocated tiles efficiently
|
||||||
|
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
|
||||||
|
uint start_ix = alloc_start >> 2;
|
||||||
|
for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
|
||||||
|
// Note: this interleaving is faster than using Tile_write
|
||||||
|
// by a significant amount.
|
||||||
|
tile[start_ix + i] = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Loading…
Reference in a new issue