mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 20:51:29 +11:00
Use different output strategy for binning
Iterate over bin bounding box. Seems to work, and is a dramatic improvement.
This commit is contained in:
parent
64daf843b0
commit
9a0b17ff5b
|
@ -45,7 +45,11 @@ layout(set = 0, binding = 2) buffer BinsBuf {
|
||||||
|
|
||||||
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
|
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
|
||||||
shared uint bitmaps[N_SLICE][N_TILE];
|
shared uint bitmaps[N_SLICE][N_TILE];
|
||||||
|
shared uint count[N_SLICE][N_TILE];
|
||||||
shared uint sh_my_tile;
|
shared uint sh_my_tile;
|
||||||
|
shared uint sh_chunk_start[N_TILE];
|
||||||
|
shared uint sh_chunk_end[N_TILE];
|
||||||
|
shared uint sh_chunk_jump[N_TILE];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
|
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
|
||||||
|
@ -120,70 +124,73 @@ void main() {
|
||||||
uint element_count = 0;
|
uint element_count = 0;
|
||||||
for (uint i = 0; i < N_SLICE; i++) {
|
for (uint i = 0; i < N_SLICE; i++) {
|
||||||
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
|
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
|
||||||
|
count[i][gl_LocalInvocationID.x] = element_count;
|
||||||
}
|
}
|
||||||
// element_count is number of elements covering bin for this invocation.
|
// element_count is number of elements covering bin for this invocation.
|
||||||
if (element_count == 0) {
|
if (element_count != 0) {
|
||||||
continue;
|
uint chunk_end;
|
||||||
}
|
uint chunk_new_start;
|
||||||
uint chunk_end;
|
// Refactor to reduce code duplication?
|
||||||
uint chunk_new_start;
|
if (chunk_n > 0) {
|
||||||
// Refactor to reduce code duplication?
|
uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4;
|
||||||
if (chunk_n > 0) {
|
if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) {
|
||||||
uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4;
|
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4);
|
||||||
if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) {
|
if (alloc_amount - BIN_ALLOC < 64) {
|
||||||
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4);
|
alloc_amount = BIN_ALLOC;
|
||||||
|
}
|
||||||
|
next_chunk = atomicAdd(alloc, alloc_amount);
|
||||||
|
wr_limit = next_chunk + alloc_amount;
|
||||||
|
}
|
||||||
|
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk)));
|
||||||
|
chunk_ref = BinChunkRef(next_chunk);
|
||||||
|
}
|
||||||
|
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
|
||||||
|
if (instance_ref.offset + element_count * 4 > wr_limit) {
|
||||||
|
chunk_end = wr_limit;
|
||||||
|
chunk_n = (wr_limit - instance_ref.offset) / 4;
|
||||||
|
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4);
|
||||||
if (alloc_amount - BIN_ALLOC < 64) {
|
if (alloc_amount - BIN_ALLOC < 64) {
|
||||||
alloc_amount = BIN_ALLOC;
|
alloc_amount = BIN_ALLOC;
|
||||||
}
|
}
|
||||||
next_chunk = atomicAdd(alloc, alloc_amount);
|
chunk_new_start = atomicAdd(alloc, alloc_amount);
|
||||||
wr_limit = next_chunk + alloc_amount;
|
wr_limit = chunk_new_start + alloc_amount;
|
||||||
|
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start)));
|
||||||
|
chunk_ref = BinChunkRef(chunk_new_start);
|
||||||
|
chunk_new_start += BinChunk_size;
|
||||||
|
chunk_n = element_count - chunk_n;
|
||||||
|
} else {
|
||||||
|
chunk_end = ~0;
|
||||||
|
chunk_n = element_count;
|
||||||
}
|
}
|
||||||
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk)));
|
sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
|
||||||
chunk_ref = BinChunkRef(next_chunk);
|
sh_chunk_end[gl_LocalInvocationID.x] = chunk_end;
|
||||||
}
|
sh_chunk_jump[gl_LocalInvocationID.x] = chunk_new_start - chunk_end;
|
||||||
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
|
|
||||||
if (instance_ref.offset + element_count * 4 > wr_limit) {
|
|
||||||
chunk_end = wr_limit;
|
|
||||||
chunk_n = (wr_limit - instance_ref.offset) / 4;
|
|
||||||
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4);
|
|
||||||
if (alloc_amount - BIN_ALLOC < 64) {
|
|
||||||
alloc_amount = BIN_ALLOC;
|
|
||||||
}
|
|
||||||
chunk_new_start = atomicAdd(alloc, alloc_amount);
|
|
||||||
wr_limit = chunk_new_start + alloc_amount;
|
|
||||||
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start)));
|
|
||||||
chunk_ref = BinChunkRef(chunk_new_start);
|
|
||||||
chunk_new_start += BinChunk_size;
|
|
||||||
chunk_n = element_count - chunk_n;
|
|
||||||
} else {
|
|
||||||
chunk_end = ~0;
|
|
||||||
chunk_n = element_count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Iterate over bits set.
|
barrier();
|
||||||
uint slice_ix = 0;
|
// Use similar strategy as Laine & Karras paper; loop over bbox of bins
|
||||||
uint bitmap = bitmaps[0][gl_LocalInvocationID.x];
|
// touched by this element
|
||||||
while (true) {
|
x = x0;
|
||||||
if (bitmap == 0) {
|
y = y0;
|
||||||
slice_ix++;
|
while (y < y1) {
|
||||||
if (slice_ix == N_SLICE) {
|
uint bin_ix = y * N_TILE_X + x;
|
||||||
break;
|
uint out_mask = bitmaps[my_slice][bin_ix];
|
||||||
|
if ((out_mask & my_mask) != 0) {
|
||||||
|
uint idx = bitCount(out_mask & (my_mask - 1));
|
||||||
|
if (my_slice > 0) {
|
||||||
|
idx += count[my_slice - 1][bin_ix];
|
||||||
}
|
}
|
||||||
bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x];
|
uint out_offset = sh_chunk_start[bin_ix] + idx * 4;
|
||||||
if (bitmap == 0) {
|
if (out_offset >= sh_chunk_end[bin_ix]) {
|
||||||
continue;
|
out_offset += sh_chunk_jump[bin_ix];
|
||||||
}
|
}
|
||||||
|
BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix));
|
||||||
}
|
}
|
||||||
element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap);
|
x++;
|
||||||
// At this point, element_ix refers to an element that covers this bin.
|
if (x == x1) {
|
||||||
|
x = x0;
|
||||||
if (instance_ref.offset == chunk_end) {
|
y++;
|
||||||
instance_ref.offset = chunk_new_start;
|
|
||||||
}
|
}
|
||||||
BinInstance_write(instance_ref, BinInstance(element_ix));
|
|
||||||
instance_ref.offset += BinInstance_size;
|
|
||||||
// clear LSB
|
|
||||||
bitmap &= bitmap - 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0)));
|
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0)));
|
||||||
|
|
Binary file not shown.
Loading…
Reference in a new issue