Use different output strategy for binning

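Instead of having each bin walk the set bits of its bitmap, have each element
iterate over the bounding box of bins it touches and write its own output entry.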

Seems to work, and is a dramatic improvement.
This commit is contained in:
Raph Levien 2020-05-12 21:26:44 -07:00
parent 64daf843b0
commit 9a0b17ff5b
2 changed files with 59 additions and 52 deletions

View file

@ -45,7 +45,11 @@ layout(set = 0, binding = 2) buffer BinsBuf {
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
shared uint bitmaps[N_SLICE][N_TILE]; shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared uint sh_my_tile; shared uint sh_my_tile;
shared uint sh_chunk_start[N_TILE];
shared uint sh_chunk_end[N_TILE];
shared uint sh_chunk_jump[N_TILE];
void main() { void main() {
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
@ -120,11 +124,10 @@ void main() {
uint element_count = 0; uint element_count = 0;
for (uint i = 0; i < N_SLICE; i++) { for (uint i = 0; i < N_SLICE; i++) {
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
count[i][gl_LocalInvocationID.x] = element_count;
} }
// element_count is number of elements covering bin for this invocation. // element_count is number of elements covering bin for this invocation.
if (element_count == 0) { if (element_count != 0) {
continue;
}
uint chunk_end; uint chunk_end;
uint chunk_new_start; uint chunk_new_start;
// Refactor to reduce code duplication? // Refactor to reduce code duplication?
@ -159,31 +162,35 @@ void main() {
chunk_end = ~0; chunk_end = ~0;
chunk_n = element_count; chunk_n = element_count;
} }
sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
sh_chunk_end[gl_LocalInvocationID.x] = chunk_end;
sh_chunk_jump[gl_LocalInvocationID.x] = chunk_new_start - chunk_end;
}
// Iterate over bits set. barrier();
uint slice_ix = 0; // Use similar strategy as Laine & Karras paper; loop over bbox of bins
uint bitmap = bitmaps[0][gl_LocalInvocationID.x]; // touched by this element
while (true) { x = x0;
if (bitmap == 0) { y = y0;
slice_ix++; while (y < y1) {
if (slice_ix == N_SLICE) { uint bin_ix = y * N_TILE_X + x;
break; uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0) {
uint idx = bitCount(out_mask & (my_mask - 1));
if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix];
} }
bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x]; uint out_offset = sh_chunk_start[bin_ix] + idx * 4;
if (bitmap == 0) { if (out_offset >= sh_chunk_end[bin_ix]) {
continue; out_offset += sh_chunk_jump[bin_ix];
} }
BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix));
} }
element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap); x++;
// At this point, element_ix refers to an element that covers this bin. if (x == x1) {
x = x0;
if (instance_ref.offset == chunk_end) { y++;
instance_ref.offset = chunk_new_start;
} }
BinInstance_write(instance_ref, BinInstance(element_ix));
instance_ref.offset += BinInstance_size;
// clear LSB
bitmap &= bitmap - 1;
} }
} }
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0))); BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0)));

Binary file not shown.