Better output allocation in binning

This commit is contained in:
Raph Levien 2020-05-12 19:54:19 -07:00
parent 343e4c3075
commit 64daf843b0
2 changed files with 45 additions and 24 deletions

View file

@ -49,16 +49,16 @@ shared uint sh_my_tile;
void main() { void main() {
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
uint chunk_limit = chunk_ref.offset + BIN_INITIAL_ALLOC - BinInstance_size; uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
uint chunk_n = 0; uint chunk_n = 0;
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); uint my_n_elements = n_elements;
while (true) { while (true) {
if (gl_LocalInvocationID.x == 0) { if (gl_LocalInvocationID.x == 0) {
sh_my_tile = atomicAdd(tile_ix, 1); sh_my_tile = atomicAdd(tile_ix, 1);
} }
barrier(); barrier();
uint my_tile = sh_my_tile; uint my_tile = sh_my_tile;
if (my_tile * N_TILE >= n_elements) { if (my_tile * N_TILE >= my_n_elements) {
break; break;
} }
@ -70,7 +70,10 @@ void main() {
// Read inputs and determine coverage of bins // Read inputs and determine coverage of bins
uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x; uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
uint tag = Annotated_tag(ref); uint tag = Annotated_Nop;
if (element_ix < my_n_elements) {
tag = Annotated_tag(ref);
}
int x0 = 0, y0 = 0, x1 = 0, y1 = 0; int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
switch (tag) { switch (tag) {
case Annotated_Line: case Annotated_Line:
@ -119,18 +122,43 @@ void main() {
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
} }
// element_count is number of elements covering bin for this invocation. // element_count is number of elements covering bin for this invocation.
if (element_count > 0 && chunk_n > 0) { if (element_count == 0) {
uint new_chunk = instance_ref.offset; continue;
if (new_chunk + min(32, element_count * 4) > chunk_limit) { }
new_chunk = atomicAdd(alloc, BIN_ALLOC); uint chunk_end;
chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size; uint chunk_new_start;
} // Refactor to reduce code duplication?
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk))); if (chunk_n > 0) {
chunk_ref = BinChunkRef(new_chunk); uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4;
instance_ref = BinInstanceRef(new_chunk + BinChunk_size); if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) {
chunk_n = 0; uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4);
if (alloc_amount - BIN_ALLOC < 64) {
alloc_amount = BIN_ALLOC;
}
next_chunk = atomicAdd(alloc, alloc_amount);
wr_limit = next_chunk + alloc_amount;
}
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk)));
chunk_ref = BinChunkRef(next_chunk);
}
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
if (instance_ref.offset + element_count * 4 > wr_limit) {
chunk_end = wr_limit;
chunk_n = (wr_limit - instance_ref.offset) / 4;
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4);
if (alloc_amount - BIN_ALLOC < 64) {
alloc_amount = BIN_ALLOC;
}
chunk_new_start = atomicAdd(alloc, alloc_amount);
wr_limit = chunk_new_start + alloc_amount;
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start)));
chunk_ref = BinChunkRef(chunk_new_start);
chunk_new_start += BinChunk_size;
chunk_n = element_count - chunk_n;
} else {
chunk_end = ~0;
chunk_n = element_count;
} }
// TODO: allocate output here
// Iterate over bits set. // Iterate over bits set.
uint slice_ix = 0; uint slice_ix = 0;
@ -149,17 +177,10 @@ void main() {
element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap); element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap);
// At this point, element_ix refers to an element that covers this bin. // At this point, element_ix refers to an element that covers this bin.
// TODO: batch allocated based on element_count; this is divergent if (instance_ref.offset == chunk_end) {
if (instance_ref.offset > chunk_limit) { instance_ref.offset = chunk_new_start;
uint new_chunk = atomicAdd(alloc, BIN_ALLOC);
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
chunk_ref = BinChunkRef(new_chunk);
instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
chunk_n = 0;
chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
} }
BinInstance_write(instance_ref, BinInstance(element_ix)); BinInstance_write(instance_ref, BinInstance(element_ix));
chunk_n++;
instance_ref.offset += BinInstance_size; instance_ref.offset += BinInstance_size;
// clear LSB // clear LSB
bitmap &= bitmap - 1; bitmap &= bitmap - 1;

Binary file not shown.