mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 20:51:29 +11:00
Better output allocation in binning
This commit is contained in:
parent
343e4c3075
commit
64daf843b0
|
@ -49,16 +49,16 @@ shared uint sh_my_tile;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
|
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
|
||||||
uint chunk_limit = chunk_ref.offset + BIN_INITIAL_ALLOC - BinInstance_size;
|
uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
|
||||||
uint chunk_n = 0;
|
uint chunk_n = 0;
|
||||||
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
|
uint my_n_elements = n_elements;
|
||||||
while (true) {
|
while (true) {
|
||||||
if (gl_LocalInvocationID.x == 0) {
|
if (gl_LocalInvocationID.x == 0) {
|
||||||
sh_my_tile = atomicAdd(tile_ix, 1);
|
sh_my_tile = atomicAdd(tile_ix, 1);
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
uint my_tile = sh_my_tile;
|
uint my_tile = sh_my_tile;
|
||||||
if (my_tile * N_TILE >= n_elements) {
|
if (my_tile * N_TILE >= my_n_elements) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -70,7 +70,10 @@ void main() {
|
||||||
// Read inputs and determine coverage of bins
|
// Read inputs and determine coverage of bins
|
||||||
uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x;
|
uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x;
|
||||||
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
|
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
|
||||||
uint tag = Annotated_tag(ref);
|
uint tag = Annotated_Nop;
|
||||||
|
if (element_ix < my_n_elements) {
|
||||||
|
tag = Annotated_tag(ref);
|
||||||
|
}
|
||||||
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
|
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
|
||||||
switch (tag) {
|
switch (tag) {
|
||||||
case Annotated_Line:
|
case Annotated_Line:
|
||||||
|
@ -119,18 +122,43 @@ void main() {
|
||||||
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
|
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
|
||||||
}
|
}
|
||||||
// element_count is number of elements covering bin for this invocation.
|
// element_count is number of elements covering bin for this invocation.
|
||||||
if (element_count > 0 && chunk_n > 0) {
|
if (element_count == 0) {
|
||||||
uint new_chunk = instance_ref.offset;
|
continue;
|
||||||
if (new_chunk + min(32, element_count * 4) > chunk_limit) {
|
|
||||||
new_chunk = atomicAdd(alloc, BIN_ALLOC);
|
|
||||||
chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
|
|
||||||
}
|
}
|
||||||
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
|
uint chunk_end;
|
||||||
chunk_ref = BinChunkRef(new_chunk);
|
uint chunk_new_start;
|
||||||
instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
|
// Refactor to reduce code duplication?
|
||||||
chunk_n = 0;
|
if (chunk_n > 0) {
|
||||||
|
uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * 4;
|
||||||
|
if (next_chunk + BinChunk_size + min(24, element_count * 4) > wr_limit) {
|
||||||
|
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * 4);
|
||||||
|
if (alloc_amount - BIN_ALLOC < 64) {
|
||||||
|
alloc_amount = BIN_ALLOC;
|
||||||
|
}
|
||||||
|
next_chunk = atomicAdd(alloc, alloc_amount);
|
||||||
|
wr_limit = next_chunk + alloc_amount;
|
||||||
|
}
|
||||||
|
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk)));
|
||||||
|
chunk_ref = BinChunkRef(next_chunk);
|
||||||
|
}
|
||||||
|
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
|
||||||
|
if (instance_ref.offset + element_count * 4 > wr_limit) {
|
||||||
|
chunk_end = wr_limit;
|
||||||
|
chunk_n = (wr_limit - instance_ref.offset) / 4;
|
||||||
|
uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * 4);
|
||||||
|
if (alloc_amount - BIN_ALLOC < 64) {
|
||||||
|
alloc_amount = BIN_ALLOC;
|
||||||
|
}
|
||||||
|
chunk_new_start = atomicAdd(alloc, alloc_amount);
|
||||||
|
wr_limit = chunk_new_start + alloc_amount;
|
||||||
|
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start)));
|
||||||
|
chunk_ref = BinChunkRef(chunk_new_start);
|
||||||
|
chunk_new_start += BinChunk_size;
|
||||||
|
chunk_n = element_count - chunk_n;
|
||||||
|
} else {
|
||||||
|
chunk_end = ~0;
|
||||||
|
chunk_n = element_count;
|
||||||
}
|
}
|
||||||
// TODO: allocate output here
|
|
||||||
|
|
||||||
// Iterate over bits set.
|
// Iterate over bits set.
|
||||||
uint slice_ix = 0;
|
uint slice_ix = 0;
|
||||||
|
@ -149,17 +177,10 @@ void main() {
|
||||||
element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap);
|
element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap);
|
||||||
// At this point, element_ix refers to an element that covers this bin.
|
// At this point, element_ix refers to an element that covers this bin.
|
||||||
|
|
||||||
// TODO: batch allocated based on element_count; this is divergent
|
if (instance_ref.offset == chunk_end) {
|
||||||
if (instance_ref.offset > chunk_limit) {
|
instance_ref.offset = chunk_new_start;
|
||||||
uint new_chunk = atomicAdd(alloc, BIN_ALLOC);
|
|
||||||
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
|
|
||||||
chunk_ref = BinChunkRef(new_chunk);
|
|
||||||
instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
|
|
||||||
chunk_n = 0;
|
|
||||||
chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
|
|
||||||
}
|
}
|
||||||
BinInstance_write(instance_ref, BinInstance(element_ix));
|
BinInstance_write(instance_ref, BinInstance(element_ix));
|
||||||
chunk_n++;
|
|
||||||
instance_ref.offset += BinInstance_size;
|
instance_ref.offset += BinInstance_size;
|
||||||
// clear LSB
|
// clear LSB
|
||||||
bitmap &= bitmap - 1;
|
bitmap &= bitmap - 1;
|
||||||
|
|
Binary file not shown.
Loading…
Reference in a new issue