mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Bit magic for backdrop accumulation
Use bit counting rather than iterating over backdrop increments one by one. A nice, if not huge, speedup.
This commit is contained in:
parent
a616b4d010
commit
7d040dff37
|
@ -80,6 +80,14 @@ void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute the net backdrop delta encoded by a pair of bit masks.
//
// Only bits that are set in bd_bitmap take part. For each such bit, the
// corresponding bit of bd_sign picks the direction: 1 adds one to the
// result, 0 subtracts one. Bits that are clear in bd_bitmap are ignored.
int count_backdrop(uint bd_bitmap, uint bd_sign) {
    // Selected bits whose sign bit is set: each is worth +1.
    int positive = bitCount(bd_bitmap & bd_sign);
    // Selected bits whose sign bit is clear: each is worth -1.
    int negative = bitCount(bd_bitmap & ~bd_sign);
    return positive - negative;
}
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
// Could use either linear or 2d layouts for both dispatch and
|
// Could use either linear or 2d layouts for both dispatch and
|
||||||
// invocations within the workgroup. We'll use variables to abstract.
|
// invocations within the workgroup. We'll use variables to abstract.
|
||||||
|
@ -275,33 +283,31 @@ void main() {
|
||||||
uint slice_ix = 0;
|
uint slice_ix = 0;
|
||||||
uint bitmap = sh_bitmaps[0][th_ix];
|
uint bitmap = sh_bitmaps[0][th_ix];
|
||||||
uint bd_bitmap = sh_backdrop[0][th_ix];
|
uint bd_bitmap = sh_backdrop[0][th_ix];
|
||||||
uint combined = bitmap | bd_bitmap;
|
uint bd_sign = sh_bd_sign[0];
|
||||||
while (true) {
|
while (true) {
|
||||||
if (combined == 0) {
|
if (bitmap == 0) {
|
||||||
|
backdrop += count_backdrop(bd_bitmap, bd_sign);
|
||||||
slice_ix++;
|
slice_ix++;
|
||||||
if (slice_ix == N_SLICE) {
|
if (slice_ix == N_SLICE) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
bitmap = sh_bitmaps[slice_ix][th_ix];
|
bitmap = sh_bitmaps[slice_ix][th_ix];
|
||||||
bd_bitmap = sh_backdrop[slice_ix][th_ix];
|
bd_bitmap = sh_backdrop[slice_ix][th_ix];
|
||||||
combined = bitmap | bd_bitmap;
|
bd_sign = sh_bd_sign[slice_ix];
|
||||||
if (combined == 0) {
|
if (bitmap == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
uint element_ref_ix = slice_ix * 32 + findLSB(combined);
|
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
|
||||||
uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];
|
uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];
|
||||||
|
|
||||||
// TODO: use bit magic to aggregate this calculation.
|
// Bits up to and including the lsb
|
||||||
if ((bd_bitmap & (1 << (element_ref_ix & 31))) != 0) {
|
uint bd_mask = (bitmap - 1) ^ bitmap;
|
||||||
if ((sh_bd_sign[slice_ix] & (1 << (element_ref_ix & 31))) != 0) {
|
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
|
||||||
backdrop += 1;
|
// Clear bits that have been consumed.
|
||||||
} else {
|
bd_bitmap &= ~bd_mask;
|
||||||
backdrop -= 1;
|
bitmap &= ~bd_mask;
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((bitmap & (1 << (element_ref_ix & 31))) != 0) {
|
|
||||||
// At this point, we read the element again from global memory.
|
// At this point, we read the element again from global memory.
|
||||||
// If that turns out to be expensive, maybe we can pack it into
|
// If that turns out to be expensive, maybe we can pack it into
|
||||||
// shared memory (or perhaps just the tag).
|
// shared memory (or perhaps just the tag).
|
||||||
|
@ -379,10 +385,6 @@ void main() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// clear LSB
|
|
||||||
combined &= combined - 1;
|
|
||||||
}
|
|
||||||
barrier();
|
barrier();
|
||||||
|
|
||||||
rd_ix += N_TILE;
|
rd_ix += N_TILE;
|
||||||
|
|
Binary file not shown.
Loading…
Reference in a new issue