Bit magic for backdrop accumulation

Use bit counting rather than iterating backdrop increments one by one.
A nice if not huge speedup.
This commit is contained in:
Raph Levien 2020-05-22 07:13:27 -07:00
parent a616b4d010
commit 7d040dff37
2 changed files with 20 additions and 18 deletions

View file

@ -80,6 +80,14 @@ void alloc_chunk(inout uint chunk_n_segs, inout SegChunkRef seg_chunk_ref,
} }
} }
// Signed sum of the backdrop deltas selected by bd_bitmap.
//
// Only bit positions set in bd_bitmap contribute. A contributing position
// adds 1 when the same bit of bd_sign is set and subtracts 1 when that bit
// is clear. Bits of bd_sign outside bd_bitmap are ignored.
int count_backdrop(uint bd_bitmap, uint bd_sign) {
    // Split the selected bits into the +1 group and the -1 group.
    uint plus_bits = bd_bitmap & bd_sign;
    uint minus_bits = bd_bitmap & ~bd_sign;
    int n_plus = bitCount(plus_bits);
    int n_minus = bitCount(minus_bits);
    return n_plus - n_minus;
}
void main() { void main() {
// Could use either linear or 2d layouts for both dispatch and // Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract. // invocations within the workgroup. We'll use variables to abstract.
@ -275,33 +283,31 @@ void main() {
uint slice_ix = 0; uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix]; uint bitmap = sh_bitmaps[0][th_ix];
uint bd_bitmap = sh_backdrop[0][th_ix]; uint bd_bitmap = sh_backdrop[0][th_ix];
uint combined = bitmap | bd_bitmap; uint bd_sign = sh_bd_sign[0];
while (true) { while (true) {
if (combined == 0) { if (bitmap == 0) {
backdrop += count_backdrop(bd_bitmap, bd_sign);
slice_ix++; slice_ix++;
if (slice_ix == N_SLICE) { if (slice_ix == N_SLICE) {
break; break;
} }
bitmap = sh_bitmaps[slice_ix][th_ix]; bitmap = sh_bitmaps[slice_ix][th_ix];
bd_bitmap = sh_backdrop[slice_ix][th_ix]; bd_bitmap = sh_backdrop[slice_ix][th_ix];
combined = bitmap | bd_bitmap; bd_sign = sh_bd_sign[slice_ix];
if (combined == 0) { if (bitmap == 0) {
continue; continue;
} }
} }
uint element_ref_ix = slice_ix * 32 + findLSB(combined); uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF]; uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF];
// TODO: use bit magic to aggregate this calculation. // Bits up to and including the lsb
if ((bd_bitmap & (1 << (element_ref_ix & 31))) != 0) { uint bd_mask = (bitmap - 1) ^ bitmap;
if ((sh_bd_sign[slice_ix] & (1 << (element_ref_ix & 31))) != 0) { backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
backdrop += 1; // Clear bits that have been consumed.
} else { bd_bitmap &= ~bd_mask;
backdrop -= 1; bitmap &= ~bd_mask;
}
}
if ((bitmap & (1 << (element_ref_ix & 31))) != 0) {
// At this point, we read the element again from global memory. // At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into // If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag). // shared memory (or perhaps just the tag).
@ -379,10 +385,6 @@ void main() {
break; break;
} }
} }
// clear LSB
combined &= combined - 1;
}
barrier(); barrier();
rd_ix += N_TILE; rd_ix += N_TILE;

Binary file not shown.