Boost backdrop parallelism for the prefix sums

This commit is contained in:
Ishi Tatsuyuki 2021-05-27 11:32:33 +09:00
parent 8b65942f65
commit c2772ceac7
2 changed files with 41 additions and 31 deletions

View file

@ -2,9 +2,10 @@
// Propagation of tile backdrop for filling. // Propagation of tile backdrop for filling.
// //
// Each thread reads one path element and calculates the number of spanned tiles // Each thread reads one path element and calculates the row and column counts of spanned tiles
// based on the bounding box. // based on the bounding box.
// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel. // The row count then goes through a prefix sum to redistribute and load-balance the work across the workgroup.
// In the following step, the workgroup loops over the corresponding tile rows per element in parallel.
// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel, // For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
// and propagated from the left to the right (prefix summed). // and propagated from the left to the right (prefix summed).
// //
@ -20,8 +21,13 @@
#define LG_BACKDROP_WG (7 + LG_WG_FACTOR) #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
#define BACKDROP_WG (1 << LG_BACKDROP_WG) #define BACKDROP_WG (1 << LG_BACKDROP_WG)
// Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR (used as
// local_size_y) defines how many additional threads we spawn for parallel row processing. The additional threads do
// not participate in the earlier stages (calculating the tile counts), but they do work in the final prefix sum
// stage, which has a lot more parallelism.
#define BACKDROP_DIST_FACTOR 4
layout(local_size_x = BACKDROP_WG, local_size_y = 1) in; layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf { layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf; Config conf;
@ -35,13 +41,14 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG]; shared uint sh_row_width[BACKDROP_WG];
void main() { void main() {
uint th_ix = gl_LocalInvocationID.x; uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
// Work assignment: 1 thread : 1 path element // Work assignment: 1 thread : 1 path element
uint row_count = 0; uint row_count = 0;
bool mem_ok = mem_error == NO_ERROR; bool mem_ok = mem_error == NO_ERROR;
if (gl_LocalInvocationID.y == 0) {
if (element_ix < conf.n_elements) { if (element_ix < conf.n_elements) {
AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref); AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
switch (tag.tag) { switch (tag.tag) {
@ -68,21 +75,24 @@ void main() {
sh_row_alloc[th_ix] = path_alloc; sh_row_alloc[th_ix] = path_alloc;
} }
} }
sh_row_count[th_ix] = row_count; sh_row_count[th_ix] = row_count;
}
// Prefix sum of sh_row_count // Prefix sum of sh_row_count
for (uint i = 0; i < LG_BACKDROP_WG; i++) { for (uint i = 0; i < LG_BACKDROP_WG; i++) {
barrier(); barrier();
if (th_ix >= (1 << i)) { if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) {
row_count += sh_row_count[th_ix - (1 << i)]; row_count += sh_row_count[th_ix - (1 << i)];
} }
barrier(); barrier();
if (gl_LocalInvocationID.y == 0) {
sh_row_count[th_ix] = row_count; sh_row_count[th_ix] = row_count;
} }
}
barrier(); barrier();
// Work assignment: 1 thread : 1 path element row // Work assignment: 1 thread : 1 path element row
uint total_rows = sh_row_count[BACKDROP_WG - 1]; uint total_rows = sh_row_count[BACKDROP_WG - 1];
for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) { for (uint row = th_ix; row < total_rows; row += BACKDROP_WG * BACKDROP_DIST_FACTOR) {
// Binary search to find element // Binary search to find element
uint el_ix = 0; uint el_ix = 0;
for (uint i = 0; i < LG_BACKDROP_WG; i++) { for (uint i = 0; i < LG_BACKDROP_WG; i++) {

Binary file not shown.