mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 20:51:29 +11:00
54 lines
1.3 KiB
GLSL
54 lines
1.3 KiB
GLSL
|
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
||
|
|
||
|
// The reduction phase for prefix sum implemented as a tree reduction.
|
||
|
|
||
|
#version 450
|
||
|
|
||
|
// Number of consecutive input elements that each invocation folds together
// sequentially in main() before the workgroup-wide tree reduction starts.
#define N_ROWS 8
|
||
|
// Base-2 logarithm of the workgroup size. main() also uses it as the number
// of rounds in the shared-memory tree reduction.
#define LG_WG_SIZE 9
|
||
|
// Invocations per workgroup (2^LG_WG_SIZE = 512); also the length of the
// shared scratch array.
#define WG_SIZE (1 << LG_WG_SIZE)
|
||
|
// Number of input elements covered by one workgroup (WG_SIZE * N_ROWS = 4096).
// Not referenced by the visible part of this shader.
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
|
||
|
|
||
|
// One-dimensional workgroup of WG_SIZE invocations along x.
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
|
||
|
|
||
|
// Element type of the reduction. It wraps a single unsigned integer that
// combine_monoid() adds.
struct Monoid
{
    uint element;
};
|
||
|
|
||
|
// Read-only storage buffer holding the elements to be reduced
// (descriptor set 0, binding 0).
layout(set = 0, binding = 0) readonly buffer InBuf
{
    Monoid inbuf[];
};
|
||
|
|
||
|
// Storage buffer receiving one aggregate per workgroup, indexed by
// gl_WorkGroupID.x (descriptor set 0, binding 1).
layout(set = 0, binding = 1) buffer OutBuf
{
    Monoid outbuf[];
};
|
||
|
|
||
|
// Workgroup-shared scratch space with one slot per invocation; main() uses it
// to exchange partial aggregates between rounds of the tree reduction.
shared Monoid sh_scratch[WG_SIZE];
|
||
|
|
||
|
// Binary operation of the reduction: returns a Monoid whose element is the
// unsigned sum of the two operands' elements.
Monoid combine_monoid(Monoid a, Monoid b) {
    Monoid sum;
    sum.element = a.element + b.element;
    return sum;
}
|
||
|
|
||
|
// Entry point. Each workgroup reduces WG_SIZE * N_ROWS consecutive input
// elements to a single Monoid and stores it at outbuf[gl_WorkGroupID.x].
//
// Phase 1: every invocation sequentially folds its own N_ROWS elements.
// Phase 2: LG_WG_SIZE rounds of a tree reduction through sh_scratch, after
//          which invocation 0 holds the aggregate of the whole workgroup.
void main() {
    uint ix = gl_GlobalInvocationID.x * N_ROWS;

    // Element count of the bound range of inbuf. Reads at or beyond it are
    // skipped and contribute the identity (element == 0), so a dispatch that
    // covers more than the buffer holds never reads out of bounds.
    uint n_in = uint(inbuf.length());

    // Phase 1: start from the identity and fold the in-range elements. For a
    // fully in-range row this yields the same sum as seeding with inbuf[ix].
    Monoid agg = Monoid(0u);
    for (uint i = 0; i < N_ROWS; i++) {
        if (ix + i < n_in) {
            agg = combine_monoid(agg, inbuf[ix + i]);
        }
    }

    // Phase 2: publish the per-invocation partial, then combine with the
    // slot 1, 2, 4, ... positions to the right in successive rounds.
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        // Wait until every invocation has stored its partial for this round.
        barrier();
        // We could make this predicate tighter, but would it help?
        if (gl_LocalInvocationID.x + (1 << i) < WG_SIZE) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x + (1 << i)];
            agg = combine_monoid(agg, other);
        }
        // Wait until every invocation has finished reading before any slot
        // is overwritten with the next round's value.
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }

    // Only invocation 0 writes the workgroup's result.
    if (gl_LocalInvocationID.x == 0) {
        outbuf[gl_WorkGroupID.x] = agg;
    }
}
|