mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 20:51:29 +11:00
Merge pull request #120 from linebender/element_barrier
Add memory barrier to elements shader
This commit is contained in:
commit
b0b0f33c3c
|
@ -175,6 +175,7 @@ shared State sh_state[WG_SIZE];
|
||||||
|
|
||||||
shared uint sh_part_ix;
|
shared uint sh_part_ix;
|
||||||
shared State sh_prefix;
|
shared State sh_prefix;
|
||||||
|
shared uint sh_flag;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
State th_state[N_ROWS];
|
State th_state[N_ROWS];
|
||||||
|
@ -219,38 +220,56 @@ void main() {
|
||||||
|
|
||||||
// Publish aggregate for this partition
|
// Publish aggregate for this partition
|
||||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
// Note: with memory model, we'd want to generate the atomic store version of this.
|
|
||||||
State_write(state_aggregate_ref(part_ix), agg);
|
State_write(state_aggregate_ref(part_ix), agg);
|
||||||
|
if (part_ix == 0) {
|
||||||
|
State_write(state_prefix_ref(part_ix), agg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// Write flag with release semantics; this is done portably with a barrier.
|
||||||
memoryBarrierBuffer();
|
memoryBarrierBuffer();
|
||||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
uint flag = FLAG_AGGREGATE_READY;
|
uint flag = FLAG_AGGREGATE_READY;
|
||||||
if (part_ix == 0) {
|
if (part_ix == 0) {
|
||||||
State_write(state_prefix_ref(part_ix), agg);
|
|
||||||
flag = FLAG_PREFIX_READY;
|
flag = FLAG_PREFIX_READY;
|
||||||
}
|
}
|
||||||
state[state_flag_index(part_ix)] = flag;
|
state[state_flag_index(part_ix)] = flag;
|
||||||
if (part_ix != 0) {
|
}
|
||||||
// step 4 of paper: decoupled lookback
|
if (part_ix != 0) {
|
||||||
uint look_back_ix = part_ix - 1;
|
// step 4 of paper: decoupled lookback
|
||||||
|
uint look_back_ix = part_ix - 1;
|
||||||
|
|
||||||
State their_agg;
|
State their_agg;
|
||||||
uint their_ix = 0;
|
uint their_ix = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
flag = state[state_flag_index(look_back_ix)];
|
// Read flag with acquire semantics.
|
||||||
if (flag == FLAG_PREFIX_READY) {
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
|
sh_flag = state[state_flag_index(look_back_ix)];
|
||||||
|
}
|
||||||
|
// The flag load is done only in the last thread. However, because the
|
||||||
|
// translation of memoryBarrierBuffer to Metal requires uniform control
|
||||||
|
// flow, we broadcast it to all threads.
|
||||||
|
barrier();
|
||||||
|
memoryBarrierBuffer();
|
||||||
|
uint flag = sh_flag;
|
||||||
|
|
||||||
|
if (flag == FLAG_PREFIX_READY) {
|
||||||
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
State their_prefix = State_read(state_prefix_ref(look_back_ix));
|
State their_prefix = State_read(state_prefix_ref(look_back_ix));
|
||||||
exclusive = combine_state(their_prefix, exclusive);
|
exclusive = combine_state(their_prefix, exclusive);
|
||||||
break;
|
}
|
||||||
} else if (flag == FLAG_AGGREGATE_READY) {
|
break;
|
||||||
|
} else if (flag == FLAG_AGGREGATE_READY) {
|
||||||
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
their_agg = State_read(state_aggregate_ref(look_back_ix));
|
their_agg = State_read(state_aggregate_ref(look_back_ix));
|
||||||
exclusive = combine_state(their_agg, exclusive);
|
exclusive = combine_state(their_agg, exclusive);
|
||||||
look_back_ix--;
|
|
||||||
their_ix = 0;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
// else spin
|
look_back_ix--;
|
||||||
|
their_ix = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// else spin
|
||||||
|
|
||||||
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
// Unfortunately there's no guarantee of forward progress of other
|
// Unfortunately there's no guarantee of forward progress of other
|
||||||
// workgroups, so compute a bit of the aggregate before trying again.
|
// workgroups, so compute a bit of the aggregate before trying again.
|
||||||
// In the worst case, spinning stops when the aggregate is complete.
|
// In the worst case, spinning stops when the aggregate is complete.
|
||||||
|
@ -265,22 +284,29 @@ void main() {
|
||||||
if (their_ix == PARTITION_SIZE) {
|
if (their_ix == PARTITION_SIZE) {
|
||||||
exclusive = combine_state(their_agg, exclusive);
|
exclusive = combine_state(their_agg, exclusive);
|
||||||
if (look_back_ix == 0) {
|
if (look_back_ix == 0) {
|
||||||
break;
|
sh_flag = FLAG_PREFIX_READY;
|
||||||
|
} else {
|
||||||
|
look_back_ix--;
|
||||||
|
their_ix = 0;
|
||||||
}
|
}
|
||||||
look_back_ix--;
|
|
||||||
their_ix = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
barrier();
|
||||||
// step 5 of paper: compute inclusive prefix
|
flag = sh_flag;
|
||||||
|
if (flag == FLAG_PREFIX_READY) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// step 5 of paper: compute inclusive prefix
|
||||||
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
State inclusive_prefix = combine_state(exclusive, agg);
|
State inclusive_prefix = combine_state(exclusive, agg);
|
||||||
sh_prefix = exclusive;
|
sh_prefix = exclusive;
|
||||||
State_write(state_prefix_ref(part_ix), inclusive_prefix);
|
State_write(state_prefix_ref(part_ix), inclusive_prefix);
|
||||||
}
|
}
|
||||||
}
|
memoryBarrierBuffer();
|
||||||
memoryBarrierBuffer();
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
if (gl_LocalInvocationID.x == WG_SIZE - 1 && part_ix != 0) {
|
state[state_flag_index(part_ix)] = FLAG_PREFIX_READY;
|
||||||
state[state_flag_index(part_ix)] = FLAG_PREFIX_READY;
|
}
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
if (part_ix != 0) {
|
if (part_ix != 0) {
|
||||||
|
|
Binary file not shown.
Loading…
Reference in a new issue