Mirror of https://github.com/italicsjenga/vello.git, synced 2025-01-10 20:51:29 +11:00
Merge pull request #120 from linebender/element_barrier

Add memory barrier to elements shader

Commit b0b0f33c3c
@@ -175,6 +175,7 @@ shared State sh_state[WG_SIZE];
 
 shared uint sh_part_ix;
 shared State sh_prefix;
+shared uint sh_flag;
 
 void main() {
     State th_state[N_ROWS];
@@ -219,17 +220,20 @@ void main() {
 
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        // Note: with memory model, we'd want to generate the atomic store version of this.
         State_write(state_aggregate_ref(part_ix), agg);
+        if (part_ix == 0) {
+            State_write(state_prefix_ref(part_ix), agg);
+        }
     }
+    // Write flag with release semantics; this is done portably with a barrier.
     memoryBarrierBuffer();
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
         uint flag = FLAG_AGGREGATE_READY;
         if (part_ix == 0) {
-            State_write(state_prefix_ref(part_ix), agg);
             flag = FLAG_PREFIX_READY;
         }
         state[state_flag_index(part_ix)] = flag;
+    }
     if (part_ix != 0) {
         // step 4 of paper: decoupled lookback
         uint look_back_ix = part_ix - 1;
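The hunk above is the release half of the handshake: the aggregate (and, for partition 0, the prefix) is written first, memoryBarrierBuffer() then executes in uniform control flow, and only afterwards is the flag raised. Below is a minimal, self-contained sketch of that ordering in Vulkan-style GLSL with a plain uint payload in place of the State struct; the buffer blocks, binding numbers, flag value, and payload are invented for illustration and are not the actual shader code.

#version 450
// Sketch only: publish a payload, then a flag, with the buffer barrier
// executed by every invocation rather than inside the divergent branch.

#define FLAG_AGGREGATE_READY 1 // placeholder value for the sketch

layout(local_size_x = 64) in;

layout(std430, set = 0, binding = 0) buffer Aggregates { uint aggregate[]; };
layout(std430, set = 0, binding = 1) buffer Flags { uint flag[]; };

void main() {
    uint part_ix = gl_WorkGroupID.x;
    // Only the last invocation writes the payload for this partition...
    if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1) {
        aggregate[part_ix] = part_ix + 1u; // stand-in for State_write(...)
    }
    // ...but the release barrier runs in uniform control flow, which the
    // Metal translation of memoryBarrierBuffer requires (see the next hunk).
    memoryBarrierBuffer();
    if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1) {
        flag[part_ix] = FLAG_AGGREGATE_READY;
    }
}

A consumer that observes the flag after an acquire-style barrier can then rely on seeing the aggregate written before it, which is the property the look-back loop in the next hunk depends on.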
@@ -237,20 +241,35 @@ void main() {
         State their_agg;
         uint their_ix = 0;
         while (true) {
-            flag = state[state_flag_index(look_back_ix)];
+            // Read flag with acquire semantics.
+            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+                sh_flag = state[state_flag_index(look_back_ix)];
+            }
+            // The flag load is done only in the last thread. However, because the
+            // translation of memoryBarrierBuffer to Metal requires uniform control
+            // flow, we broadcast it to all threads.
+            barrier();
+            memoryBarrierBuffer();
+            uint flag = sh_flag;
+
             if (flag == FLAG_PREFIX_READY) {
+                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                     State their_prefix = State_read(state_prefix_ref(look_back_ix));
                     exclusive = combine_state(their_prefix, exclusive);
+                }
                 break;
             } else if (flag == FLAG_AGGREGATE_READY) {
+                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                     their_agg = State_read(state_aggregate_ref(look_back_ix));
                     exclusive = combine_state(their_agg, exclusive);
+                }
                 look_back_ix--;
                 their_ix = 0;
                 continue;
             }
             // else spin
 
+            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                 // Unfortunately there's no guarantee of forward progress of other
                 // workgroups, so compute a bit of the aggregate before trying again.
                 // In the worst case, spinning stops when the aggregate is complete.
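This hunk is the acquire half plus the broadcast: only the last invocation loads the predecessor's flag, the value is shared with the whole workgroup through the new sh_flag variable, and barrier() plus memoryBarrierBuffer() sit in uniform control flow, as the new comment explains. A minimal sketch of the same pattern follows; the buffer names, bindings, flag value, and the uint payload standing in for State_read are all invented for the sketch.

#version 450
// Sketch only: one invocation polls a flag, the value is broadcast through
// shared memory, and the dependent payload read is ordered after the flag.

#define FLAG_AGGREGATE_READY 1 // placeholder value for the sketch

layout(local_size_x = 64) in;

layout(std430, set = 0, binding = 0) buffer Flags { uint flag[]; };
layout(std430, set = 0, binding = 1) buffer Aggregates { uint aggregate[]; };
layout(std430, set = 0, binding = 2) buffer Results { uint result[]; };

shared uint sh_flag;

void main() {
    uint look_back_ix = gl_WorkGroupID.x; // stand-in for part_ix - 1
    if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1) {
        sh_flag = flag[look_back_ix];
    }
    // barrier() makes the shared write visible to the workgroup and keeps
    // execution together; memoryBarrierBuffer() orders the flag load before
    // the payload load below. Both are executed by all invocations.
    barrier();
    memoryBarrierBuffer();
    uint f = sh_flag;
    if (f == FLAG_AGGREGATE_READY) {
        // Safe to read what the producer published before raising the flag.
        result[gl_GlobalInvocationID.x] = aggregate[look_back_ix];
    }
}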
@@ -265,23 +284,30 @@ void main() {
                 if (their_ix == PARTITION_SIZE) {
                     exclusive = combine_state(their_agg, exclusive);
                     if (look_back_ix == 0) {
-                        break;
-                    }
+                        sh_flag = FLAG_PREFIX_READY;
+                    } else {
                         look_back_ix--;
                         their_ix = 0;
                     }
                 }
+            }
+            barrier();
+            flag = sh_flag;
+            if (flag == FLAG_PREFIX_READY) {
+                break;
+            }
         }
         // step 5 of paper: compute inclusive prefix
+        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
            State inclusive_prefix = combine_state(exclusive, agg);
            sh_prefix = exclusive;
            State_write(state_prefix_ref(part_ix), inclusive_prefix);
        }
-    }
        memoryBarrierBuffer();
-       if (gl_LocalInvocationID.x == WG_SIZE - 1 && part_ix != 0) {
+       if (gl_LocalInvocationID.x == WG_SIZE - 1) {
            state[state_flag_index(part_ix)] = FLAG_PREFIX_READY;
        }
+   }
     barrier();
     if (part_ix != 0) {
         exclusive = sh_prefix;
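Taken together, the last two hunks also change how the look-back loop terminates (the "paper" cited in the comments is presumably the single-pass scan with decoupled look-back by Merrill and Garland): sh_flag carries the observed flag and, once the last invocation has a complete prefix, doubles as the signal for every invocation to leave the loop in the same iteration, after which sh_prefix hands the exclusive prefix to the whole workgroup. Below is a compressed, self-contained sketch of that control flow with a plain uint prefix; the buffer blocks, bindings, flag value, and the empty spin body are placeholders, not the shader's real work.

#version 450
// Sketch only: uniform loop exit via sh_flag and prefix hand-off via
// sh_prefix, reduced to a plain uint instead of the State struct.

#define FLAG_PREFIX_READY 2 // placeholder value for the sketch

layout(local_size_x = 64) in;

layout(std430, set = 0, binding = 0) buffer Flags { uint flag[]; };
layout(std430, set = 0, binding = 1) buffer Prefixes { uint prefix[]; };

shared uint sh_flag;
shared uint sh_prefix;

void main() {
    uint part_ix = gl_WorkGroupID.x;
    uint exclusive = 0u;
    if (part_ix != 0u) {
        uint look_back_ix = part_ix - 1u;
        while (true) {
            // Poll the predecessor's flag in one invocation, broadcast it.
            if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1) {
                sh_flag = flag[look_back_ix];
            }
            barrier();
            memoryBarrierBuffer();
            if (sh_flag == FLAG_PREFIX_READY) {
                if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1) {
                    // Ordered after the flag load by the barrier above.
                    sh_prefix = prefix[look_back_ix];
                }
                break; // every invocation leaves in the same iteration
            }
            // else spin (the real shader computes partial aggregates here);
            // this barrier keeps the next iteration's sh_flag write from
            // racing with this iteration's reads.
            barrier();
        }
    }
    barrier(); // make sh_prefix visible to the whole workgroup
    if (part_ix != 0u) {
        exclusive = sh_prefix;
    }
    // `exclusive` would now seed each invocation's portion of the scan.
}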
Binary file not shown.