Merge pull request #35 from eliasnaur/stop-spin
Limit spinning in elements.comp
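In outline: step 4 of the scan (the decoupled look-back) previously busy-waited on the neighboring partition's flag. Because other workgroups have no forward-progress guarantee, the loop now does useful work while it waits: each iteration re-maps one element of the partition being waited on and folds it into a locally built aggregate, and once PARTITION_SIZE elements have been folded, that local aggregate stands in for the published one and the look-back moves to the previous partition. The inout is_fill parameter of map_element is dropped along the way, which lets the spin path call map_element(ref) with a single argument. The sketch below condenses the loop as it reads after this change; the State and ElementRef types, the FLAG_* values, and helpers such as map_element, combine_state, State_read, state_flag_index, state_prefix_ref, and state_aggregate_ref come from the surrounding shader and are assumed here rather than defined by the sketch.

    // Sketch (GLSL): step-4 look-back with bounded, productive spinning.
    State their_agg;               // partial aggregate of the partition being waited on
    uint their_ix = 0;             // how many of its elements have been folded in
    while (true) {
        flag = state[state_flag_index(look_back_ix)];
        if (flag == FLAG_PREFIX_READY) {
            // The neighbor's inclusive prefix is published: combine and stop.
            State their_prefix = State_read(state_prefix_ref(look_back_ix));
            exclusive = combine_state(their_prefix, exclusive);
            break;
        } else if (flag == FLAG_AGGREGATE_READY) {
            // Only its aggregate is published: fold it in and walk one partition back.
            their_agg = State_read(state_aggregate_ref(look_back_ix));
            exclusive = combine_state(their_agg, exclusive);
            look_back_ix--;
            their_ix = 0;          // restart the local rebuild for the new partition
            continue;
        }
        // Neither flag is set. Rather than spin idly, rebuild one element's worth
        // of the neighbor's aggregate; in the worst case the wait ends when the
        // whole aggregate has been rebuilt locally.
        ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
        State s = map_element(ref);
        if (their_ix == 0) {
            their_agg = s;
        } else {
            their_agg = combine_state(their_agg, s);
        }
        their_ix++;
        if (their_ix == PARTITION_SIZE) {
            // Full aggregate rebuilt locally: use it instead of the published one.
            exclusive = combine_state(their_agg, exclusive);
            if (look_back_ix == 0) {
                break;
            }
            look_back_ix--;
            their_ix = 0;
        }
    }

The trade-off is some redundant work, since elements of earlier partitions may be mapped both by their own workgroup and by a waiting one, in exchange for a wait whose termination no longer depends on other workgroups being scheduled.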
commit d3fe8630be
elements.comp
@@ -94,7 +94,7 @@ State combine_state(State a, State b) {
     return c;
 }
 
-State map_element(ElementRef ref, inout bool is_fill) {
+State map_element(ElementRef ref) {
     // TODO: it would *probably* be more efficient to make the memory read patterns less
     // divergent, though it would be more wasted memory.
     uint tag = Element_tag(ref);
@@ -106,7 +106,6 @@ State map_element(ElementRef ref, inout bool is_fill) {
     c.flags = 0;
     c.path_count = 0;
     c.pathseg_count = 0;
-    is_fill = false;
     switch (tag) {
     case Element_FillLine:
     case Element_StrokeLine:
@@ -132,8 +131,6 @@ State map_element(ElementRef ref, inout bool is_fill) {
     case Element_Fill:
     case Element_FillMask:
     case Element_FillMaskInv:
-        is_fill = true;
-        // fall-through
     case Element_Stroke:
         c.flags = FLAG_RESET_BBOX;
         c.path_count = 1;
@@ -185,12 +182,11 @@ void main() {
     uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
-    bool is_fill;
-    th_state[0] = map_element(ref, is_fill);
+    th_state[0] = map_element(ref);
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
     }
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
@@ -247,6 +243,9 @@ void main() {
     if (part_ix != 0) {
         // step 4 of paper: decoupled lookback
         uint look_back_ix = part_ix - 1;
+
+        State their_agg;
+        uint their_ix = 0;
         while (true) {
             flag = state[state_flag_index(look_back_ix)];
             if (flag == FLAG_PREFIX_READY) {
@@ -254,11 +253,33 @@ void main() {
                 exclusive = combine_state(their_prefix, exclusive);
                 break;
             } else if (flag == FLAG_AGGREGATE_READY) {
-                State their_agg = State_read(state_aggregate_ref(look_back_ix));
+                their_agg = State_read(state_aggregate_ref(look_back_ix));
                 exclusive = combine_state(their_agg, exclusive);
                 look_back_ix--;
+                their_ix = 0;
+                continue;
             }
             // else spin
+
+            // Unfortunately there's no guarantee of forward progress of other
+            // workgroups, so compute a bit of the aggregate before trying again.
+            // In the worst case, spinning stops when the aggregate is complete.
+            ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
+            State s = map_element(ref);
+            if (their_ix == 0) {
+                their_agg = s;
+            } else {
+                their_agg = combine_state(their_agg, s);
+            }
+            their_ix++;
+            if (their_ix == PARTITION_SIZE) {
+                exclusive = combine_state(their_agg, exclusive);
+                if (look_back_ix == 0) {
+                    break;
+                }
+                look_back_ix--;
+                their_ix = 0;
+            }
         }
 
         // step 5 of paper: compute inclusive prefix