diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index b0e4779..8bb9a4d 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -94,7 +94,7 @@ State combine_state(State a, State b) {
     return c;
 }
 
-State map_element(ElementRef ref, inout bool is_fill) {
+State map_element(ElementRef ref) {
     // TODO: it would *probably* be more efficient to make the memory read patterns less
     // divergent, though it would be more wasted memory.
     uint tag = Element_tag(ref);
@@ -106,7 +106,6 @@ State map_element(ElementRef ref, inout bool is_fill) {
     c.flags = 0;
     c.path_count = 0;
     c.pathseg_count = 0;
-    is_fill = false;
     switch (tag) {
     case Element_FillLine:
     case Element_StrokeLine:
@@ -132,8 +131,6 @@ State map_element(ElementRef ref, inout bool is_fill) {
     case Element_Fill:
     case Element_FillMask:
     case Element_FillMaskInv:
-        is_fill = true;
-        // fall-through
     case Element_Stroke:
         c.flags = FLAG_RESET_BBOX;
         c.path_count = 1;
@@ -185,12 +182,11 @@ void main() {
     uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
-    bool is_fill;
-    th_state[0] = map_element(ref, is_fill);
+    th_state[0] = map_element(ref);
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
     }
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
@@ -247,6 +243,9 @@ void main() {
     if (part_ix != 0) {
         // step 4 of paper: decoupled lookback
         uint look_back_ix = part_ix - 1;
+
+        State their_agg;
+        uint their_ix = 0;
         while (true) {
             flag = state[state_flag_index(look_back_ix)];
             if (flag == FLAG_PREFIX_READY) {
@@ -254,11 +253,33 @@ void main() {
                 exclusive = combine_state(their_prefix, exclusive);
                 break;
             } else if (flag == FLAG_AGGREGATE_READY) {
-                State their_agg = State_read(state_aggregate_ref(look_back_ix));
+                their_agg = State_read(state_aggregate_ref(look_back_ix));
                 exclusive = combine_state(their_agg, exclusive);
                 look_back_ix--;
+                their_ix = 0;
+                continue;
             }
             // else spin
+
+            // Unfortunately there's no guarantee of forward progress of other
+            // workgroups, so compute a bit of the aggregate before trying again.
+            // In the worst case, spinning stops when the aggregate is complete.
+            ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
+            State s = map_element(ref);
+            if (their_ix == 0) {
+                their_agg = s;
+            } else {
+                their_agg = combine_state(their_agg, s);
+            }
+            their_ix++;
+            if (their_ix == PARTITION_SIZE) {
+                exclusive = combine_state(their_agg, exclusive);
+                if (look_back_ix == 0) {
+                    break;
+                }
+                look_back_ix--;
+                their_ix = 0;
+            }
         }
 
         // step 5 of paper: compute inclusive prefix
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 599465a..fed60fb 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
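
Reviewer note, not part of the patch: the substantive change is in the decoupled look-back loop (the "step 4 of paper" comments appear to reference Merrill and Garland's single-pass prefix scan with decoupled look-back). Vulkan makes no forward-progress guarantee between workgroups, so spinning on a predecessor's flag can livelock if the predecessor's workgroup is never scheduled. The patch turns each spin iteration into useful work: it redundantly recomputes one element of the stalled partition's aggregate, so the wait is bounded by PARTITION_SIZE iterations per predecessor even if that partition never publishes. Below is a minimal sketch of the same pattern reduced to a plain additive scan of uints; the buffer names, bindings, FLAG_* encoding, and single-thread look-back are illustrative assumptions, not piet-gpu's actual state layout, and a production version would also need coherent/atomic semantics for the flag protocol.

#version 450

layout(local_size_x = 1) in;

#define PARTITION_SIZE 512

// Hypothetical per-partition flag protocol, analogous to the shader's.
#define FLAG_NOT_READY       0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY    2

layout(std430, set = 0, binding = 0) readonly buffer Inputs { uint inputs[]; };
layout(std430, set = 0, binding = 1) volatile buffer Flags { uint flag[]; };
layout(std430, set = 0, binding = 2) volatile buffer Aggregates { uint aggregate[]; };
layout(std430, set = 0, binding = 3) volatile buffer Prefixes { uint prefix[]; };

void main() {
    uint part_ix = gl_WorkGroupID.x;
    // ... scan this partition locally, publish aggregate[part_ix], then set
    // flag[part_ix] to FLAG_AGGREGATE_READY (PREFIX_READY for partition 0) ...

    uint exclusive = 0;
    if (part_ix != 0) {
        uint look_back_ix = part_ix - 1;
        uint their_agg = 0;
        uint their_ix = 0;
        while (true) {
            uint f = flag[look_back_ix];
            if (f == FLAG_PREFIX_READY) {
                // Predecessor published its full prefix: look-back is done.
                exclusive = prefix[look_back_ix] + exclusive;
                break;
            } else if (f == FLAG_AGGREGATE_READY) {
                // Fold in its aggregate and keep walking left. Partition 0
                // publishes PREFIX_READY directly, so this can't underflow.
                exclusive = aggregate[look_back_ix] + exclusive;
                look_back_ix--;
                their_agg = 0;
                their_ix = 0;
                continue;
            }
            // Flag not ready. Instead of spinning, recompute one element of
            // the stalled partition's aggregate, so the loop terminates even
            // if that workgroup never runs again.
            their_agg += inputs[look_back_ix * PARTITION_SIZE + their_ix];
            their_ix++;
            if (their_ix == PARTITION_SIZE) {
                // Rebuilt the predecessor's entire aggregate ourselves.
                exclusive = their_agg + exclusive;
                if (look_back_ix == 0) {
                    break;
                }
                look_back_ix--;
                their_agg = 0;
                their_ix = 0;
            }
        }
    }
    // ... combine exclusive with the local scan and publish prefix[part_ix],
    // then set flag[part_ix] to FLAG_PREFIX_READY ...
}

The trade-off is redundant computation instead of blocking: in the worst case a workgroup serially re-scans everything to its left, but termination then depends only on its own progress, not on the scheduler.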