Clean up bits of right edge tracking logic left over from sort-middle.

bhmerchant@gmail.com 2020-08-09 16:13:50 -07:00
parent 724c4899f2
commit d836d21d12
5 changed files with 18 additions and 76 deletions


@@ -15,24 +15,18 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-// This is for scanning forward for right_edge data.
-layout(set = 0, binding = 1) buffer StateBuf {
-    uint[] state;
-};
-
-layout(set = 0, binding = 2) buffer AllocBuf {
+layout(set = 0, binding = 1) buffer AllocBuf {
     uint n_elements; // paths
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 3) buffer BinsBuf {
+layout(set = 0, binding = 2) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
-#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
@@ -52,12 +46,6 @@ shared uint sh_chunk_start[N_TILE];
 shared float sh_right_edge[N_TILE];
 
-#define StateBuf_stride (8 + 2 * State_size)
-
-uint state_right_edge_index(uint partition_ix) {
-    return 2 + partition_ix * (StateBuf_stride / 4);
-}
-
 void main() {
     uint chunk_n = 0;
     uint my_n_elements = n_elements;
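
The deleted helper addressed a per-partition word in the element pipeline's state buffer. The layout implied by these indices (inferred from this diff only; the authoritative definitions live in state.h):

    // Inferred StateBuf layout, in 32-bit words (k = partition index):
    //   state[0]                             partition counter (atomicAdd target)
    //   state[1 + k * StateBuf_stride / 4]   per-partition flag (FLAG_*)
    //   state[2 + k * StateBuf_stride / 4]   right edge, as float bits (dropped here)
    //   following words                      aggregate State, then prefix State,
    //   giving StateBuf_stride = 8 + 2 * State_size bytes per partition.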
@@ -103,37 +91,6 @@ void main() {
             break;
         }
 
-/*
-        // If the last element in this partition is a fill edge, then we need to do a
-        // look-forward to find the right edge of its corresponding fill. That data is
-        // recorded in aggregates computed in the element processing pass.
-        if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
-            uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
-            // This is sequential but the expectation is that the amount of
-            // look-forward is small (performance may degrade in the case
-            // of massively complex paths).
-            do {
-                my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
-                aggregate_ix++;
-            } while (isinf(my_right_edge));
-        }
-
-        // Now propagate right_edge backward, from fill to segment.
-        for (uint i = 0; i < LG_N_TILE; i++) {
-            // Note: we could try to cut down on write bandwidth here if the value hasn't
-            // changed, but not sure it's worth the complexity to track.
-            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
-            barrier();
-            if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
-                my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
-            }
-            barrier();
-        }
-
-        if (crosses_edge) {
-            x1 = int(ceil(my_right_edge * SX));
-        }
-*/
         // At this point, we run an iterator over the coverage area,
         // trying to keep divergence low.
         // Right now, it's just a bbox, but we'll get finer with
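
For reference, the deleted (already commented-out) block combined a sequential look-forward into the element pass's aggregates with a log-step backward propagation: in round i, a thread whose right_edge is still infinity pulls a candidate from the lane 2^i to its right, so the nearest finite value reaches every segment after LG_N_TILE rounds. A minimal standalone sketch of that propagation pattern (not part of this commit; names mirror the deleted code):

    shared float sh_right_edge[N_TILE];

    // After LG_N_TILE rounds, every thread that started with +inf holds
    // the nearest finite right_edge from a thread to its right.
    float propagate_right_edge(float my_right_edge) {
        for (uint i = 0; i < LG_N_TILE; i++) {
            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
            barrier();
            uint src = gl_LocalInvocationID.x + (1u << i);
            if (src < N_TILE && isinf(my_right_edge)) {
                my_right_edge = sh_right_edge[src];
            }
            barrier();
        }
        return my_right_edge;
    }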

Binary file not shown.


@@ -167,9 +167,7 @@ shared uint sh_flags[WG_SIZE];
 shared uint sh_path_count[WG_SIZE];
 shared uint sh_pathseg_count[WG_SIZE];
 
-shared uint sh_min_fill;
-shared uint sh_tile_ix;
-
+shared uint sh_part_ix;
 shared State sh_prefix;
 
 void main() {
@@ -177,28 +175,21 @@ void main() {
     // Determine partition to process by atomic counter (described in Section
     // 4.4 of prefix sum paper).
     if (gl_LocalInvocationID.x == 0) {
-        sh_tile_ix = atomicAdd(state[0], 1);
-        sh_min_fill = ~0;
+        sh_part_ix = atomicAdd(state[0], 1);
     }
     barrier();
-    uint tile_ix = sh_tile_ix;
-    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint part_ix = sh_part_ix;
+    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
     bool is_fill;
-    uint my_min_fill = ~0;
     th_state[0] = map_element(ref, is_fill);
-    if (is_fill) my_min_fill = ix;
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
         th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
-        if (is_fill && my_min_fill == ~0) {
-            my_min_fill = ix + i;
-        }
     }
-    atomicMin(sh_min_fill, my_min_fill);
 
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
@@ -243,17 +234,17 @@ void main() {
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
         // Note: with memory model, we'd want to generate the atomic store version of this.
-        State_write(state_aggregate_ref(tile_ix), agg);
+        State_write(state_aggregate_ref(part_ix), agg);
         uint flag = FLAG_AGGREGATE_READY;
         memoryBarrierBuffer();
-        if (tile_ix == 0) {
-            State_write(state_prefix_ref(tile_ix), agg);
+        if (part_ix == 0) {
+            State_write(state_prefix_ref(part_ix), agg);
             flag = FLAG_PREFIX_READY;
         }
-        state[state_flag_index(tile_ix)] = flag;
-        if (tile_ix != 0) {
+        state[state_flag_index(part_ix)] = flag;
+        if (part_ix != 0) {
             // step 4 of paper: decoupled lookback
-            uint look_back_ix = tile_ix - 1;
+            uint look_back_ix = part_ix - 1;
             while (true) {
                 flag = state[state_flag_index(look_back_ix)];
                 if (flag == FLAG_PREFIX_READY) {
@@ -271,15 +262,14 @@ void main() {
             // step 5 of paper: compute inclusive prefix
             State inclusive_prefix = combine_state(exclusive, agg);
             sh_prefix = exclusive;
-            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
+            State_write(state_prefix_ref(part_ix), inclusive_prefix);
             memoryBarrierBuffer();
             flag = FLAG_PREFIX_READY;
-            state[state_flag_index(tile_ix)] = flag;
+            state[state_flag_index(part_ix)] = flag;
         }
     }
 
     barrier();
-    my_min_fill = sh_min_fill;
-    if (tile_ix != 0) {
+    if (part_ix != 0) {
         exclusive = sh_prefix;
     }
@@ -296,14 +286,9 @@ void main() {
         other.pathseg_count = sh_pathseg_count[ix];
         row = combine_state(row, other);
     }
-    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
-        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
-    }
     for (uint i = 0; i < N_ROWS; i++) {
         State st = combine_state(row, th_state[i]);
-        if (my_min_fill == ix + i) {
-            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
-        }
+
         // We write the state now for development purposes, but the
         // actual goal is to write transformed and annotated elements.
         //State_write(StateRef((ix + i) * State_size), st);
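
The renames above run through the shader's decoupled look-back spine (steps 4 and 5 of the single-pass prefix sum paper cited in the comments): each partition publishes its aggregate, then walks backward over its predecessors, folding in aggregates until it reaches a published inclusive prefix. The protocol reduced to a plain uint sum (a sketch only; flag_ix, aggregate_ix, and prefix_ix are hypothetical helpers, and the real shader combines full State records):

    uint decoupled_lookback(uint part_ix) {
        uint exclusive = 0;
        uint look_back_ix = part_ix - 1;
        while (true) {
            uint flag = state[flag_ix(look_back_ix)];
            if (flag == FLAG_PREFIX_READY) {
                // The predecessor published an inclusive prefix: done.
                exclusive += state[prefix_ix(look_back_ix)];
                break;
            } else if (flag == FLAG_AGGREGATE_READY) {
                // Only that partition's own sum is ready: fold it in
                // and keep walking toward partition 0.
                exclusive += state[aggregate_ix(look_back_ix)];
                look_back_ix--;
            }
            // Otherwise the predecessor hasn't published yet: spin.
        }
        return exclusive;
    }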

Binary file not shown.


@@ -237,10 +237,10 @@ impl<D: Device> Renderer<D> {
             &[n_paths as u32, 0, bin_alloc_start as u32],
         )?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;
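
The pipeline's buffer count drops from 4 to 3 to match the binning shader once StateBuf is gone, and the buffers passed to create_descriptor_set pair up with the shader's binding indices in declaration order. Read off this diff:

    layout(set = 0, binding = 0) buffer AnnotatedBuf { ... };  // anno_buf
    layout(set = 0, binding = 1) buffer AllocBuf { ... };      // bin_alloc_buf_dev
    layout(set = 0, binding = 2) buffer BinsBuf { ... };       // bin_buf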