Fix prefix sum

First, add decoupled lookback. Second, fix a problem with the monoid, which was overly aggressive in resetting the bbox.
2025-01-10 12:41:30 +11:00 · 2020-05-15 20:09:39 -07:00 · 2020-05-15 20:09:39 -07:00 · 93044b469b
parent 868b0320a4
commit 93044b469b
4 changed files with 84 additions and 15 deletions
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
        piet_gpu::dump_k1_data(&data);

        let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+        device.read_buffer(&renderer.anno_buf, &mut data).unwrap();
        piet_gpu::dump_k1_data(&data);
        */

--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@ -18,10 +18,10 @@ layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
 };

-// This will be used for inter-workgroup aggregates. In the
-// meantime, for development, it has been used to store the
-// scan of the state objects.
-layout(set = 0, binding = 1) buffer StateBuf {
+// It would be better to use the Vulkan memory model than
+// "volatile" but shooting for compatibility here rather
+// than doing things right.
+layout(set = 0, binding = 1) volatile buffer StateBuf {
    uint[] state;
 };

@ -34,8 +34,28 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
 #include "state.h"
 #include "annotated.h"

+#define StateBuf_stride (4 + 2 * State_size)
+
+StateRef state_aggregate_ref(uint partition_ix) { // Ref to the slot where main() writes/reads partition `partition_ix`'s aggregate State.
+    return StateRef(8 + partition_ix * StateBuf_stride); // Fixed offset of 8, then one StateBuf_stride per partition (presumably byte offsets -- confirm against state.h).
+}
+
+StateRef state_prefix_ref(uint partition_ix) { // Ref to the slot where main() writes/reads partition `partition_ix`'s inclusive prefix State.
+    return StateRef(8 + partition_ix * StateBuf_stride + State_size); // Same base as state_aggregate_ref, advanced by State_size, i.e. the slot right after the aggregate.
+}
+
+uint state_flag_index(uint partition_ix) { // Index into the `state` uint array of partition `partition_ix`'s status flag (FLAG_NOT_READY / FLAG_AGGREGATE_READY / FLAG_PREFIX_READY).
+    return 1 + partition_ix * (StateBuf_stride / 4); // state[0] is the atomic partition counter, so flags start at 1; the /4 presumably converts a byte stride to a uint stride -- confirm.
+}
+
+// These correspond to X, A, P respectively in the prefix sum paper.
+#define FLAG_NOT_READY 0
+#define FLAG_AGGREGATE_READY 1
+#define FLAG_PREFIX_READY 2
+
 #define FLAG_SET_LINEWIDTH 1
-#define FLAG_RESET_BBOX 2
+#define FLAG_SET_BBOX 2
+#define FLAG_RESET_BBOX 4

 // This is almost like a monoid (the interaction between transformation and
 // bounding boxes is approximate)
@ -47,7 +67,9 @@ State combine_state(State a, State b) {
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        c.bbox = a.bbox;
-    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
+    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
+        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
+    {
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
@ -59,7 +81,7 @@ State combine_state(State a, State b) {
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
-    c.flags = a.flags | b.flags;
+    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
    return c;
 }

@ -122,10 +144,18 @@ shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
 shared uint sh_flags[WG_SIZE];

+shared uint sh_tile_ix;
+shared State sh_prefix;
+
 void main() {
    State th_state[N_ROWS];
-    // this becomes an atomic counter
-    uint tile_ix = gl_WorkGroupID.x;
+    // Determine partition to process by atomic counter (described in Section
+    // 4.4 of prefix sum paper).
+    if (gl_LocalInvocationID.x == 0) {
+        sh_tile_ix = atomicAdd(state[0], 1);
+    }
+    barrier();
+    uint tile_ix = sh_tile_ix;

    uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);
@ -162,16 +192,54 @@ void main() {
        sh_flags[gl_LocalInvocationID.x] = agg.flags;
    }

-    // TODO: if last invocation in wg, publish agg.
-
-    barrier();
    State exclusive;
    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; //TODO should be 0.0
    exclusive.flags = 0;
-    // TODO: do decoupled look-back
+
+    // Publish aggregate for this partition
+    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+        // Note: with memory model, we'd want to generate the atomic store version of this.
+        State_write(state_aggregate_ref(tile_ix), agg);
+        uint flag = FLAG_AGGREGATE_READY;
+        memoryBarrierBuffer();
+        if (tile_ix == 0) {
+            State_write(state_prefix_ref(tile_ix), agg);
+            flag = FLAG_PREFIX_READY;
+        }
+        state[state_flag_index(tile_ix)] = flag;
+        if (tile_ix != 0) {
+            // step 4 of paper: decoupled lookback
+            uint look_back_ix = tile_ix - 1;
+            while (true) {
+                flag = state[state_flag_index(look_back_ix)];
+                if (flag == FLAG_PREFIX_READY) {
+                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
+                    exclusive = combine_state(their_prefix, exclusive);
+                    break;
+                } else if (flag == FLAG_AGGREGATE_READY) {
+                    State their_agg = State_read(state_aggregate_ref(look_back_ix));
+                    exclusive = combine_state(their_agg, exclusive);
+                    look_back_ix--;
+                }
+                // else spin
+            }
+
+            // step 5 of paper: compute inclusive prefix
+            State inclusive_prefix = combine_state(exclusive, agg);
+            sh_prefix = exclusive;
+            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
+            memoryBarrierBuffer();
+            flag = FLAG_PREFIX_READY;
+            state[state_flag_index(tile_ix)] = flag;
+        }
+    }
+    barrier();
+    if (tile_ix != 0) {
+        exclusive = sh_prefix;
+    }

    State row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -158,7 +158,7 @@ impl<D: Device> Renderer<D> {
            .unwrap();
        device.write_buffer(&scene_buf, &scene)?;

-        let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
+        let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
        let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
@ -245,6 +245,7 @@ impl<D: Device> Renderer<D> {
        cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
        cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
        cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
+        cmd_buf.clear_buffer(&self.state_buf);
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(
            &self.image_dev,