Optimize tile allocation

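Use a parallel scheme to allocate and zero out tiles: each workgroup computes a prefix sum of its per-path tile counts in shared memory, reserves the whole range with a single atomic add, and then zeroes the allocated tiles with all threads of the workgroup working in an interleaved pattern.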
2025-01-09 12:21:31 +11:00 · 2020-06-03 13:04:52 -07:00 · 2020-06-03 13:04:52 -07:00 · ff8cee059c
parent 70a9c17e23
commit ff8cee059c
3 changed files with 35 additions and 10 deletions
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@ -210,7 +210,7 @@ fn main() -> Result<(), Error> {

        /*
        let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+        device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
        piet_gpu::dump_k1_data(&data);
        //trace_ptcl(&data);
        */
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@ -5,7 +5,8 @@

 #include "setup.h"

-#define TILE_ALLOC_WG 32
+#define LG_TILE_ALLOC_WG 5
+#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)

 layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;

@ -30,7 +31,11 @@ layout(set = 0, binding = 2) buffer TileBuf {
 #define SX (1.0 / float(TILE_WIDTH_PX))
 #define SY (1.0 / float(TILE_HEIGHT_PX))

+shared uint sh_tile_count[TILE_ALLOC_WG];
+shared uint sh_tile_alloc;
+
 void main() {
+    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathRef path_ref = PathRef(element_ix * Path_size);
    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
@ -59,15 +64,35 @@ void main() {

    Path path;
    path.bbox = uvec4(x0, y0, x1, y1);
-    uint n_tiles = (x1 - x0) * (y1 - y0);
-    path.tiles = TileRef(0);
-    if (n_tiles > 0) {
-        path.tiles.offset = atomicAdd(alloc, n_tiles * Tile_size);
-        Tile init_tile = Tile(TileSegRef(0), 0);
-        // TODO: improve load balancing
-        for (uint i = 0; i < n_tiles; i++) {
-            Tile_write(Tile_index(path.tiles, i), init_tile);
+    uint tile_count = (x1 - x0) * (y1 - y0);
+    uint n_tiles = tile_count;
+
+    sh_tile_count[th_ix] = tile_count;
+    // Prefix sum of sh_tile_count
+    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
+        barrier();
+        if (th_ix >= (1 << i)) {
+            tile_count += sh_tile_count[th_ix - (1 << i)];
        }
+        barrier();
+        sh_tile_count[th_ix] = tile_count;
    }
+    if (th_ix == TILE_ALLOC_WG - 1) {
+        sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
+    }
+    barrier();
+    uint alloc_start = sh_tile_alloc;
+
+    uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
+    path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
    Path_write(path_ref, path);
+
+    // Zero out allocated tiles efficiently
+    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
+    uint start_ix = alloc_start >> 2;
+    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
+        // Note: this interleaving is faster than using Tile_write
+        // by a significant amount.
+        tile[start_ix + i] = 0;
+    }
 }
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/tile_alloc.spv