Optimize tile allocation

Use a parallel scheme to zero out tiles.
2025-01-09 20:31:29 +11:00 · 2020-06-03 13:04:52 -07:00 · 2020-06-03 13:04:52 -07:00 · ff8cee059c
parent 70a9c17e23
commit ff8cee059c
3 changed files with 35 additions and 10 deletions
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@ -210,7 +210,7 @@ fn main() -> Result<(), Error> {
        /*
        let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+        device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
        piet_gpu::dump_k1_data(&data);
        //trace_ptcl(&data);
        */
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@ -5,7 +5,8 @@
 #include "setup.h"
-#define TILE_ALLOC_WG 32
+#define LG_TILE_ALLOC_WG 5
 #define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
 layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
@ -30,7 +31,11 @@ layout(set = 0, binding = 2) buffer TileBuf {
 #define SX (1.0 / float(TILE_WIDTH_PX))
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 shared uint sh_tile_count[TILE_ALLOC_WG];
 shared uint sh_tile_alloc;
 void main() {
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathRef path_ref = PathRef(element_ix * Path_size);
    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
@ -59,15 +64,35 @@ void main() {
    Path path;
    path.bbox = uvec4(x0, y0, x1, y1);
-    uint n_tiles = (x1 - x0) * (y1 - y0);
+    uint tile_count = (x1 - x0) * (y1 - y0);
-    path.tiles = TileRef(0);
+    uint n_tiles = tile_count;
-    if (n_tiles > 0) {
+
-        path.tiles.offset = atomicAdd(alloc, n_tiles * Tile_size);
+    sh_tile_count[th_ix] = tile_count;
-        Tile init_tile = Tile(TileSegRef(0), 0);
+    // Prefix sum of sh_tile_count
-        // TODO: improve load balancing
+    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
-        for (uint i = 0; i < n_tiles; i++) {
+        barrier();
-            Tile_write(Tile_index(path.tiles, i), init_tile);
+        if (th_ix >= (1 << i)) {
            tile_count += sh_tile_count[th_ix - (1 << i)];
        }
        barrier();
        sh_tile_count[th_ix] = tile_count;
    }
    if (th_ix == TILE_ALLOC_WG - 1) {
        sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
    }
    barrier();
    uint alloc_start = sh_tile_alloc;
    uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
    path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
    Path_write(path_ref, path);
    // Zero out allocated tiles efficiently
    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
    uint start_ix = alloc_start >> 2;
    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
        // Note: this interleaving is faster than using Tile_write
        // by a significant amount.
        tile[start_ix + i] = 0;
    }
 }
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/tile_alloc.spv