Binning stage

Adds a binning stage. This is a first draft, and a number of loose ends exist.
2025-01-10 12:41:30 +11:00 · 2020-05-12 13:38:26 -07:00 · 2020-05-12 13:38:26 -07:00 · 343e4c3075
parent 736f883f66
commit 343e4c3075
11 changed files with 322 additions and 11 deletions
--- a/piet-gpu-types/src/bins.rs
+++ b/piet-gpu-types/src/bins.rs
@ -0,0 +1,19 @@
 use piet_gpu_derive::piet_gpu;
 // The output of the binning stage, organized as a linked list of chunks.
 piet_gpu! {
    // NOTE(review): `#[gpu_write]` presumably asks the derive to emit write
    // helpers in addition to readers (bins.h has both) — confirm in piet-gpu-derive.
    #[gpu_write]
    mod bins {
        // One entry of a bin: the index of an element that covers that bin.
        struct BinInstance {
            element_ix: u32,
        }
        // Header of one chunk in a bin's linked list of chunks.
        struct BinChunk {
            // Number of BinInstance records stored in this chunk.
            // First chunk can have n = 0, subsequent ones not.
            n: u32,
            // Reference to the next chunk; the binning shader writes 0 here
            // for the final chunk of a list.
            next: Ref<BinChunk>,
            // Instances follow
        }
    }
 }
--- a/piet-gpu-types/src/lib.rs
+++ b/piet-gpu-types/src/lib.rs
@ -1,4 +1,7 @@
 // Structures used only internally probably don't need to be pub.
 pub mod annotated;
 pub mod bins;
 pub mod encoder;
 pub mod fill_seg;
 pub mod ptcl;
--- a/piet-gpu-types/src/main.rs
+++ b/piet-gpu-types/src/main.rs
@ -7,6 +7,7 @@ fn main() {
        "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
        "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
        "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
        "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
        "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
        "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
        "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@ -41,7 +41,7 @@ fn main() -> Result<(), Error> {
        let fence = device.create_fence(false)?;
        let mut cmd_buf = device.create_cmd_buf()?;
-        let query_pool = device.create_query_pool(2)?;
+        let query_pool = device.create_query_pool(3)?;
        let mut ctx = PietGpuRenderContext::new();
        render_scene(&mut ctx);
@ -58,13 +58,14 @@ fn main() -> Result<(), Error> {
        cmd_buf.finish();
        device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
        device.wait_and_reset(&[fence])?;
-        let timestamps = device.reap_query_pool(&query_pool).unwrap();
+        let ts = device.reap_query_pool(&query_pool).unwrap();
-        println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3);
+        println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
        println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
        /*
-        let mut data: Vec<u8> = Default::default();
+        let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.state_buf, &mut data).unwrap();
+        device.read_buffer(&renderer.bin_buf, &mut data).unwrap();
-        dump_state(&data);
+        piet_gpu::dump_k1_data(&data);
        */
        let mut img_data: Vec<u8> = Default::default();
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@ -0,0 +1,169 @@
 // The binning stage of the pipeline.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 #define N_ROWS 4
 #define WG_SIZE 32
 #define LG_WG_SIZE 5
 #define TILE_SIZE (WG_SIZE * N_ROWS)
 // TODO: move these to setup file
 #define N_TILE_X 16
 #define N_TILE_Y 16
 #define N_TILE (N_TILE_X * N_TILE_Y)
 #define N_SLICE (N_TILE / 32)
 #define N_WG 16 // Number of workgroups, should be 1 per SM
 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 // Input: annotated elements as raw 32-bit words.
 // NOTE(review): presumably read through the Annotated_* helpers from
 // annotated.h (not shown here) — confirm.
 layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint[] annotated;
 };
 // Small control block shared by all workgroups.
 layout(set = 0, binding = 1) buffer AllocBuf {
    // Number of elements to bin; main() stops once the claimed batch starts
    // at or beyond this count.
    uint n_elements;
    // Will be incremented atomically to claim tiles
    uint tile_ix;
    // Bump-allocation cursor (byte offset into `bins`); main() advances it by
    // BIN_ALLOC with atomicAdd whenever a bin needs a fresh chunk.
    uint alloc;
 };
 // Output: BinChunk headers and BinInstance records as raw 32-bit words.
 // The accessors in bins.h take byte offsets and index this array at offset >> 2.
 layout(set = 0, binding = 2) buffer BinsBuf {
    uint[] bins;
 };
 #include "annotated.h"
 #include "bins.h"
 #include "setup.h"
 // scale factors useful for converting coordinates to bins
 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
 #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint sh_my_tile;
 // Binning entry point.
 //
 // The workgroup repeatedly claims a batch of N_TILE consecutive elements by
 // atomically incrementing `tile_ix`. Within a batch every invocation plays
 // two roles:
 //   1. as element loader: it reads element `batch * N_TILE + local_id`,
 //      computes its bounding box in bin coordinates and sets its bit in the
 //      shared bitmap of every bin the box covers;
 //   2. as bin owner: it scans the bitmap column for bin `local_id` and
 //      appends one BinInstance per set bit to that bin's chunk list.
 // Each (bin, workgroup) pair owns a separate chunk list whose first chunk
 // lives at a fixed offset; further chunks come from the `alloc` cursor.
 void main() {
    BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
    // Highest byte offset at which one more instance still fits in the chunk.
    uint chunk_limit = chunk_ref.offset + BIN_INITIAL_ALLOC - BinInstance_size;
    uint chunk_n = 0;
    BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
    while (true) {
        // One invocation claims the next batch and broadcasts it via shared memory.
        if (gl_LocalInvocationID.x == 0) {
            sh_my_tile = atomicAdd(tile_ix, 1);
        }
        barrier();
        uint my_tile = sh_my_tile;
        // The value is shared, so the whole workgroup leaves the loop together.
        if (my_tile * N_TILE >= n_elements) {
            break;
        }

        // Clear this invocation's bin column in every bitmap slice.
        for (uint i = 0; i < N_SLICE; i++) {
            bitmaps[i][gl_LocalInvocationID.x] = 0;
        }
        barrier();

        // Read inputs and determine coverage of bins
        uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x;
        int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
        // The last batch may be only partially filled. Invocations past the
        // end of the element list must not read the annotated buffer; they
        // keep an empty bbox and therefore set no bits below.
        if (element_ix < n_elements) {
            AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
            uint tag = Annotated_tag(ref);
            switch (tag) {
            case Annotated_Line:
                AnnoLineSeg line = Annotated_Line_read(ref);
                x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
                y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
                x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
                y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
                break;
            case Annotated_Fill:
            case Annotated_Stroke:
                // Note: we take advantage of the fact that fills and strokes
                // have compatible layout.
                AnnoFill fill = Annotated_Fill_read(ref);
                x0 = int(floor(fill.bbox.x * SX));
                y0 = int(floor(fill.bbox.y * SY));
                x1 = int(ceil(fill.bbox.z * SX));
                y1 = int(ceil(fill.bbox.w * SY));
                break;
            }
        }

        // At this point, we run an iterator over the coverage area,
        // trying to keep divergence low.
        // Right now, it's just a bbox, but we'll get finer with
        // segments.
        x0 = clamp(x0, 0, N_TILE_X);
        x1 = clamp(x1, x0, N_TILE_X);
        y0 = clamp(y0, 0, N_TILE_Y);
        y1 = clamp(y1, y0, N_TILE_Y);
        // A box with zero width covers nothing; make the loop below a no-op.
        if (x0 == x1) y1 = y0;
        int x = x0, y = y0;
        // Bit position of this invocation's element within the batch.
        uint my_slice = gl_LocalInvocationID.x / 32;
        uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
        while (y < y1) {
            atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask);
            x++;
            if (x == x1) {
                x = x0;
                y++;
            }
        }

        barrier();
        // Allocate output segments.
        uint element_count = 0;
        for (uint i = 0; i < N_SLICE; i++) {
            element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        }
        // element_count is number of elements covering bin for this invocation.
        if (element_count > 0 && chunk_n > 0) {
            // The bin already holds instances from an earlier batch: close the
            // current chunk and start a new one, either directly behind the
            // existing instances or, if too little room is left, in a fresh
            // BIN_ALLOC-byte block taken from the shared allocator.
            uint new_chunk = instance_ref.offset;
            if (new_chunk + min(32, element_count * 4) > chunk_limit) {
                new_chunk = atomicAdd(alloc, BIN_ALLOC);
                chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
            }
            BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
            chunk_ref = BinChunkRef(new_chunk);
            instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
            chunk_n = 0;
        }
        // TODO: allocate output here

        // Iterate over bits set.
        uint slice_ix = 0;
        uint bitmap = bitmaps[0][gl_LocalInvocationID.x];
        while (true) {
            if (bitmap == 0) {
                // Current slice exhausted; advance to the next non-empty one.
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x];
                if (bitmap == 0) {
                    continue;
                }
            }
            element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap);

            // At this point, element_ix refers to an element that covers this bin.

            // TODO: batch allocate based on element_count; this is divergent
            if (instance_ref.offset > chunk_limit) {
                // Chunk is full: link it to a newly allocated one and continue there.
                uint new_chunk = atomicAdd(alloc, BIN_ALLOC);
                BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
                chunk_ref = BinChunkRef(new_chunk);
                instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
                chunk_n = 0;
                chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
            }
            BinInstance_write(instance_ref, BinInstance(element_ix));
            chunk_n++;
            instance_ref.offset += BinInstance_size;
            // clear LSB
            bitmap &= bitmap - 1;
        }
    }
    // Terminate the list: the final chunk gets a null (0) next reference.
    BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0)));
 }
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
--- a/piet-gpu/shader/bins.h
+++ b/piet-gpu/shader/bins.h
@ -0,0 +1,60 @@
 // Code auto-generated by piet-gpu-derive
 // Typed handle to a BinInstance: a byte offset into the `bins` buffer.
 struct BinInstanceRef {
    uint offset;
 };
 // Typed handle to a BinChunk header: a byte offset into the `bins` buffer.
 struct BinChunkRef {
    uint offset;
 };
 // Decoded form of one bin entry (a single 32-bit word in `bins`).
 struct BinInstance {
    uint element_ix;
 };
 #define BinInstance_size 4
 // Returns a reference to the `index`-th BinInstance of an array starting at `ref`.
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
    uint byte_offset = ref.offset + index * BinInstance_size;
    return BinInstanceRef(byte_offset);
 }
 // Decoded form of a chunk header (two 32-bit words in `bins`):
 // the value stored in word 0 and a reference built from word 1.
 struct BinChunk {
    uint n;
    BinChunkRef next;
 };
 #define BinChunk_size 8
 // Returns a reference to the `index`-th BinChunk of an array starting at `ref`.
 BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
    uint byte_offset = ref.offset + index * BinChunk_size;
    return BinChunkRef(byte_offset);
 }
 // Loads the BinInstance stored at byte offset `ref.offset` in `bins`.
 BinInstance BinInstance_read(BinInstanceRef ref) {
    // `bins` is an array of 32-bit words, so convert bytes to a word index.
    uint word = ref.offset >> 2;
    return BinInstance(bins[word]);
 }
 // Stores `s` at byte offset `ref.offset` in `bins`.
 void BinInstance_write(BinInstanceRef ref, BinInstance s) {
    // `bins` is an array of 32-bit words, so convert bytes to a word index.
    uint word = ref.offset >> 2;
    bins[word] = s.element_ix;
 }
 // Loads the chunk header stored at byte offset `ref.offset` in `bins`:
 // word 0 is `n`, word 1 is the offset of the next chunk.
 BinChunk BinChunk_read(BinChunkRef ref) {
    uint word = ref.offset >> 2;
    uint count = bins[word];
    uint next_offset = bins[word + 1];
    return BinChunk(count, BinChunkRef(next_offset));
 }
 // Stores the chunk header `s` at byte offset `ref.offset` in `bins`:
 // word 0 receives `n`, word 1 the offset of the next chunk.
 void BinChunk_write(BinChunkRef ref, BinChunk s) {
    uint word = ref.offset >> 2;
    bins[word] = s.n;
    bins[word + 1] = s.next.offset;
 }
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -20,4 +20,6 @@ build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h
 build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
-build elements.spv: glsl elements.comp | scene.h state.h
+build elements.spv: glsl elements.comp | scene.h state.h annotated.h
 # binning.comp includes annotated.h, bins.h and setup.h; list all of them so
 # the shader is rebuilt when any of the headers changes.
 build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@ -1,3 +1,9 @@
 // The element processing stage, first in the pipeline.
 //
 // This stage is primarily about applying transforms and computing bounding
 // boxes. It is organized as a scan over the input elements, producing
 // annotated output elements.
 #version 450
 #extension GL_GOOGLE_include_directive : enable
@ -208,6 +214,13 @@ void main() {
            anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z);
            Annotated_Stroke_write(out_ref, anno_stroke);
            break;
        case Element_Fill:
            Fill fill = Element_Fill_read(this_ref);
            AnnoFill anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            anno_fill.bbox = st.bbox;
            Annotated_Fill_write(out_ref, anno_fill);
            break;
        default:
            Annotated_Nop_write(out_ref);
            break;
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -32,6 +32,8 @@ const K2_PER_TILE_SIZE: usize = 8;
 const N_CIRCLES: usize = 1;
 const N_WG: u32 = 16;
 pub fn render_scene(rc: &mut impl RenderContext) {
    let mut rng = rand::thread_rng();
    for _ in 0..N_CIRCLES {
@ -98,10 +100,10 @@ fn dump_scene(buf: &[u8]) {
 }
 // Debug helper: prints every non-zero word of `k1_buf` as `position: value`,
 // both in hexadecimal; zero words are skipped.
 #[allow(unused)]
-fn dump_k1_data(k1_buf: &[u32]) {
+pub fn dump_k1_data(k1_buf: &[u32]) {
    for i in 0..k1_buf.len() {
        if k1_buf[i] != 0 {
            // The added line prints `i * 4` (the byte offset of the 4-byte
            // word) where the removed line printed the word index `i`.
-            println!("{:4x}: {:8x}", i, k1_buf[i]);
+            println!("{:4x}: {:8x}", i * 4, k1_buf[i]);
        }
    }
 }
@ -114,10 +116,17 @@ pub struct Renderer<D: Device> {
    pub state_buf: D::Buffer,
    pub anno_buf: D::Buffer,
    pub bin_buf: D::Buffer,
    el_pipeline: D::Pipeline,
    el_ds: D::DescriptorSet,
    bin_pipeline: D::Pipeline,
    bin_ds: D::DescriptorSet,
    bin_alloc_buf_host: D::Buffer,
    bin_alloc_buf_dev: D::Buffer,
    /*
    k1_alloc_buf_host: D::Buffer,
    k1_alloc_buf_dev: D::Buffer,
@ -149,6 +158,9 @@ impl<D: Device> Renderer<D> {
        let host = MemFlags::host_coherent();
        let dev = MemFlags::device_local();
        let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
        println!("scene: {} elements", n_elements);
        let scene_buf = device
            .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
            .unwrap();
@ -159,6 +171,7 @@ impl<D: Device> Renderer<D> {
        let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
        let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
        let el_code = include_bytes!("../shader/elements.spv");
@ -169,8 +182,25 @@ impl<D: Device> Renderer<D> {
            &[],
        )?;
-        let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
+        let bin_alloc_buf_host = device.create_buffer(12, host)?;
-        println!("scene: {} elements", n_elements);
+        let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
        // TODO: constants
        let bin_alloc_start = 256 * 64 * N_WG;
        device
            .write_buffer(&bin_alloc_buf_host, &[
                n_elements as u32,
                0,
                bin_alloc_start,
            ])
            ?;
        let bin_code = include_bytes!("../shader/binning.spv");
        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
        let bin_ds = device.create_descriptor_set(
            &bin_pipeline,
            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
            &[],
        )?;
        /*
        let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
@ -253,14 +283,20 @@ impl<D: Device> Renderer<D> {
            image_dev,
            el_pipeline,
            el_ds,
            bin_pipeline,
            bin_ds,
            state_buf,
            anno_buf,
            bin_buf,
            bin_alloc_buf_host,
            bin_alloc_buf_dev,
            n_elements,
        })
    }
    pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
        cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
        cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(
            &self.image_dev,
@ -276,6 +312,13 @@ impl<D: Device> Renderer<D> {
        );
        cmd_buf.write_timestamp(&query_pool, 1);
        cmd_buf.memory_barrier();
        cmd_buf.dispatch(
            &self.bin_pipeline,
            &self.bin_ds,
            (N_WG, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 2);
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
    }
 }