Start writing tiles

This is the first checkpoint where it actually runs a pipeline end to
end, though it's far from accurate.
This commit is contained in:
Raph Levien 2020-05-15 12:28:29 -07:00
parent 06cad48dca
commit 3a6428238b
6 changed files with 56 additions and 94 deletions

View file

@@ -41,7 +41,7 @@ fn main() -> Result<(), Error> {
let fence = device.create_fence(false)?; let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?; let mut cmd_buf = device.create_cmd_buf()?;
let query_pool = device.create_query_pool(4)?; let query_pool = device.create_query_pool(5)?;
let mut ctx = PietGpuRenderContext::new(); let mut ctx = PietGpuRenderContext::new();
render_scene(&mut ctx); render_scene(&mut ctx);
@@ -62,10 +62,11 @@ fn main() -> Result<(), Error> {
println!("Element kernel time: {:.3}ms", ts[0] * 1e3); println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
/* /*
let mut data: Vec<u32> = Default::default(); let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.bin_buf, &mut data).unwrap(); device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data); piet_gpu::dump_k1_data(&data);
let mut data: Vec<u32> = Default::default(); let mut data: Vec<u32> = Default::default();

View file

@@ -46,6 +46,17 @@ shared uint sh_bitmaps[N_SLICE][N_TILE];
#define SX (1.0 / float(TILE_WIDTH_PX)) #define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX)) #define SY (1.0 / float(TILE_HEIGHT_PX))
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset > cmd_limit) {
uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_ref, jump);
cmd_ref = CmdRef(new_cmd);
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
}
void main() { void main() {
// Could use either linear or 2d layouts for both dispatch and // Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract. // invocations within the workgroup. We'll use variables to abstract.
@@ -53,6 +64,13 @@ void main() {
// Top left coordinates of this bin. // Top left coordinates of this bin.
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
uint th_ix = gl_LocalInvocationID.x; uint th_ix = gl_LocalInvocationID.x;
uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
uint wr_ix = 0; uint wr_ix = 0;
uint rd_ix = 0; uint rd_ix = 0;
uint first_el; uint first_el;
@@ -172,6 +190,7 @@ void main() {
y++; y++;
} }
} }
barrier();
// Output elements for this tile, based on bitmaps. // Output elements for this tile, based on bitmaps.
uint slice_ix = 0; uint slice_ix = 0;
@@ -193,13 +212,25 @@ void main() {
// At this point, we read the element again from global memory. // At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into // If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag). // shared memory (or perhaps just the tag).
probe += 1; ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
switch (tag) {
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
break;
}
// clear LSB // clear LSB
bitmap &= bitmap - 1; bitmap &= bitmap - 1;
} }
rd_ix += N_TILE; rd_ix += N_TILE;
break;
} while (wr_ix > rd_ix); } while (wr_ix > rd_ix);
ptcl[bin_ix * N_TILE + th_ix] = probe;
} }

Binary file not shown.

View file

@@ -39,7 +39,8 @@ void main() {
uvec2 xy_uint = gl_GlobalInvocationID.xy; uvec2 xy_uint = gl_GlobalInvocationID.xy;
vec2 xy = vec2(xy_uint); vec2 xy = vec2(xy_uint);
vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT); vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
vec3 rgb = uv.xyy; //vec3 rgb = uv.xyy;
vec3 rgb = vec3(0.75);
while (true) { while (true) {
uint tag = Cmd_tag(cmd_ref); uint tag = Cmd_tag(cmd_ref);

Binary file not shown.

View file

@@ -30,7 +30,7 @@ const PTCL_INITIAL_ALLOC: usize = 1024;
const K2_PER_TILE_SIZE: usize = 8; const K2_PER_TILE_SIZE: usize = 8;
const N_CIRCLES: usize = 1; const N_CIRCLES: usize = 0;
const N_WG: u32 = 16; const N_WG: u32 = 16;
@@ -47,11 +47,13 @@ pub fn render_scene(rc: &mut impl RenderContext) {
rc.fill(circle, &color); rc.fill(circle, &color);
} }
let mut path = BezPath::new(); let mut path = BezPath::new();
/*
path.move_to((100.0, 1150.0)); path.move_to((100.0, 1150.0));
path.line_to((200.0, 1200.0)); path.line_to((200.0, 1200.0));
path.line_to((150.0, 1250.0)); path.line_to((150.0, 1250.0));
path.close_path(); path.close_path();
rc.fill(path, &Color::rgb8(128, 0, 128)); rc.fill(path, &Color::rgb8(128, 0, 128));
*/
rc.stroke( rc.stroke(
Line::new((100.0, 100.0), (200.0, 150.0)), Line::new((100.0, 100.0), (200.0, 150.0)),
&Color::WHITE, &Color::WHITE,
@@ -134,29 +136,9 @@ pub struct Renderer<D: Device> {
coarse_alloc_buf_host: D::Buffer, coarse_alloc_buf_host: D::Buffer,
coarse_alloc_buf_dev: D::Buffer, coarse_alloc_buf_dev: D::Buffer,
/*
k1_alloc_buf_host: D::Buffer,
k1_alloc_buf_dev: D::Buffer,
k2s_alloc_buf_host: D::Buffer,
k2s_alloc_buf_dev: D::Buffer,
k2f_alloc_buf_host: D::Buffer,
k2f_alloc_buf_dev: D::Buffer,
k3_alloc_buf_host: D::Buffer,
k3_alloc_buf_dev: D::Buffer,
tilegroup_buf: D::Buffer,
ptcl_buf: D::Buffer,
k1_pipeline: D::Pipeline,
k1_ds: D::DescriptorSet,
k2s_pipeline: D::Pipeline,
k2s_ds: D::DescriptorSet,
k2f_pipeline: D::Pipeline,
k2f_ds: D::DescriptorSet,
k3_pipeline: D::Pipeline,
k3_ds: D::DescriptorSet,
k4_pipeline: D::Pipeline, k4_pipeline: D::Pipeline,
k4_ds: D::DescriptorSet, k4_ds: D::DescriptorSet,
*/
n_elements: usize, n_elements: usize,
} }
@@ -213,10 +195,10 @@ impl<D: Device> Renderer<D> {
let coarse_alloc_buf_host = device.create_buffer(4, host)?; let coarse_alloc_buf_host = device.create_buffer(4, host)?;
let coarse_alloc_buf_dev = device.create_buffer(4, dev)?; let coarse_alloc_buf_dev = device.create_buffer(4, dev)?;
let coarse_alloc_start = 256 * 64 * N_WG; let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device device
.write_buffer(&coarse_alloc_buf_host, &[ .write_buffer(&coarse_alloc_buf_host, &[
coarse_alloc_start, coarse_alloc_start as u32,
]) ])
?; ?;
let coarse_code = include_bytes!("../shader/coarse.spv"); let coarse_code = include_bytes!("../shader/coarse.spv");
@@ -227,72 +209,11 @@ impl<D: Device> Renderer<D> {
&[], &[],
)?; )?;
/* // These will probably be combined with the ptcl buf, as they're all written by the
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?; // same kernel now.
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let k1_alloc_buf_host = device.create_buffer(4, host)?;
let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
let k1_code = include_bytes!("../shader/kernel1.spv");
let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?;
let k1_ds = device.create_descriptor_set(
&k1_pipeline,
&[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
&[],
)?;
let k2s_alloc_buf_host = device.create_buffer(4, host)?;
let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?;
let k2s_code = include_bytes!("../shader/kernel2s.spv");
let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?;
let k2s_ds = device.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
&[],
)?;
let k2f_alloc_buf_host = device.create_buffer(4, host)?;
let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?;
let k2f_code = include_bytes!("../shader/kernel2f.spv");
let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
let k2f_ds = device.create_descriptor_set(
&k2f_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&fill_seg_buf,
&k2f_alloc_buf_dev,
],
&[],
)?;
let k3_alloc_buf_host = device.create_buffer(4, host)?;
let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?;
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
let k3_ds = device.create_descriptor_set(
&k3_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&fill_seg_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
&[],
)?;
let k4_code = include_bytes!("../shader/kernel4.spv"); let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?; let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
let k4_ds = device.create_descriptor_set( let k4_ds = device.create_descriptor_set(
@@ -300,7 +221,6 @@ impl<D: Device> Renderer<D> {
&[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&ptcl_buf, &segment_buf, &fill_seg_buf],
&[&image_dev], &[&image_dev],
)?; )?;
*/
Ok(Renderer { Ok(Renderer {
scene_buf, scene_buf,
@@ -312,6 +232,8 @@ impl<D: Device> Renderer<D> {
bin_ds, bin_ds,
coarse_pipeline, coarse_pipeline,
coarse_ds, coarse_ds,
k4_pipeline,
k4_ds,
state_buf, state_buf,
anno_buf, anno_buf,
bin_buf, bin_buf,
@@ -339,7 +261,7 @@ impl<D: Device> Renderer<D> {
cmd_buf.dispatch( cmd_buf.dispatch(
&self.el_pipeline, &self.el_pipeline,
&self.el_ds, &self.el_ds,
((self.n_elements / 128) as u32, 1, 1), (((self.n_elements + 127) / 128) as u32, 1, 1),
); );
cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
@@ -357,6 +279,13 @@ impl<D: Device> Renderer<D> {
); );
cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k4_pipeline,
&self.k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
} }
} }