From 47d2e0a756af10fb2f8375d6e80e4a929f561d46 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Mon, 24 May 2021 11:44:56 -0700
Subject: [PATCH 1/4] Add create_buffer_init method

Add a method to create a buffer with initial content, which requires
staging buffers under the hood.

This patch also changes the lower-level (Vulkan) interface to be closer
to the raw Vulkan call.
---
 piet-gpu-hal/examples/collatz.rs |   7 +-
 piet-gpu-hal/src/hub.rs          | 125 +++++++++++++++++++++++++++----
 piet-gpu-hal/src/lib.rs          |  48 +++++++++---
 piet-gpu-hal/src/vulkan.rs       |  94 +++++++++++++++++------
 piet-gpu/bin/cli.rs              |   3 +-
 piet-gpu/src/lib.rs              |  66 ++++++----------
 6 files changed, 246 insertions(+), 97 deletions(-)
diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index ddf6769..bde4084 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -7,12 +7,9 @@ fn main() {
     unsafe {
         let device = instance.device(None).unwrap();
         let session = hub::Session::new(device);
-        let usage = BufferUsage::MAP_READ | BufferUsage::MAP_WRITE | BufferUsage::STORAGE;
+        let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE;
         let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
-        let mut buffer = session
-            .create_buffer(std::mem::size_of_val(&src[..]) as u64, usage)
-            .unwrap();
-        buffer.write(&src).unwrap();
+        let buffer = session.create_buffer_init(&src, usage).unwrap();
         let code = include_bytes!("./shader/collatz.spv");
         let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap();
         let descriptor_set = session
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index d98108c..554a024 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -5,9 +5,11 @@
 //! negotiation of backends (Vulkan, DX12), but right now it's Vulkan-only.
 
 use std::any::Any;
+use std::convert::TryInto;
 use std::sync::{Arc, Mutex, Weak};
 
 use crate::vulkan;
+use crate::CmdBuf as CmdBufTrait;
 use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait;
 use crate::PipelineBuilder as PipelineBuilderTrait;
 use crate::{BufferUsage, Device, Error, GpuInfo, SamplerParams};
@@ -31,6 +33,8 @@ struct SessionInner {
     cmd_buf_pool: Mutex<Vec<(vulkan::CmdBuf, Fence)>>,
     /// Command buffers that are still pending (so resources can't be freed).
     pending: Mutex<Vec<SubmittedCmdBufInner>>,
+    /// A command buffer that is used for copying from staging buffers.
+    staging_cmd_buf: Mutex<Option<CmdBuf>>,
     gpu_info: GpuInfo,
 }
 
@@ -45,9 +49,12 @@ pub struct CmdBuf {
 pub struct SubmittedCmdBuf(Option<SubmittedCmdBufInner>, Weak<SessionInner>);
 
 struct SubmittedCmdBufInner {
+    // It's inconsistent, cmd_buf is unpacked, staging_cmd_buf isn't. Probably
+    // better to chose one or the other.
     cmd_buf: vulkan::CmdBuf,
     fence: Fence,
     resources: Vec<Box<dyn Any>>,
+    staging_cmd_buf: Option<CmdBuf>,
 }
 
 #[derive(Clone)]
@@ -70,6 +77,20 @@ pub struct PipelineBuilder(vulkan::PipelineBuilder);
 
 pub struct DescriptorSetBuilder(vulkan::DescriptorSetBuilder);
 
+/// Data types that can be stored in a GPU buffer.
+pub unsafe trait PlainData {}
+
+unsafe impl PlainData for u8 {}
+unsafe impl PlainData for u16 {}
+unsafe impl PlainData for u32 {}
+unsafe impl PlainData for u64 {}
+unsafe impl PlainData for i8 {}
+unsafe impl PlainData for i16 {}
+unsafe impl PlainData for i32 {}
+unsafe impl PlainData for i64 {}
+unsafe impl PlainData for f32 {}
+unsafe impl PlainData for f64 {}
+
 impl Session {
     pub fn new(device: vulkan::VkDevice) -> Session {
         let gpu_info = device.query_gpu_info();
@@ -78,6 +99,7 @@ impl Session {
             gpu_info,
             cmd_buf_pool: Default::default(),
             pending: Default::default(),
+            staging_cmd_buf: Default::default(),
         }))
     }
 
@@ -107,12 +129,13 @@ impl Session {
                     let item = pending.swap_remove(i);
                     // TODO: wait is superfluous, can just reset
                     let _ = self.0.device.wait_and_reset(&[item.fence]);
-                    self.0
-                        .cmd_buf_pool
-                        .lock()
-                        .unwrap()
-                        .push((item.cmd_buf, item.fence));
+                    let mut pool = self.0.cmd_buf_pool.lock().unwrap();
+                    pool.push((item.cmd_buf, item.fence));
                     std::mem::drop(item.resources);
+                    if let Some(staging_cmd_buf) = item.staging_cmd_buf {
+                        pool.push((staging_cmd_buf.cmd_buf, staging_cmd_buf.fence));
+                        std::mem::drop(staging_cmd_buf.resources);
+                    }
                 } else {
                     i += 1;
                 }
@@ -126,8 +149,19 @@ impl Session {
         wait_semaphores: &[Semaphore],
         signal_semaphores: &[Semaphore],
     ) -> Result<SubmittedCmdBuf, Error> {
-        self.0.device.run_cmd_buf(
-            &cmd_buf.cmd_buf,
+        // Again, SmallVec here?
+        let mut cmd_bufs = Vec::with_capacity(2);
+        let mut staging_cmd_buf = self.0.staging_cmd_buf.lock().unwrap().take();
+        if let Some(staging) = &mut staging_cmd_buf {
+            // With finer grained resource tracking, we might be able to avoid this in
+            // some cases.
+            staging.memory_barrier();
+            staging.finish();
+            cmd_bufs.push(&staging.cmd_buf);
+        }
+        cmd_bufs.push(&cmd_buf.cmd_buf);
+        self.0.device.run_cmd_bufs(
+            &cmd_bufs,
             wait_semaphores,
             signal_semaphores,
             Some(&cmd_buf.fence),
@@ -137,6 +171,7 @@ impl Session {
                 cmd_buf: cmd_buf.cmd_buf,
                 fence: cmd_buf.fence,
                 resources: cmd_buf.resources,
+                staging_cmd_buf,
             }),
             cmd_buf.session,
         ))
@@ -150,11 +185,58 @@ impl Session {
         })))
     }
 
-    pub unsafe fn create_image2d(
+    /// Create a buffer with initialized data.
+    pub fn create_buffer_init(
         &self,
-        width: u32,
-        height: u32,
-    ) -> Result<Image, Error> {
+        contents: &[impl PlainData],
+        usage: BufferUsage,
+    ) -> Result<Buffer, Error> {
+        unsafe {
+            self.create_buffer_init_raw(
+                contents.as_ptr() as *const u8,
+                std::mem::size_of_val(contents).try_into()?,
+                usage,
+            )
+        }
+    }
+
+    /// Create a buffer with initialized data, from a raw pointer memory region.
+    pub unsafe fn create_buffer_init_raw(
+        &self,
+        contents: *const u8,
+        size: u64,
+        usage: BufferUsage,
+    ) -> Result<Buffer, Error> {
+        let use_staging_buffer = !usage.intersects(BufferUsage::MAP_READ | BufferUsage::MAP_WRITE)
+            && self.gpu_info().use_staging_buffers;
+        let create_usage = if use_staging_buffer {
+            BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC
+        } else {
+            usage | BufferUsage::MAP_WRITE
+        };
+        let create_buf = self.create_buffer(size, create_usage)?;
+        self.0
+            .device
+            .write_buffer(&create_buf.vk_buffer(), contents, 0, size)?;
+        if use_staging_buffer {
+            let buf = self.create_buffer(size, usage | BufferUsage::COPY_DST)?;
+            let mut staging_cmd_buf = self.0.staging_cmd_buf.lock().unwrap();
+            if staging_cmd_buf.is_none() {
+                let mut cmd_buf = self.cmd_buf()?;
+                cmd_buf.begin();
+                *staging_cmd_buf = Some(cmd_buf);
+            }
+            let staging_cmd_buf = staging_cmd_buf.as_mut().unwrap();
+            // This will ensure the staging buffer is deallocated
+            staging_cmd_buf.add_resource(&create_buf);
+            staging_cmd_buf.copy_buffer(create_buf.vk_buffer(), buf.vk_buffer());
+            Ok(buf)
+        } else {
+            Ok(create_buf)
+        }
+    }
+
+    pub unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Image, Error> {
         let image = self.0.device.create_image2d(width, height)?;
         Ok(Image(Arc::new(ImageInner {
             image,
@@ -305,16 +387,29 @@ impl Buffer {
         &self.0.buffer
     }
 
-    pub unsafe fn write<T: Sized>(&mut self, contents: &[T]) -> Result<(), Error> {
+    pub unsafe fn write<T: PlainData>(&mut self, contents: &[T]) -> Result<(), Error> {
         if let Some(session) = Weak::upgrade(&self.0.session) {
-            session.device.write_buffer(&self.0.buffer, contents)?;
+            session.device.write_buffer(
+                &self.0.buffer,
+                contents.as_ptr() as *const u8,
+                0,
+                std::mem::size_of_val(contents).try_into()?,
+            )?;
         }
         // else session lost error?
         Ok(())
     }
-    pub unsafe fn read<T: Sized>(&self, result: &mut Vec<T>) -> Result<(), Error> {
+    pub unsafe fn read<T: PlainData>(&self, result: &mut Vec<T>) -> Result<(), Error> {
+        let size = self.vk_buffer().size;
+        let len = size as usize / std::mem::size_of::<T>();
+        if len > result.len() {
+            result.reserve(len - result.len());
+        }
         if let Some(session) = Weak::upgrade(&self.0.session) {
-            session.device.read_buffer(&self.0.buffer, result)?;
+            session
+                .device
+                .read_buffer(&self.0.buffer, result.as_mut_ptr() as *mut u8, 0, size)?;
+            result.set_len(len);
         }
         // else session lost error?
         Ok(())
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index ce7c6d7..323f5e5 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -2,7 +2,6 @@
 ///
 /// This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu.
 /// In time, it may go away and be replaced by either gfx-hal or wgpu.
-
 use bitflags::bitflags;
 
 pub mod hub;
@@ -66,6 +65,8 @@ pub struct GpuInfo {
     pub subgroup_size: Option<SubgroupSize>,
     /// The GPU supports a real, grown-ass memory model.
     pub has_memory_model: bool,
+    /// Whether staging buffers should be used.
+    pub use_staging_buffers: bool,
 }
 
 #[derive(Clone, Debug)]
@@ -103,11 +104,7 @@ pub trait Device: Sized {
     /// Maybe doesn't need result return?
     unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>;
 
-    unsafe fn create_image2d(
-        &self,
-        width: u32,
-        height: u32,
-    ) -> Result<Self::Image, Error>;
+    unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Self::Image, Error>;
 
     /// Destroy an image.
     ///
@@ -175,6 +172,7 @@ pub trait Device: Sized {
     /// All submitted commands that refer to this query pool must have completed.
     unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error>;
 
+    #[deprecated]
     unsafe fn run_cmd_buf(
         &self,
         cmd_buf: &Self::CmdBuf,
@@ -183,16 +181,44 @@ pub trait Device: Sized {
         fence: Option<&Self::Fence>,
     ) -> Result<(), Error>;
 
-    unsafe fn read_buffer<T: Sized>(
+    unsafe fn run_cmd_bufs(
         &self,
-        buffer: &Self::Buffer,
-        result: &mut Vec<T>,
+        cmd_buf: &[&Self::CmdBuf],
+        wait_semaphores: &[Self::Semaphore],
+        signal_semaphores: &[Self::Semaphore],
+        fence: Option<&Self::Fence>,
     ) -> Result<(), Error>;
 
-    unsafe fn write_buffer<T: Sized>(
+    /// Copy data from the buffer to memory.
+    ///
+    /// Discussion question: add offset?
+    ///
+    /// # Safety
+    ///
+    /// The buffer must be valid to access. The destination memory must be valid to
+    /// write to. The ranges must not overlap. The offset + size must be within
+    /// the buffer's allocation, and size within the destination.
+    unsafe fn read_buffer(
         &self,
         buffer: &Self::Buffer,
-        contents: &[T],
+        dst: *mut u8,
+        offset: u64,
+        size: u64,
+    ) -> Result<(), Error>;
+
+    /// Copy data from memory to the buffer.
+    ///
+    /// # Safety
+    ///
+    /// The buffer must be valid to access. The source memory must be valid to
+    /// read from. The ranges must not overlap. The offset + size must be within
+    /// the buffer's allocation, and size within the source.
+    unsafe fn write_buffer(
+        &self,
+        buffer: &Self::Buffer,
+        contents: *const u8,
+        offset: u64,
+        size: u64,
     ) -> Result<(), Error>;
 
     unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error>;
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 8d2700f..959e330 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -1,6 +1,7 @@
 //! Vulkan implemenation of HAL trait.
 
 use std::borrow::Cow;
+use std::convert::TryInto;
 use std::ffi::{CStr, CString};
 use std::os::raw::c_char;
 use std::sync::Arc;
@@ -9,7 +10,9 @@ use ash::extensions::{ext::DebugUtils, khr};
 use ash::version::{DeviceV1_0, EntryV1_0, InstanceV1_0, InstanceV1_1};
 use ash::{vk, Device, Entry, Instance};
 
-use crate::{BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize};
+use crate::{
+    BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
+};
 
 pub struct VkInstance {
     /// Retain the dynamic lib.
@@ -60,7 +63,8 @@ pub struct VkSwapchain {
 pub struct Buffer {
     buffer: vk::Buffer,
     buffer_memory: vk::DeviceMemory,
-    size: u64,
+    // TODO: there should probably be a Buffer trait and this should be a method.
+    pub size: u64,
 }
 
 pub struct Image {
@@ -339,6 +343,14 @@ impl VkInstance {
             None
         };
 
+        // The question of when and when not to use staging buffers is complex, and this
+        // is only a first approximation. Basically, it *must* be false when buffers can
+        // be created with a memory type that is not host-visible. That is not guaranteed
+        // here but is likely to be the case.
+        //
+        // I'm still investigating what should be done in systems with Resizable BAR.
+        let use_staging_buffers = props.device_type != vk::PhysicalDeviceType::INTEGRATED_GPU;
+
         // TODO: finer grained query of specific subgroup info.
         let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
         let gpu_info = GpuInfo {
@@ -346,6 +358,7 @@ impl VkInstance {
             has_subgroups,
             subgroup_size,
             has_memory_model,
+            use_staging_buffers,
         };
 
         Ok(VkDevice {
@@ -511,11 +524,7 @@ impl crate::Device for VkDevice {
         Ok(())
     }
 
-    unsafe fn create_image2d(
-        &self,
-        width: u32,
-        height: u32,
-    ) -> Result<Self::Image, Error> {
+    unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Self::Image, Error> {
         let device = &self.device.device;
         let extent = vk::Extent3D {
             width,
@@ -726,37 +735,78 @@ impl crate::Device for VkDevice {
         Ok(())
     }
 
-    unsafe fn read_buffer<T: Sized>(
+    /// Run the command buffers.
+    ///
+    /// This submits the command buffer for execution. The provided fence
+    /// is signalled when the execution is complete.
+    unsafe fn run_cmd_bufs(
         &self,
-        buffer: &Buffer,
-        result: &mut Vec<T>,
+        cmd_bufs: &[&CmdBuf],
+        wait_semaphores: &[Self::Semaphore],
+        signal_semaphores: &[Self::Semaphore],
+        fence: Option<&Self::Fence>,
     ) -> Result<(), Error> {
         let device = &self.device.device;
+
+        let fence = match fence {
+            Some(fence) => *fence,
+            None => vk::Fence::null(),
+        };
+        let wait_stages = wait_semaphores
+            .iter()
+            .map(|_| vk::PipelineStageFlags::ALL_COMMANDS)
+            .collect::<Vec<_>>();
+        // Use SmallVec or similar here to reduce allocation?
+        let cmd_bufs = cmd_bufs.iter().map(|c| c.cmd_buf).collect::<Vec<_>>();
+        device.queue_submit(
+            self.queue,
+            &[vk::SubmitInfo::builder()
+                .command_buffers(&cmd_bufs)
+                .wait_semaphores(wait_semaphores)
+                .wait_dst_stage_mask(&wait_stages)
+                .signal_semaphores(signal_semaphores)
+                .build()],
+            fence,
+        )?;
+        Ok(())
+    }
+
+    unsafe fn read_buffer(
+        &self,
+        buffer: &Self::Buffer,
+        dst: *mut u8,
+        offset: u64,
+        size: u64,
+    ) -> Result<(), Error> {
+        let copy_size = size.try_into()?;
+        let device = &self.device.device;
         let buf = device.map_memory(
             buffer.buffer_memory,
-            0,
-            buffer.size,
+            offset,
+            size,
             vk::MemoryMapFlags::empty(),
         )?;
-        let len = buffer.size as usize / std::mem::size_of::<T>();
-        if len > result.len() {
-            result.reserve(len - result.len());
-        }
-        std::ptr::copy_nonoverlapping(buf as *const T, result.as_mut_ptr(), len);
-        result.set_len(len);
+        std::ptr::copy_nonoverlapping(buf as *const u8, dst, copy_size);
         device.unmap_memory(buffer.buffer_memory);
         Ok(())
     }
 
-    unsafe fn write_buffer<T: Sized>(&self, buffer: &Buffer, contents: &[T]) -> Result<(), Error> {
+    unsafe fn write_buffer(
+        &self,
+        buffer: &Buffer,
+        contents: *const u8,
+        offset: u64,
+        size: u64,
+    ) -> Result<(), Error> {
+        let copy_size = size.try_into()?;
         let device = &self.device.device;
         let buf = device.map_memory(
             buffer.buffer_memory,
-            0,
-            std::mem::size_of_val(contents) as u64,
+            offset,
+            size,
             vk::MemoryMapFlags::empty(),
         )?;
-        std::ptr::copy_nonoverlapping(contents.as_ptr(), buf as *mut T, contents.len());
+        std::ptr::copy_nonoverlapping(contents, buf as *mut u8, copy_size);
         device.unmap_memory(buffer.buffer_memory);
         Ok(())
     }
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 53201f8..69e0fc4 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -254,8 +254,7 @@ fn main() -> Result<(), Error> {
 
         let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
         let image_usage = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
-        let image_buf =
-            session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?;
+        let image_buf = session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?;
 
         cmd_buf.begin();
         renderer.record(&mut cmd_buf, &query_pool);
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 26b3c44..3c87f68 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -189,16 +189,18 @@ pub fn dump_k1_data(k1_buf: &[u32]) {
 pub struct Renderer {
     pub image_dev: hub::Image, // resulting image
 
-    scene_buf_host: hub::Buffer,
-    scene_buf_dev: hub::Buffer,
+    // The reference is held by the pipelines. We will be changing
+    // this to make the scene upload dynamic.
+    #[allow(dead_code)]
+    scene_buf: hub::Buffer,
 
     memory_buf_host: hub::Buffer,
     memory_buf_dev: hub::Buffer,
 
     state_buf: hub::Buffer,
 
-    config_buf_host: hub::Buffer,
-    config_buf_dev: hub::Buffer,
+    #[allow(dead_code)]
+    config_buf: hub::Buffer,
 
     el_pipeline: hub::Pipeline,
     el_ds: hub::DescriptorSet,
@@ -246,21 +248,11 @@ impl Renderer {
             n_elements, n_paths, n_pathseg, n_trans
         );
 
-        let mut scene_buf_host = session
-            .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host_upload)
-            .unwrap();
-        let scene_buf_dev = session
-            .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
-            .unwrap();
-        scene_buf_host.write(&scene)?;
+        let scene_buf = session.create_buffer_init(&scene[..], dev).unwrap();
 
         let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?;
         let image_dev = session.create_image2d(WIDTH as u32, HEIGHT as u32)?;
 
-        const CONFIG_SIZE: u64 = 10 * 4; // Size of Config in setup.h.
-        let mut config_buf_host = session.create_buffer(CONFIG_SIZE, host_upload)?;
-        let config_buf_dev = session.create_buffer(CONFIG_SIZE, dev)?;
-
         // TODO: constants
         const PATH_SIZE: usize = 12;
         const BIN_SIZE: usize = 8;
@@ -280,7 +272,7 @@ impl Renderer {
         alloc += (n_paths * ANNO_SIZE + 3) & !3;
         let trans_base = alloc;
         alloc += (n_trans * TRANS_SIZE + 3) & !3;
-        config_buf_host.write(&[
+        let config = &[
             n_paths as u32,
             n_pathseg as u32,
             WIDTH_IN_TILES as u32,
@@ -291,8 +283,11 @@ impl Renderer {
             pathseg_base as u32,
             anno_base as u32,
             trans_base as u32,
-        ])?;
+        ];
+        let config_buf = session.create_buffer_init(&config[..], dev).unwrap();
 
+        // Perhaps we could avoid the explicit staging buffer by having buffer creation method
+        // that takes both initial contents and a size.
         let mut memory_buf_host = session.create_buffer(2 * 4, host_upload)?;
         let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?;
         memory_buf_host.write(&[alloc as u32, 0 /* Overflow flag */])?;
@@ -301,36 +296,34 @@ impl Renderer {
         let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?;
         let el_ds = session.create_simple_descriptor_set(
             &el_pipeline,
-            &[&memory_buf_dev, &config_buf_dev, &scene_buf_dev, &state_buf],
+            &[&memory_buf_dev, &config_buf, &scene_buf, &state_buf],
         )?;
 
         let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv");
         let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?;
         let tile_ds = session
-            .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
+            .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         let path_alloc_code = include_bytes!("../shader/path_coarse.spv");
         let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?;
         let path_ds = session
-            .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
+            .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv");
         let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
-        let backdrop_ds = session.create_simple_descriptor_set(
-            &backdrop_pipeline,
-            &[&memory_buf_dev, &config_buf_dev],
-        )?;
+        let backdrop_ds = session
+            .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         // TODO: constants
         let bin_code = include_bytes!("../shader/binning.spv");
         let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?;
-        let bin_ds = session
-            .create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
+        let bin_ds =
+            session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         let coarse_code = include_bytes!("../shader/coarse.spv");
         let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?;
         let coarse_ds = session
-            .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf_dev])?;
+            .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf])?;
 
         let bg_image = Self::make_test_bg_image(&session);
 
@@ -358,19 +351,17 @@ impl Renderer {
             .create_compute_pipeline(&session, k4_code)?;
         let k4_ds = session
             .descriptor_set_builder()
-            .add_buffers(&[&memory_buf_dev, &config_buf_dev])
+            .add_buffers(&[&memory_buf_dev, &config_buf])
             .add_images(&[&image_dev])
             .add_textures(&[&bg_image])
             .build(&session, &k4_pipeline)?;
 
         Ok(Renderer {
-            scene_buf_host,
-            scene_buf_dev,
+            scene_buf,
             memory_buf_host,
             memory_buf_dev,
             state_buf,
-            config_buf_host,
-            config_buf_dev,
+            config_buf,
             image_dev,
             el_pipeline,
             el_ds,
@@ -394,14 +385,6 @@ impl Renderer {
     }
 
     pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) {
-        cmd_buf.copy_buffer(
-            self.scene_buf_host.vk_buffer(),
-            self.scene_buf_dev.vk_buffer(),
-        );
-        cmd_buf.copy_buffer(
-            self.config_buf_host.vk_buffer(),
-            self.config_buf_dev.vk_buffer(),
-        );
         cmd_buf.copy_buffer(
             self.memory_buf_host.vk_buffer(),
             self.memory_buf_dev.vk_buffer(),
@@ -488,8 +471,7 @@ impl Renderer {
             let host_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
             let mut buffer = session.create_buffer(buf.len() as u64, host_upload)?;
             buffer.write(buf)?;
-            let image =
-                session.create_image2d(width.try_into()?, height.try_into()?)?;
+            let image = session.create_image2d(width.try_into()?, height.try_into()?)?;
             let mut cmd_buf = session.cmd_buf()?;
             cmd_buf.begin();
             cmd_buf.image_barrier(

From 0cc72d97651d0d39459462a1b75b3f07903787ca Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Mon, 24 May 2021 13:56:24 -0700
Subject: [PATCH 2/4] Reduce allocations for retaining resources

Use an enum instead of Box<dyn Any> for resources to be retained until
command buffer completion, and allow both references (which will be
cloned) and owned resources (useful for staging buffers).
---
 piet-gpu-hal/src/hub.rs | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 554a024..116939c 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -4,7 +4,6 @@
 //! It is likely that it will also take care of compile time and runtime
 //! negotiation of backends (Vulkan, DX12), but right now it's Vulkan-only.
 
-use std::any::Any;
 use std::convert::TryInto;
 use std::sync::{Arc, Mutex, Weak};
 
@@ -41,7 +40,7 @@ struct SessionInner {
 pub struct CmdBuf {
     cmd_buf: vulkan::CmdBuf,
     fence: Fence,
-    resources: Vec<Box<dyn Any>>,
+    resources: Vec<RetainResource>,
     session: Weak<SessionInner>,
 }
 
@@ -53,7 +52,7 @@ struct SubmittedCmdBufInner {
     // better to chose one or the other.
     cmd_buf: vulkan::CmdBuf,
     fence: Fence,
-    resources: Vec<Box<dyn Any>>,
+    resources: Vec<RetainResource>,
     staging_cmd_buf: Option<CmdBuf>,
 }
 
@@ -91,6 +90,12 @@ unsafe impl PlainData for i64 {}
 unsafe impl PlainData for f32 {}
 unsafe impl PlainData for f64 {}
 
+/// A resource to retain during the lifetime of a command submission.
+pub enum RetainResource {
+    Buffer(Buffer),
+    Image(Image),
+}
+
 impl Session {
     pub fn new(device: vulkan::VkDevice) -> Session {
         let gpu_info = device.query_gpu_info();
@@ -227,9 +232,9 @@ impl Session {
                 *staging_cmd_buf = Some(cmd_buf);
             }
             let staging_cmd_buf = staging_cmd_buf.as_mut().unwrap();
-            // This will ensure the staging buffer is deallocated
-            staging_cmd_buf.add_resource(&create_buf);
+            // This will ensure the staging buffer is deallocated. It would be nice to
             staging_cmd_buf.copy_buffer(create_buf.vk_buffer(), buf.vk_buffer());
+            staging_cmd_buf.add_resource(create_buf);
             Ok(buf)
         } else {
             Ok(create_buf)
@@ -307,8 +312,8 @@ impl CmdBuf {
     /// There are two choices for upholding the lifetime invariant: this function, or
     /// the caller can manually hold the reference. The latter is appropriate when it's
     /// part of retained state.
-    pub fn add_resource<T: Clone + 'static>(&mut self, resource: &T) {
-        self.resources.push(Box::new(resource.clone()));
+    pub fn add_resource(&mut self, resource: impl Into<RetainResource>) {
+        self.resources.push(resource.into());
     }
 }
 
@@ -546,3 +551,21 @@ impl<'a, T> IntoRefs<'a, T> for Vec<&'a T> {
         self.into_iter()
     }
 }
+
+impl From<Buffer> for RetainResource {
+    fn from(buf: Buffer) -> Self {
+        RetainResource::Buffer(buf)
+    }
+}
+
+impl From<Image> for RetainResource {
+    fn from(img: Image) -> Self {
+        RetainResource::Image(img)
+    }
+}
+
+impl<'a, T: Into<RetainResource>> From<&'a T> for RetainResource {
+    fn from(resource: &'a T) -> Self {
+        resource.clone().into()
+    }
+}

From 22935fccc62e4de81ab1b7f5178e7fced3b0d574 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Mon, 24 May 2021 14:25:13 -0700
Subject: [PATCH 3/4] Use const generics for IntoRefs

Yay! Now we can use an array of any size, slice, or vector.
---
 piet-gpu-hal/src/hub.rs | 38 +-------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 116939c..2ad9651 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -502,43 +502,7 @@ impl<'a, T> IntoRefs<'a, T> for &'a [&'a T] {
     }
 }
 
-// TODO: this will benefit from const generics!
-impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 1] {
-    type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
-    fn into_refs(self) -> Self::Iterator {
-        self.into_iter().copied()
-    }
-}
-
-impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 2] {
-    type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
-    fn into_refs(self) -> Self::Iterator {
-        self.into_iter().copied()
-    }
-}
-
-impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 3] {
-    type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
-    fn into_refs(self) -> Self::Iterator {
-        self.into_iter().copied()
-    }
-}
-
-impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 4] {
-    type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
-    fn into_refs(self) -> Self::Iterator {
-        self.into_iter().copied()
-    }
-}
-
-impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 5] {
-    type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
-    fn into_refs(self) -> Self::Iterator {
-        self.into_iter().copied()
-    }
-}
-
-impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 6] {
+impl<'a, T, const N: usize> IntoRefs<'a, T> for &'a [&'a T; N] {
     type Iterator = std::iter::Copied<std::slice::Iter<'a, &'a T>>;
     fn into_refs(self) -> Self::Iterator {
         self.into_iter().copied()

From 174c81ec09927624ceacf1b3b432e49644ca2318 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Mon, 24 May 2021 15:42:25 -0700
Subject: [PATCH 4/4] Cleanup

Fix bound on blanket RetainResource impl. Clean up run_cmd_buf.
---
 piet-gpu-hal/src/hub.rs    |  2 +-
 piet-gpu-hal/src/lib.rs    |  9 ---------
 piet-gpu-hal/src/vulkan.rs | 34 ----------------------------------
 3 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 2ad9651..3238376 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -528,7 +528,7 @@ impl From<Image> for RetainResource {
     }
 }
 
-impl<'a, T: Into<RetainResource>> From<&'a T> for RetainResource {
+impl<'a, T: Clone + Into<RetainResource>> From<&'a T> for RetainResource {
     fn from(resource: &'a T) -> Self {
         resource.clone().into()
     }
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 323f5e5..ed27dd5 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -172,15 +172,6 @@ pub trait Device: Sized {
     /// All submitted commands that refer to this query pool must have completed.
     unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error>;
 
-    #[deprecated]
-    unsafe fn run_cmd_buf(
-        &self,
-        cmd_buf: &Self::CmdBuf,
-        wait_semaphores: &[Self::Semaphore],
-        signal_semaphores: &[Self::Semaphore],
-        fence: Option<&Self::Fence>,
-    ) -> Result<(), Error>;
-
     unsafe fn run_cmd_bufs(
         &self,
         cmd_buf: &[&Self::CmdBuf],
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 959e330..94772e5 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -701,40 +701,6 @@ impl crate::Device for VkDevice {
         Ok(result)
     }
 
-    /// Run the command buffer.
-    ///
-    /// This submits the command buffer for execution. The provided fence
-    /// is signalled when the execution is complete.
-    unsafe fn run_cmd_buf(
-        &self,
-        cmd_buf: &CmdBuf,
-        wait_semaphores: &[Self::Semaphore],
-        signal_semaphores: &[Self::Semaphore],
-        fence: Option<&Self::Fence>,
-    ) -> Result<(), Error> {
-        let device = &self.device.device;
-
-        let fence = match fence {
-            Some(fence) => *fence,
-            None => vk::Fence::null(),
-        };
-        let wait_stages = wait_semaphores
-            .iter()
-            .map(|_| vk::PipelineStageFlags::ALL_COMMANDS)
-            .collect::<Vec<_>>();
-        device.queue_submit(
-            self.queue,
-            &[vk::SubmitInfo::builder()
-                .command_buffers(&[cmd_buf.cmd_buf])
-                .wait_semaphores(wait_semaphores)
-                .wait_dst_stage_mask(&wait_stages)
-                .signal_semaphores(signal_semaphores)
-                .build()],
-            fence,
-        )?;
-        Ok(())
-    }
-
     /// Run the command buffers.
     ///
     /// This submits the command buffer for execution. The provided fence