diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs index ddf6769..bde4084 100644 --- a/piet-gpu-hal/examples/collatz.rs +++ b/piet-gpu-hal/examples/collatz.rs @@ -7,12 +7,9 @@ fn main() { unsafe { let device = instance.device(None).unwrap(); let session = hub::Session::new(device); - let usage = BufferUsage::MAP_READ | BufferUsage::MAP_WRITE | BufferUsage::STORAGE; + let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE; let src = (0..256).map(|x| x + 1).collect::>(); - let mut buffer = session - .create_buffer(std::mem::size_of_val(&src[..]) as u64, usage) - .unwrap(); - buffer.write(&src).unwrap(); + let buffer = session.create_buffer_init(&src, usage).unwrap(); let code = include_bytes!("./shader/collatz.spv"); let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap(); let descriptor_set = session diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index 2a670e1..3238376 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -4,10 +4,11 @@ //! It is likely that it will also take care of compile time and runtime //! negotiation of backends (Vulkan, DX12), but right now it's Vulkan-only. -use std::any::Any; +use std::convert::TryInto; use std::sync::{Arc, Mutex, Weak}; use crate::vulkan; +use crate::CmdBuf as CmdBufTrait; use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait; use crate::PipelineBuilder as PipelineBuilderTrait; use crate::{BufferUsage, Device, Error, GpuInfo, SamplerParams}; @@ -31,13 +32,15 @@ struct SessionInner { cmd_buf_pool: Mutex>, /// Command buffers that are still pending (so resources can't be freed). pending: Mutex>, + /// A command buffer that is used for copying from staging buffers. + staging_cmd_buf: Mutex>, gpu_info: GpuInfo, } pub struct CmdBuf { cmd_buf: vulkan::CmdBuf, fence: Fence, - resources: Vec>, + resources: Vec, session: Weak, } @@ -45,9 +48,12 @@ pub struct CmdBuf { pub struct SubmittedCmdBuf(Option, Weak); struct SubmittedCmdBufInner { + // It's inconsistent, cmd_buf is unpacked, staging_cmd_buf isn't. Probably + // better to chose one or the other. cmd_buf: vulkan::CmdBuf, fence: Fence, - resources: Vec>, + resources: Vec, + staging_cmd_buf: Option, } #[derive(Clone)] @@ -70,6 +76,26 @@ pub struct PipelineBuilder(vulkan::PipelineBuilder); pub struct DescriptorSetBuilder(vulkan::DescriptorSetBuilder); +/// Data types that can be stored in a GPU buffer. +pub unsafe trait PlainData {} + +unsafe impl PlainData for u8 {} +unsafe impl PlainData for u16 {} +unsafe impl PlainData for u32 {} +unsafe impl PlainData for u64 {} +unsafe impl PlainData for i8 {} +unsafe impl PlainData for i16 {} +unsafe impl PlainData for i32 {} +unsafe impl PlainData for i64 {} +unsafe impl PlainData for f32 {} +unsafe impl PlainData for f64 {} + +/// A resource to retain during the lifetime of a command submission. +pub enum RetainResource { + Buffer(Buffer), + Image(Image), +} + impl Session { pub fn new(device: vulkan::VkDevice) -> Session { let gpu_info = device.query_gpu_info(); @@ -78,6 +104,7 @@ impl Session { gpu_info, cmd_buf_pool: Default::default(), pending: Default::default(), + staging_cmd_buf: Default::default(), })) } @@ -107,12 +134,13 @@ impl Session { let item = pending.swap_remove(i); // TODO: wait is superfluous, can just reset let _ = self.0.device.wait_and_reset(&[item.fence]); - self.0 - .cmd_buf_pool - .lock() - .unwrap() - .push((item.cmd_buf, item.fence)); + let mut pool = self.0.cmd_buf_pool.lock().unwrap(); + pool.push((item.cmd_buf, item.fence)); std::mem::drop(item.resources); + if let Some(staging_cmd_buf) = item.staging_cmd_buf { + pool.push((staging_cmd_buf.cmd_buf, staging_cmd_buf.fence)); + std::mem::drop(staging_cmd_buf.resources); + } } else { i += 1; } @@ -126,8 +154,19 @@ impl Session { wait_semaphores: &[Semaphore], signal_semaphores: &[Semaphore], ) -> Result { - self.0.device.run_cmd_buf( - &cmd_buf.cmd_buf, + // Again, SmallVec here? + let mut cmd_bufs = Vec::with_capacity(2); + let mut staging_cmd_buf = self.0.staging_cmd_buf.lock().unwrap().take(); + if let Some(staging) = &mut staging_cmd_buf { + // With finer grained resource tracking, we might be able to avoid this in + // some cases. + staging.memory_barrier(); + staging.finish(); + cmd_bufs.push(&staging.cmd_buf); + } + cmd_bufs.push(&cmd_buf.cmd_buf); + self.0.device.run_cmd_bufs( + &cmd_bufs, wait_semaphores, signal_semaphores, Some(&cmd_buf.fence), @@ -137,6 +176,7 @@ impl Session { cmd_buf: cmd_buf.cmd_buf, fence: cmd_buf.fence, resources: cmd_buf.resources, + staging_cmd_buf, }), cmd_buf.session, )) @@ -150,6 +190,57 @@ impl Session { }))) } + /// Create a buffer with initialized data. + pub fn create_buffer_init( + &self, + contents: &[impl PlainData], + usage: BufferUsage, + ) -> Result { + unsafe { + self.create_buffer_init_raw( + contents.as_ptr() as *const u8, + std::mem::size_of_val(contents).try_into()?, + usage, + ) + } + } + + /// Create a buffer with initialized data, from a raw pointer memory region. + pub unsafe fn create_buffer_init_raw( + &self, + contents: *const u8, + size: u64, + usage: BufferUsage, + ) -> Result { + let use_staging_buffer = !usage.intersects(BufferUsage::MAP_READ | BufferUsage::MAP_WRITE) + && self.gpu_info().use_staging_buffers; + let create_usage = if use_staging_buffer { + BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC + } else { + usage | BufferUsage::MAP_WRITE + }; + let create_buf = self.create_buffer(size, create_usage)?; + self.0 + .device + .write_buffer(&create_buf.vk_buffer(), contents, 0, size)?; + if use_staging_buffer { + let buf = self.create_buffer(size, usage | BufferUsage::COPY_DST)?; + let mut staging_cmd_buf = self.0.staging_cmd_buf.lock().unwrap(); + if staging_cmd_buf.is_none() { + let mut cmd_buf = self.cmd_buf()?; + cmd_buf.begin(); + *staging_cmd_buf = Some(cmd_buf); + } + let staging_cmd_buf = staging_cmd_buf.as_mut().unwrap(); + // This will ensure the staging buffer is deallocated. It would be nice to + staging_cmd_buf.copy_buffer(create_buf.vk_buffer(), buf.vk_buffer()); + staging_cmd_buf.add_resource(create_buf); + Ok(buf) + } else { + Ok(create_buf) + } + } + pub unsafe fn create_image2d(&self, width: u32, height: u32) -> Result { let image = self.0.device.create_image2d(width, height)?; Ok(Image(Arc::new(ImageInner { @@ -221,8 +312,8 @@ impl CmdBuf { /// There are two choices for upholding the lifetime invariant: this function, or /// the caller can manually hold the reference. The latter is appropriate when it's /// part of retained state. - pub fn add_resource(&mut self, resource: &T) { - self.resources.push(Box::new(resource.clone())); + pub fn add_resource(&mut self, resource: impl Into) { + self.resources.push(resource.into()); } } @@ -301,16 +392,29 @@ impl Buffer { &self.0.buffer } - pub unsafe fn write(&mut self, contents: &[T]) -> Result<(), Error> { + pub unsafe fn write(&mut self, contents: &[T]) -> Result<(), Error> { if let Some(session) = Weak::upgrade(&self.0.session) { - session.device.write_buffer(&self.0.buffer, contents)?; + session.device.write_buffer( + &self.0.buffer, + contents.as_ptr() as *const u8, + 0, + std::mem::size_of_val(contents).try_into()?, + )?; } // else session lost error? Ok(()) } - pub unsafe fn read(&self, result: &mut Vec) -> Result<(), Error> { + pub unsafe fn read(&self, result: &mut Vec) -> Result<(), Error> { + let size = self.vk_buffer().size; + let len = size as usize / std::mem::size_of::(); + if len > result.len() { + result.reserve(len - result.len()); + } if let Some(session) = Weak::upgrade(&self.0.session) { - session.device.read_buffer(&self.0.buffer, result)?; + session + .device + .read_buffer(&self.0.buffer, result.as_mut_ptr() as *mut u8, 0, size)?; + result.set_len(len); } // else session lost error? Ok(()) @@ -398,43 +502,7 @@ impl<'a, T> IntoRefs<'a, T> for &'a [&'a T] { } } -// TODO: this will benefit from const generics! -impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 1] { - type Iterator = std::iter::Copied>; - fn into_refs(self) -> Self::Iterator { - self.into_iter().copied() - } -} - -impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 2] { - type Iterator = std::iter::Copied>; - fn into_refs(self) -> Self::Iterator { - self.into_iter().copied() - } -} - -impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 3] { - type Iterator = std::iter::Copied>; - fn into_refs(self) -> Self::Iterator { - self.into_iter().copied() - } -} - -impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 4] { - type Iterator = std::iter::Copied>; - fn into_refs(self) -> Self::Iterator { - self.into_iter().copied() - } -} - -impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 5] { - type Iterator = std::iter::Copied>; - fn into_refs(self) -> Self::Iterator { - self.into_iter().copied() - } -} - -impl<'a, T> IntoRefs<'a, T> for &'a [&'a T; 6] { +impl<'a, T, const N: usize> IntoRefs<'a, T> for &'a [&'a T; N] { type Iterator = std::iter::Copied>; fn into_refs(self) -> Self::Iterator { self.into_iter().copied() @@ -447,3 +515,21 @@ impl<'a, T> IntoRefs<'a, T> for Vec<&'a T> { self.into_iter() } } + +impl From for RetainResource { + fn from(buf: Buffer) -> Self { + RetainResource::Buffer(buf) + } +} + +impl From for RetainResource { + fn from(img: Image) -> Self { + RetainResource::Image(img) + } +} + +impl<'a, T: Clone + Into> From<&'a T> for RetainResource { + fn from(resource: &'a T) -> Self { + resource.clone().into() + } +} diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index 16b5660..2fc7a8d 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -67,6 +67,8 @@ pub struct GpuInfo { pub subgroup_size: Option, /// The GPU supports a real, grown-ass memory model. pub has_memory_model: bool, + /// Whether staging buffers should be used. + pub use_staging_buffers: bool, } #[derive(Clone, Debug)] @@ -173,24 +175,44 @@ pub trait Device: Sized { /// All submitted commands that refer to this query pool must have completed. unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error>; - unsafe fn run_cmd_buf( + unsafe fn run_cmd_bufs( &self, - cmd_buf: &Self::CmdBuf, + cmd_buf: &[&Self::CmdBuf], wait_semaphores: &[Self::Semaphore], signal_semaphores: &[Self::Semaphore], fence: Option<&Self::Fence>, ) -> Result<(), Error>; - unsafe fn read_buffer( + /// Copy data from the buffer to memory. + /// + /// Discussion question: add offset? + /// + /// # Safety + /// + /// The buffer must be valid to access. The destination memory must be valid to + /// write to. The ranges must not overlap. The offset + size must be within + /// the buffer's allocation, and size within the destination. + unsafe fn read_buffer( &self, buffer: &Self::Buffer, - result: &mut Vec, + dst: *mut u8, + offset: u64, + size: u64, ) -> Result<(), Error>; - unsafe fn write_buffer( + /// Copy data from memory to the buffer. + /// + /// # Safety + /// + /// The buffer must be valid to access. The source memory must be valid to + /// read from. The ranges must not overlap. The offset + size must be within + /// the buffer's allocation, and size within the source. + unsafe fn write_buffer( &self, buffer: &Self::Buffer, - contents: &[T], + contents: *const u8, + offset: u64, + size: u64, ) -> Result<(), Error>; unsafe fn create_semaphore(&self) -> Result; diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 95ed9f5..b93b0c0 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -1,6 +1,7 @@ //! Vulkan implemenation of HAL trait. use std::borrow::Cow; +use std::convert::TryInto; use std::ffi::{CStr, CString}; use std::os::raw::c_char; use std::sync::Arc; @@ -62,7 +63,8 @@ pub struct VkSwapchain { pub struct Buffer { buffer: vk::Buffer, buffer_memory: vk::DeviceMemory, - size: u64, + // TODO: there should probably be a Buffer trait and this should be a method. + pub size: u64, } pub struct Image { @@ -341,6 +343,14 @@ impl VkInstance { None }; + // The question of when and when not to use staging buffers is complex, and this + // is only a first approximation. Basically, it *must* be false when buffers can + // be created with a memory type that is not host-visible. That is not guaranteed + // here but is likely to be the case. + // + // I'm still investigating what should be done in systems with Resizable BAR. + let use_staging_buffers = props.device_type != vk::PhysicalDeviceType::INTEGRATED_GPU; + // TODO: finer grained query of specific subgroup info. let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0); let gpu_info = GpuInfo { @@ -348,6 +358,7 @@ impl VkInstance { has_subgroups, subgroup_size, has_memory_model, + use_staging_buffers, }; Ok(VkDevice { @@ -691,13 +702,13 @@ impl crate::Device for VkDevice { Ok(result) } - /// Run the command buffer. + /// Run the command buffers. /// /// This submits the command buffer for execution. The provided fence /// is signalled when the execution is complete. - unsafe fn run_cmd_buf( + unsafe fn run_cmd_bufs( &self, - cmd_buf: &CmdBuf, + cmd_bufs: &[&CmdBuf], wait_semaphores: &[Self::Semaphore], signal_semaphores: &[Self::Semaphore], fence: Option<&Self::Fence>, @@ -712,10 +723,12 @@ impl crate::Device for VkDevice { .iter() .map(|_| vk::PipelineStageFlags::ALL_COMMANDS) .collect::>(); + // Use SmallVec or similar here to reduce allocation? + let cmd_bufs = cmd_bufs.iter().map(|c| c.cmd_buf).collect::>(); device.queue_submit( self.queue, &[vk::SubmitInfo::builder() - .command_buffers(&[cmd_buf.cmd_buf]) + .command_buffers(&cmd_bufs) .wait_semaphores(wait_semaphores) .wait_dst_stage_mask(&wait_stages) .signal_semaphores(signal_semaphores) @@ -725,37 +738,42 @@ impl crate::Device for VkDevice { Ok(()) } - unsafe fn read_buffer( + unsafe fn read_buffer( &self, - buffer: &Buffer, - result: &mut Vec, + buffer: &Self::Buffer, + dst: *mut u8, + offset: u64, + size: u64, ) -> Result<(), Error> { + let copy_size = size.try_into()?; let device = &self.device.device; let buf = device.map_memory( buffer.buffer_memory, - 0, - buffer.size, + offset, + size, vk::MemoryMapFlags::empty(), )?; - let len = buffer.size as usize / std::mem::size_of::(); - if len > result.len() { - result.reserve(len - result.len()); - } - std::ptr::copy_nonoverlapping(buf as *const T, result.as_mut_ptr(), len); - result.set_len(len); + std::ptr::copy_nonoverlapping(buf as *const u8, dst, copy_size); device.unmap_memory(buffer.buffer_memory); Ok(()) } - unsafe fn write_buffer(&self, buffer: &Buffer, contents: &[T]) -> Result<(), Error> { + unsafe fn write_buffer( + &self, + buffer: &Buffer, + contents: *const u8, + offset: u64, + size: u64, + ) -> Result<(), Error> { + let copy_size = size.try_into()?; let device = &self.device.device; let buf = device.map_memory( buffer.buffer_memory, - 0, - std::mem::size_of_val(contents) as u64, + offset, + size, vk::MemoryMapFlags::empty(), )?; - std::ptr::copy_nonoverlapping(contents.as_ptr(), buf as *mut T, contents.len()); + std::ptr::copy_nonoverlapping(contents, buf as *mut u8, copy_size); device.unmap_memory(buffer.buffer_memory); Ok(()) } diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 0f7125a..3c87f68 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -189,16 +189,18 @@ pub fn dump_k1_data(k1_buf: &[u32]) { pub struct Renderer { pub image_dev: hub::Image, // resulting image - scene_buf_host: hub::Buffer, - scene_buf_dev: hub::Buffer, + // The reference is held by the pipelines. We will be changing + // this to make the scene upload dynamic. + #[allow(dead_code)] + scene_buf: hub::Buffer, memory_buf_host: hub::Buffer, memory_buf_dev: hub::Buffer, state_buf: hub::Buffer, - config_buf_host: hub::Buffer, - config_buf_dev: hub::Buffer, + #[allow(dead_code)] + config_buf: hub::Buffer, el_pipeline: hub::Pipeline, el_ds: hub::DescriptorSet, @@ -246,21 +248,11 @@ impl Renderer { n_elements, n_paths, n_pathseg, n_trans ); - let mut scene_buf_host = session - .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host_upload) - .unwrap(); - let scene_buf_dev = session - .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev) - .unwrap(); - scene_buf_host.write(&scene)?; + let scene_buf = session.create_buffer_init(&scene[..], dev).unwrap(); let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?; let image_dev = session.create_image2d(WIDTH as u32, HEIGHT as u32)?; - const CONFIG_SIZE: u64 = 10 * 4; // Size of Config in setup.h. - let mut config_buf_host = session.create_buffer(CONFIG_SIZE, host_upload)?; - let config_buf_dev = session.create_buffer(CONFIG_SIZE, dev)?; - // TODO: constants const PATH_SIZE: usize = 12; const BIN_SIZE: usize = 8; @@ -280,7 +272,7 @@ impl Renderer { alloc += (n_paths * ANNO_SIZE + 3) & !3; let trans_base = alloc; alloc += (n_trans * TRANS_SIZE + 3) & !3; - config_buf_host.write(&[ + let config = &[ n_paths as u32, n_pathseg as u32, WIDTH_IN_TILES as u32, @@ -291,8 +283,11 @@ impl Renderer { pathseg_base as u32, anno_base as u32, trans_base as u32, - ])?; + ]; + let config_buf = session.create_buffer_init(&config[..], dev).unwrap(); + // Perhaps we could avoid the explicit staging buffer by having buffer creation method + // that takes both initial contents and a size. let mut memory_buf_host = session.create_buffer(2 * 4, host_upload)?; let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?; memory_buf_host.write(&[alloc as u32, 0 /* Overflow flag */])?; @@ -301,36 +296,34 @@ impl Renderer { let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?; let el_ds = session.create_simple_descriptor_set( &el_pipeline, - &[&memory_buf_dev, &config_buf_dev, &scene_buf_dev, &state_buf], + &[&memory_buf_dev, &config_buf, &scene_buf, &state_buf], )?; let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv"); let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?; let tile_ds = session - .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?; let path_alloc_code = include_bytes!("../shader/path_coarse.spv"); let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?; let path_ds = session - .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?; let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv"); let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?; - let backdrop_ds = session.create_simple_descriptor_set( - &backdrop_pipeline, - &[&memory_buf_dev, &config_buf_dev], - )?; + let backdrop_ds = session + .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?; // TODO: constants let bin_code = include_bytes!("../shader/binning.spv"); let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?; - let bin_ds = session - .create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + let bin_ds = + session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?; let coarse_code = include_bytes!("../shader/coarse.spv"); let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?; let coarse_ds = session - .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf])?; let bg_image = Self::make_test_bg_image(&session); @@ -358,19 +351,17 @@ impl Renderer { .create_compute_pipeline(&session, k4_code)?; let k4_ds = session .descriptor_set_builder() - .add_buffers(&[&memory_buf_dev, &config_buf_dev]) + .add_buffers(&[&memory_buf_dev, &config_buf]) .add_images(&[&image_dev]) .add_textures(&[&bg_image]) .build(&session, &k4_pipeline)?; Ok(Renderer { - scene_buf_host, - scene_buf_dev, + scene_buf, memory_buf_host, memory_buf_dev, state_buf, - config_buf_host, - config_buf_dev, + config_buf, image_dev, el_pipeline, el_ds, @@ -394,14 +385,6 @@ impl Renderer { } pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) { - cmd_buf.copy_buffer( - self.scene_buf_host.vk_buffer(), - self.scene_buf_dev.vk_buffer(), - ); - cmd_buf.copy_buffer( - self.config_buf_host.vk_buffer(), - self.config_buf_dev.vk_buffer(), - ); cmd_buf.copy_buffer( self.memory_buf_host.vk_buffer(), self.memory_buf_dev.vk_buffer(),