diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs index ddf6769..bde4084 100644 --- a/piet-gpu-hal/examples/collatz.rs +++ b/piet-gpu-hal/examples/collatz.rs @@ -7,12 +7,9 @@ fn main() { unsafe { let device = instance.device(None).unwrap(); let session = hub::Session::new(device); - let usage = BufferUsage::MAP_READ | BufferUsage::MAP_WRITE | BufferUsage::STORAGE; + let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE; let src = (0..256).map(|x| x + 1).collect::>(); - let mut buffer = session - .create_buffer(std::mem::size_of_val(&src[..]) as u64, usage) - .unwrap(); - buffer.write(&src).unwrap(); + let buffer = session.create_buffer_init(&src, usage).unwrap(); let code = include_bytes!("./shader/collatz.spv"); let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap(); let descriptor_set = session diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index d98108c..554a024 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -5,9 +5,11 @@ //! negotiation of backends (Vulkan, DX12), but right now it's Vulkan-only. use std::any::Any; +use std::convert::TryInto; use std::sync::{Arc, Mutex, Weak}; use crate::vulkan; +use crate::CmdBuf as CmdBufTrait; use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait; use crate::PipelineBuilder as PipelineBuilderTrait; use crate::{BufferUsage, Device, Error, GpuInfo, SamplerParams}; @@ -31,6 +33,8 @@ struct SessionInner { cmd_buf_pool: Mutex>, /// Command buffers that are still pending (so resources can't be freed). pending: Mutex>, + /// A command buffer that is used for copying from staging buffers. + staging_cmd_buf: Mutex>, gpu_info: GpuInfo, } @@ -45,9 +49,12 @@ pub struct CmdBuf { pub struct SubmittedCmdBuf(Option, Weak); struct SubmittedCmdBufInner { + // It's inconsistent, cmd_buf is unpacked, staging_cmd_buf isn't. Probably + // better to chose one or the other. cmd_buf: vulkan::CmdBuf, fence: Fence, resources: Vec>, + staging_cmd_buf: Option, } #[derive(Clone)] @@ -70,6 +77,20 @@ pub struct PipelineBuilder(vulkan::PipelineBuilder); pub struct DescriptorSetBuilder(vulkan::DescriptorSetBuilder); +/// Data types that can be stored in a GPU buffer. +pub unsafe trait PlainData {} + +unsafe impl PlainData for u8 {} +unsafe impl PlainData for u16 {} +unsafe impl PlainData for u32 {} +unsafe impl PlainData for u64 {} +unsafe impl PlainData for i8 {} +unsafe impl PlainData for i16 {} +unsafe impl PlainData for i32 {} +unsafe impl PlainData for i64 {} +unsafe impl PlainData for f32 {} +unsafe impl PlainData for f64 {} + impl Session { pub fn new(device: vulkan::VkDevice) -> Session { let gpu_info = device.query_gpu_info(); @@ -78,6 +99,7 @@ impl Session { gpu_info, cmd_buf_pool: Default::default(), pending: Default::default(), + staging_cmd_buf: Default::default(), })) } @@ -107,12 +129,13 @@ impl Session { let item = pending.swap_remove(i); // TODO: wait is superfluous, can just reset let _ = self.0.device.wait_and_reset(&[item.fence]); - self.0 - .cmd_buf_pool - .lock() - .unwrap() - .push((item.cmd_buf, item.fence)); + let mut pool = self.0.cmd_buf_pool.lock().unwrap(); + pool.push((item.cmd_buf, item.fence)); std::mem::drop(item.resources); + if let Some(staging_cmd_buf) = item.staging_cmd_buf { + pool.push((staging_cmd_buf.cmd_buf, staging_cmd_buf.fence)); + std::mem::drop(staging_cmd_buf.resources); + } } else { i += 1; } @@ -126,8 +149,19 @@ impl Session { wait_semaphores: &[Semaphore], signal_semaphores: &[Semaphore], ) -> Result { - self.0.device.run_cmd_buf( - &cmd_buf.cmd_buf, + // Again, SmallVec here? + let mut cmd_bufs = Vec::with_capacity(2); + let mut staging_cmd_buf = self.0.staging_cmd_buf.lock().unwrap().take(); + if let Some(staging) = &mut staging_cmd_buf { + // With finer grained resource tracking, we might be able to avoid this in + // some cases. + staging.memory_barrier(); + staging.finish(); + cmd_bufs.push(&staging.cmd_buf); + } + cmd_bufs.push(&cmd_buf.cmd_buf); + self.0.device.run_cmd_bufs( + &cmd_bufs, wait_semaphores, signal_semaphores, Some(&cmd_buf.fence), @@ -137,6 +171,7 @@ impl Session { cmd_buf: cmd_buf.cmd_buf, fence: cmd_buf.fence, resources: cmd_buf.resources, + staging_cmd_buf, }), cmd_buf.session, )) @@ -150,11 +185,58 @@ impl Session { }))) } - pub unsafe fn create_image2d( + /// Create a buffer with initialized data. + pub fn create_buffer_init( &self, - width: u32, - height: u32, - ) -> Result { + contents: &[impl PlainData], + usage: BufferUsage, + ) -> Result { + unsafe { + self.create_buffer_init_raw( + contents.as_ptr() as *const u8, + std::mem::size_of_val(contents).try_into()?, + usage, + ) + } + } + + /// Create a buffer with initialized data, from a raw pointer memory region. + pub unsafe fn create_buffer_init_raw( + &self, + contents: *const u8, + size: u64, + usage: BufferUsage, + ) -> Result { + let use_staging_buffer = !usage.intersects(BufferUsage::MAP_READ | BufferUsage::MAP_WRITE) + && self.gpu_info().use_staging_buffers; + let create_usage = if use_staging_buffer { + BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC + } else { + usage | BufferUsage::MAP_WRITE + }; + let create_buf = self.create_buffer(size, create_usage)?; + self.0 + .device + .write_buffer(&create_buf.vk_buffer(), contents, 0, size)?; + if use_staging_buffer { + let buf = self.create_buffer(size, usage | BufferUsage::COPY_DST)?; + let mut staging_cmd_buf = self.0.staging_cmd_buf.lock().unwrap(); + if staging_cmd_buf.is_none() { + let mut cmd_buf = self.cmd_buf()?; + cmd_buf.begin(); + *staging_cmd_buf = Some(cmd_buf); + } + let staging_cmd_buf = staging_cmd_buf.as_mut().unwrap(); + // This will ensure the staging buffer is deallocated + staging_cmd_buf.add_resource(&create_buf); + staging_cmd_buf.copy_buffer(create_buf.vk_buffer(), buf.vk_buffer()); + Ok(buf) + } else { + Ok(create_buf) + } + } + + pub unsafe fn create_image2d(&self, width: u32, height: u32) -> Result { let image = self.0.device.create_image2d(width, height)?; Ok(Image(Arc::new(ImageInner { image, @@ -305,16 +387,29 @@ impl Buffer { &self.0.buffer } - pub unsafe fn write(&mut self, contents: &[T]) -> Result<(), Error> { + pub unsafe fn write(&mut self, contents: &[T]) -> Result<(), Error> { if let Some(session) = Weak::upgrade(&self.0.session) { - session.device.write_buffer(&self.0.buffer, contents)?; + session.device.write_buffer( + &self.0.buffer, + contents.as_ptr() as *const u8, + 0, + std::mem::size_of_val(contents).try_into()?, + )?; } // else session lost error? Ok(()) } - pub unsafe fn read(&self, result: &mut Vec) -> Result<(), Error> { + pub unsafe fn read(&self, result: &mut Vec) -> Result<(), Error> { + let size = self.vk_buffer().size; + let len = size as usize / std::mem::size_of::(); + if len > result.len() { + result.reserve(len - result.len()); + } if let Some(session) = Weak::upgrade(&self.0.session) { - session.device.read_buffer(&self.0.buffer, result)?; + session + .device + .read_buffer(&self.0.buffer, result.as_mut_ptr() as *mut u8, 0, size)?; + result.set_len(len); } // else session lost error? Ok(()) diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index ce7c6d7..323f5e5 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -2,7 +2,6 @@ /// /// This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu. /// In time, it may go away and be replaced by either gfx-hal or wgpu. - use bitflags::bitflags; pub mod hub; @@ -66,6 +65,8 @@ pub struct GpuInfo { pub subgroup_size: Option, /// The GPU supports a real, grown-ass memory model. pub has_memory_model: bool, + /// Whether staging buffers should be used. + pub use_staging_buffers: bool, } #[derive(Clone, Debug)] @@ -103,11 +104,7 @@ pub trait Device: Sized { /// Maybe doesn't need result return? unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>; - unsafe fn create_image2d( - &self, - width: u32, - height: u32, - ) -> Result; + unsafe fn create_image2d(&self, width: u32, height: u32) -> Result; /// Destroy an image. /// @@ -175,6 +172,7 @@ pub trait Device: Sized { /// All submitted commands that refer to this query pool must have completed. unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error>; + #[deprecated] unsafe fn run_cmd_buf( &self, cmd_buf: &Self::CmdBuf, @@ -183,16 +181,44 @@ pub trait Device: Sized { fence: Option<&Self::Fence>, ) -> Result<(), Error>; - unsafe fn read_buffer( + unsafe fn run_cmd_bufs( &self, - buffer: &Self::Buffer, - result: &mut Vec, + cmd_buf: &[&Self::CmdBuf], + wait_semaphores: &[Self::Semaphore], + signal_semaphores: &[Self::Semaphore], + fence: Option<&Self::Fence>, ) -> Result<(), Error>; - unsafe fn write_buffer( + /// Copy data from the buffer to memory. + /// + /// Discussion question: add offset? + /// + /// # Safety + /// + /// The buffer must be valid to access. The destination memory must be valid to + /// write to. The ranges must not overlap. The offset + size must be within + /// the buffer's allocation, and size within the destination. + unsafe fn read_buffer( &self, buffer: &Self::Buffer, - contents: &[T], + dst: *mut u8, + offset: u64, + size: u64, + ) -> Result<(), Error>; + + /// Copy data from memory to the buffer. + /// + /// # Safety + /// + /// The buffer must be valid to access. The source memory must be valid to + /// read from. The ranges must not overlap. The offset + size must be within + /// the buffer's allocation, and size within the source. + unsafe fn write_buffer( + &self, + buffer: &Self::Buffer, + contents: *const u8, + offset: u64, + size: u64, ) -> Result<(), Error>; unsafe fn create_semaphore(&self) -> Result; diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 8d2700f..959e330 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -1,6 +1,7 @@ //! Vulkan implemenation of HAL trait. use std::borrow::Cow; +use std::convert::TryInto; use std::ffi::{CStr, CString}; use std::os::raw::c_char; use std::sync::Arc; @@ -9,7 +10,9 @@ use ash::extensions::{ext::DebugUtils, khr}; use ash::version::{DeviceV1_0, EntryV1_0, InstanceV1_0, InstanceV1_1}; use ash::{vk, Device, Entry, Instance}; -use crate::{BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize}; +use crate::{ + BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, +}; pub struct VkInstance { /// Retain the dynamic lib. @@ -60,7 +63,8 @@ pub struct VkSwapchain { pub struct Buffer { buffer: vk::Buffer, buffer_memory: vk::DeviceMemory, - size: u64, + // TODO: there should probably be a Buffer trait and this should be a method. + pub size: u64, } pub struct Image { @@ -339,6 +343,14 @@ impl VkInstance { None }; + // The question of when and when not to use staging buffers is complex, and this + // is only a first approximation. Basically, it *must* be false when buffers can + // be created with a memory type that is not host-visible. That is not guaranteed + // here but is likely to be the case. + // + // I'm still investigating what should be done in systems with Resizable BAR. + let use_staging_buffers = props.device_type != vk::PhysicalDeviceType::INTEGRATED_GPU; + // TODO: finer grained query of specific subgroup info. let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0); let gpu_info = GpuInfo { @@ -346,6 +358,7 @@ impl VkInstance { has_subgroups, subgroup_size, has_memory_model, + use_staging_buffers, }; Ok(VkDevice { @@ -511,11 +524,7 @@ impl crate::Device for VkDevice { Ok(()) } - unsafe fn create_image2d( - &self, - width: u32, - height: u32, - ) -> Result { + unsafe fn create_image2d(&self, width: u32, height: u32) -> Result { let device = &self.device.device; let extent = vk::Extent3D { width, @@ -726,37 +735,78 @@ impl crate::Device for VkDevice { Ok(()) } - unsafe fn read_buffer( + /// Run the command buffers. + /// + /// This submits the command buffer for execution. The provided fence + /// is signalled when the execution is complete. + unsafe fn run_cmd_bufs( &self, - buffer: &Buffer, - result: &mut Vec, + cmd_bufs: &[&CmdBuf], + wait_semaphores: &[Self::Semaphore], + signal_semaphores: &[Self::Semaphore], + fence: Option<&Self::Fence>, ) -> Result<(), Error> { let device = &self.device.device; + + let fence = match fence { + Some(fence) => *fence, + None => vk::Fence::null(), + }; + let wait_stages = wait_semaphores + .iter() + .map(|_| vk::PipelineStageFlags::ALL_COMMANDS) + .collect::>(); + // Use SmallVec or similar here to reduce allocation? + let cmd_bufs = cmd_bufs.iter().map(|c| c.cmd_buf).collect::>(); + device.queue_submit( + self.queue, + &[vk::SubmitInfo::builder() + .command_buffers(&cmd_bufs) + .wait_semaphores(wait_semaphores) + .wait_dst_stage_mask(&wait_stages) + .signal_semaphores(signal_semaphores) + .build()], + fence, + )?; + Ok(()) + } + + unsafe fn read_buffer( + &self, + buffer: &Self::Buffer, + dst: *mut u8, + offset: u64, + size: u64, + ) -> Result<(), Error> { + let copy_size = size.try_into()?; + let device = &self.device.device; let buf = device.map_memory( buffer.buffer_memory, - 0, - buffer.size, + offset, + size, vk::MemoryMapFlags::empty(), )?; - let len = buffer.size as usize / std::mem::size_of::(); - if len > result.len() { - result.reserve(len - result.len()); - } - std::ptr::copy_nonoverlapping(buf as *const T, result.as_mut_ptr(), len); - result.set_len(len); + std::ptr::copy_nonoverlapping(buf as *const u8, dst, copy_size); device.unmap_memory(buffer.buffer_memory); Ok(()) } - unsafe fn write_buffer(&self, buffer: &Buffer, contents: &[T]) -> Result<(), Error> { + unsafe fn write_buffer( + &self, + buffer: &Buffer, + contents: *const u8, + offset: u64, + size: u64, + ) -> Result<(), Error> { + let copy_size = size.try_into()?; let device = &self.device.device; let buf = device.map_memory( buffer.buffer_memory, - 0, - std::mem::size_of_val(contents) as u64, + offset, + size, vk::MemoryMapFlags::empty(), )?; - std::ptr::copy_nonoverlapping(contents.as_ptr(), buf as *mut T, contents.len()); + std::ptr::copy_nonoverlapping(contents, buf as *mut u8, copy_size); device.unmap_memory(buffer.buffer_memory); Ok(()) } diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 53201f8..69e0fc4 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -254,8 +254,7 @@ fn main() -> Result<(), Error> { let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?; let image_usage = BufferUsage::MAP_READ | BufferUsage::COPY_DST; - let image_buf = - session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?; + let image_buf = session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?; cmd_buf.begin(); renderer.record(&mut cmd_buf, &query_pool); diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 26b3c44..3c87f68 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -189,16 +189,18 @@ pub fn dump_k1_data(k1_buf: &[u32]) { pub struct Renderer { pub image_dev: hub::Image, // resulting image - scene_buf_host: hub::Buffer, - scene_buf_dev: hub::Buffer, + // The reference is held by the pipelines. We will be changing + // this to make the scene upload dynamic. + #[allow(dead_code)] + scene_buf: hub::Buffer, memory_buf_host: hub::Buffer, memory_buf_dev: hub::Buffer, state_buf: hub::Buffer, - config_buf_host: hub::Buffer, - config_buf_dev: hub::Buffer, + #[allow(dead_code)] + config_buf: hub::Buffer, el_pipeline: hub::Pipeline, el_ds: hub::DescriptorSet, @@ -246,21 +248,11 @@ impl Renderer { n_elements, n_paths, n_pathseg, n_trans ); - let mut scene_buf_host = session - .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host_upload) - .unwrap(); - let scene_buf_dev = session - .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev) - .unwrap(); - scene_buf_host.write(&scene)?; + let scene_buf = session.create_buffer_init(&scene[..], dev).unwrap(); let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?; let image_dev = session.create_image2d(WIDTH as u32, HEIGHT as u32)?; - const CONFIG_SIZE: u64 = 10 * 4; // Size of Config in setup.h. - let mut config_buf_host = session.create_buffer(CONFIG_SIZE, host_upload)?; - let config_buf_dev = session.create_buffer(CONFIG_SIZE, dev)?; - // TODO: constants const PATH_SIZE: usize = 12; const BIN_SIZE: usize = 8; @@ -280,7 +272,7 @@ impl Renderer { alloc += (n_paths * ANNO_SIZE + 3) & !3; let trans_base = alloc; alloc += (n_trans * TRANS_SIZE + 3) & !3; - config_buf_host.write(&[ + let config = &[ n_paths as u32, n_pathseg as u32, WIDTH_IN_TILES as u32, @@ -291,8 +283,11 @@ impl Renderer { pathseg_base as u32, anno_base as u32, trans_base as u32, - ])?; + ]; + let config_buf = session.create_buffer_init(&config[..], dev).unwrap(); + // Perhaps we could avoid the explicit staging buffer by having buffer creation method + // that takes both initial contents and a size. let mut memory_buf_host = session.create_buffer(2 * 4, host_upload)?; let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?; memory_buf_host.write(&[alloc as u32, 0 /* Overflow flag */])?; @@ -301,36 +296,34 @@ impl Renderer { let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?; let el_ds = session.create_simple_descriptor_set( &el_pipeline, - &[&memory_buf_dev, &config_buf_dev, &scene_buf_dev, &state_buf], + &[&memory_buf_dev, &config_buf, &scene_buf, &state_buf], )?; let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv"); let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?; let tile_ds = session - .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?; let path_alloc_code = include_bytes!("../shader/path_coarse.spv"); let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?; let path_ds = session - .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?; let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv"); let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?; - let backdrop_ds = session.create_simple_descriptor_set( - &backdrop_pipeline, - &[&memory_buf_dev, &config_buf_dev], - )?; + let backdrop_ds = session + .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?; // TODO: constants let bin_code = include_bytes!("../shader/binning.spv"); let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?; - let bin_ds = session - .create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + let bin_ds = + session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?; let coarse_code = include_bytes!("../shader/coarse.spv"); let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?; let coarse_ds = session - .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf_dev])?; + .create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf])?; let bg_image = Self::make_test_bg_image(&session); @@ -358,19 +351,17 @@ impl Renderer { .create_compute_pipeline(&session, k4_code)?; let k4_ds = session .descriptor_set_builder() - .add_buffers(&[&memory_buf_dev, &config_buf_dev]) + .add_buffers(&[&memory_buf_dev, &config_buf]) .add_images(&[&image_dev]) .add_textures(&[&bg_image]) .build(&session, &k4_pipeline)?; Ok(Renderer { - scene_buf_host, - scene_buf_dev, + scene_buf, memory_buf_host, memory_buf_dev, state_buf, - config_buf_host, - config_buf_dev, + config_buf, image_dev, el_pipeline, el_ds, @@ -394,14 +385,6 @@ impl Renderer { } pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) { - cmd_buf.copy_buffer( - self.scene_buf_host.vk_buffer(), - self.scene_buf_dev.vk_buffer(), - ); - cmd_buf.copy_buffer( - self.config_buf_host.vk_buffer(), - self.config_buf_dev.vk_buffer(), - ); cmd_buf.copy_buffer( self.memory_buf_host.vk_buffer(), self.memory_buf_dev.vk_buffer(), @@ -488,8 +471,7 @@ impl Renderer { let host_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC; let mut buffer = session.create_buffer(buf.len() as u64, host_upload)?; buffer.write(buf)?; - let image = - session.create_image2d(width.try_into()?, height.try_into()?)?; + let image = session.create_image2d(width.try_into()?, height.try_into()?)?; let mut cmd_buf = session.cmd_buf()?; cmd_buf.begin(); cmd_buf.image_barrier(