diff --git a/piet-gpu-hal/src/bestfit.rs b/piet-gpu-hal/src/bestfit.rs new file mode 100644 index 0000000..12bb041 --- /dev/null +++ b/piet-gpu-hal/src/bestfit.rs @@ -0,0 +1,81 @@ +// Copyright © 2021 piet-gpu developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those + +//! A simple best-fit allocator. + +use std::collections::{BTreeMap, BTreeSet}; + +/// An allocator that tracks free ranges and returns best fit. +pub struct BestFit { + // map offset to size of free block + free_by_ix: BTreeMap, + // size and offset + free_by_size: BTreeSet<(u32, u32)>, +} + +impl BestFit { + pub fn new(size: u32) -> BestFit { + let mut free_by_ix = BTreeMap::new(); + free_by_ix.insert(0, size); + let mut free_by_size = BTreeSet::new(); + free_by_size.insert((size, 0)); + BestFit { + free_by_ix, + free_by_size, + } + } + + pub fn alloc(&mut self, size: u32) -> Option { + let block = *self.free_by_size.range((size, 0)..).next()?; + let ix = block.1; + self.free_by_ix.remove(&ix); + self.free_by_size.remove(&block); + let fragment_size = block.0 - size; + if fragment_size > 0 { + let fragment_ix = ix + size; + self.free_by_ix.insert(fragment_ix, fragment_size); + self.free_by_size.insert((fragment_size, fragment_ix)); + } + Some(ix) + } + + pub fn free(&mut self, ix: u32, size: u32) { + let next_ix = size + ix; + if let Some((&prev_ix, &prev_size)) = self.free_by_ix.range(..ix).rev().next() { + if prev_ix + prev_size == ix { + self.free_by_size.remove(&(prev_size, prev_ix)); + if let Some(&next_size) = self.free_by_ix.get(&next_ix) { + // consolidate with prev and next + let new_size = prev_size + size + next_size; + *self.free_by_ix.get_mut(&prev_ix).unwrap() = new_size; + self.free_by_ix.remove(&next_ix); + self.free_by_size.remove(&(next_size, next_ix)); + self.free_by_size.insert((new_size, prev_ix)); + } else { + // consolidate with prev + let new_size = prev_size + size; + *self.free_by_ix.get_mut(&prev_ix).unwrap() = new_size; + self.free_by_size.insert((new_size, prev_ix)); + } + return; + } + } + if let Some(&next_size) = self.free_by_ix.get(&next_ix) { + // consolidate with next + let new_size = size + next_size; + self.free_by_ix.remove(&next_ix); + self.free_by_ix.insert(ix, new_size); + self.free_by_size.remove(&(next_size, next_ix)); + self.free_by_size.insert((new_size, ix)); + } else { + // new isolated free block + self.free_by_ix.insert(ix, size); + self.free_by_size.insert((size, ix)); + } + } +} diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs index 66befa5..5172cc3 100644 --- a/piet-gpu-hal/src/dx12.rs +++ b/piet-gpu-hal/src/dx12.rs @@ -1,14 +1,20 @@ //! DX12 implemenation of HAL trait. +mod descriptor; mod error; mod wrappers; -use std::{cell::Cell, convert::{TryFrom, TryInto}, mem, ptr}; +use std::{ + cell::Cell, + convert::{TryFrom, TryInto}, + mem, ptr, + sync::{Arc, Mutex}, +}; -use winapi::shared::minwindef::TRUE; -use winapi::shared::{dxgi, dxgi1_2, dxgitype}; #[allow(unused)] use winapi::shared::dxgi1_3; // for error reporting in debug mode +use winapi::shared::minwindef::TRUE; +use winapi::shared::{dxgi, dxgi1_2, dxgitype}; use winapi::um::d3d12; use raw_window_handle::{HasRawWindowHandle, RawWindowHandle}; @@ -17,7 +23,12 @@ use smallvec::SmallVec; use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits}; -use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode}; +use self::{ + descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned}, + wrappers::{ + CommandAllocator, CommandQueue, DescriptorHeap, Device, Factory4, Resource, ShaderByteCode, + }, +}; pub struct Dx12Instance { factory: Factory4, @@ -38,17 +49,26 @@ pub struct Dx12Device { ts_freq: u64, gpu_info: GpuInfo, memory_arch: MemoryArchitecture, + descriptor_pool: Mutex, } #[derive(Clone)] pub struct Buffer { resource: Resource, pub size: u64, + // Always present except for query readback buffer. + cpu_ref: Option>, + // Present when created with CLEAR usage. Heap is here for + // the same reason it's in DescriptorSet, and might be removed + // when CmdBuf has access to the descriptor pool. + gpu_ref: Option<(Arc, DescriptorHeap)>, } #[derive(Clone)] pub struct Image { resource: Resource, + // Present except for swapchain images. + cpu_ref: Option>, size: (u32, u32), } @@ -63,13 +83,17 @@ pub struct Pipeline { root_signature: wrappers::RootSignature, } -// Right now, each descriptor set gets its own heap, but we'll move -// to a more sophisticated allocation scheme, probably using the -// gpu-descriptor crate. -pub struct DescriptorSet(wrappers::DescriptorHeap); +pub struct DescriptorSet { + gpu_ref: GpuHeapRefOwned, + // Note: the heap is only needed here so CmdBuf::dispatch can get + // use it easily. If CmdBuf had a reference to the Device (or just + // the descriptor pool), we could get rid of this. + heap: DescriptorHeap, +} pub struct QueryPool { heap: wrappers::QueryHeap, + // Maybe this should just be a Resource, not a full Buffer. buf: Buffer, n_queries: u32, } @@ -85,11 +109,9 @@ pub struct Fence { /// semaphore is needed for presentation on DX12. pub struct Semaphore; -// TODO #[derive(Default)] pub struct DescriptorSetBuilder { - buffers: Vec, - images: Vec, + handles: SmallVec<[d3d12::D3D12_CPU_DESCRIPTOR_HANDLE; 16]>, } #[derive(PartialEq, Eq)] @@ -175,12 +197,14 @@ impl Dx12Instance { has_memory_model: false, use_staging_buffers, }; + let descriptor_pool = Default::default(); Ok(Dx12Device { device, command_queue, ts_freq, memory_arch, gpu_info, + descriptor_pool, }) } } @@ -251,14 +275,44 @@ impl crate::backend::Device for Dx12Device { //TODO: consider flag D3D12_HEAP_FLAG_ALLOW_SHADER_ATOMICS? let flags = d3d12::D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; let resource = self.device.create_buffer( - size.try_into()?, + size, d3d12::D3D12_HEAP_TYPE_CUSTOM, page_property, memory_pool, d3d12::D3D12_RESOURCE_STATE_COMMON, flags, )?; - Ok(Buffer { resource, size }) + let mut descriptor_pool = self.descriptor_pool.lock().unwrap(); + let cpu_ref = Arc::new(descriptor_pool.alloc_cpu(&self.device)?); + let cpu_handle = descriptor_pool.cpu_handle(&cpu_ref); + self.device + .create_byte_addressed_buffer_unordered_access_view( + &resource, + cpu_handle, + 0, + (size / 4).try_into()?, + ); + let gpu_ref = if usage.contains(BufferUsage::CLEAR) { + let gpu_ref = Arc::new(descriptor_pool.alloc_gpu(&self.device, 1)?); + let gpu_handle = descriptor_pool.cpu_handle_of_gpu(&gpu_ref, 0); + self.device.copy_descriptors( + &[gpu_handle], + &[1], + &[cpu_handle], + &[1], + d3d12::D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, + ); + let heap = descriptor_pool.gpu_heap(&gpu_ref).to_owned(); + Some((gpu_ref, heap)) + } else { + None + }; + Ok(Buffer { + resource, + size, + cpu_ref: Some(cpu_ref), + gpu_ref, + }) } } @@ -272,8 +326,18 @@ impl crate::backend::Device for Dx12Device { let resource = self .device .create_texture2d_buffer(width.into(), height, format, true)?; + + let mut descriptor_pool = self.descriptor_pool.lock().unwrap(); + let cpu_ref = Arc::new(descriptor_pool.alloc_cpu(&self.device)?); + let cpu_handle = descriptor_pool.cpu_handle(&cpu_ref); + self.device + .create_unordered_access_view(&resource, cpu_handle); let size = (width, height); - Ok(Image { resource, size }) + Ok(Image { + resource, + cpu_ref: Some(cpu_ref), + size, + }) } unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error> { @@ -424,7 +488,9 @@ impl crate::backend::Device for Dx12Device { let mut i = 0; fn map_range_type(bind_type: BindType) -> d3d12::D3D12_DESCRIPTOR_RANGE_TYPE { match bind_type { - BindType::Buffer | BindType::Image | BindType::ImageRead => d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_UAV, + BindType::Buffer | BindType::Image | BindType::ImageRead => { + d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_UAV + } BindType::BufReadOnly => d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_SRV, } } @@ -482,9 +548,7 @@ impl crate::backend::Device for Dx12Device { &root_signature_desc, d3d12::D3D_ROOT_SIGNATURE_VERSION_1, )?; - let root_signature = self - .device - .create_root_signature(0, root_signature_blob)?; + let root_signature = self.device.create_root_signature(0, root_signature_blob)?; let desc = d3d12::D3D12_COMPUTE_PIPELINE_STATE_DESC { pRootSignature: root_signature.0.as_raw(), CS: shader.bytecode, @@ -515,14 +579,21 @@ impl Dx12Device { fn create_readback_buffer(&self, size: u64) -> Result { unsafe { let resource = self.device.create_buffer( - size.try_into()?, + size, d3d12::D3D12_HEAP_TYPE_READBACK, d3d12::D3D12_CPU_PAGE_PROPERTY_UNKNOWN, d3d12::D3D12_MEMORY_POOL_UNKNOWN, d3d12::D3D12_RESOURCE_STATE_COPY_DEST, d3d12::D3D12_RESOURCE_FLAG_NONE, )?; - Ok(Buffer { resource, size }) + let cpu_ref = None; + let gpu_ref = None; + Ok(Buffer { + resource, + size, + cpu_ref, + gpu_ref, + }) } } } @@ -551,11 +622,10 @@ impl crate::backend::CmdBuf for CmdBuf { self.c.set_pipeline_state(&pipeline.pipeline_state); self.c .set_compute_pipeline_root_signature(&pipeline.root_signature); - self.c.set_descriptor_heaps(&[&descriptor_set.0]); - self.c.set_compute_root_descriptor_table( - 0, - descriptor_set.0.get_gpu_descriptor_handle_at_offset(0), - ); + // TODO: persist heap ix and only set if changed. + self.c.set_descriptor_heaps(&[&descriptor_set.heap]); + self.c + .set_compute_root_descriptor_table(0, descriptor_set.gpu_ref.gpu_handle()); self.c .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2); } @@ -598,10 +668,22 @@ impl crate::backend::CmdBuf for CmdBuf { self.memory_barrier(); } - unsafe fn clear_buffer(&self, _buffer: &Buffer, _size: Option) { - // Open question: do we call ClearUnorderedAccessViewUint or dispatch a - // compute shader? Either way we will need descriptors here. - todo!() + unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option) { + let cpu_ref = buffer.cpu_ref.as_ref().unwrap(); + let (gpu_ref, heap) = buffer + .gpu_ref + .as_ref() + .expect("Need to set CLEAR usage on buffer"); + // Same TODO as dispatch: track and only set if changed. + self.c.set_descriptor_heaps(&[heap]); + // Discussion question: would compute shader be faster? Should measure. + self.c.clear_uav( + gpu_ref.gpu_handle(), + cpu_ref.handle(), + &buffer.resource, + 0, + size, + ); } unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { @@ -638,14 +720,15 @@ impl crate::backend::CmdBuf for CmdBuf { impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { fn add_buffers(&mut self, buffers: &[&Buffer]) { - // Note: we could get rid of the clone here (which is an AddRef) - // and store a raw pointer, as it's a safety precondition that - // the resources are kept alive til build. - self.buffers.extend(buffers.iter().copied().cloned()); + for buf in buffers { + self.handles.push(buf.cpu_ref.as_ref().unwrap().handle()); + } } fn add_images(&mut self, images: &[&Image]) { - self.images.extend(images.iter().copied().cloned()); + for img in images { + self.handles.push(img.cpu_ref.as_ref().unwrap().handle()); + } } fn add_textures(&mut self, _images: &[&Image]) { @@ -657,34 +740,12 @@ impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { device: &Dx12Device, _pipeline: &Pipeline, ) -> Result { - let n_descriptors = self.buffers.len() + self.images.len(); - let heap_desc = d3d12::D3D12_DESCRIPTOR_HEAP_DESC { - Type: d3d12::D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, - NumDescriptors: n_descriptors.try_into()?, - Flags: d3d12::D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, - NodeMask: 0, - }; - let heap = device.device.create_descriptor_heap(&heap_desc)?; - let mut ix = 0; - for buffer in self.buffers { - device - .device - .create_byte_addressed_buffer_unordered_access_view( - &buffer.resource, - heap.get_cpu_descriptor_handle_at_offset(ix), - 0, - (buffer.size / 4).try_into()?, - ); - ix += 1; - } - for image in self.images { - device.device.create_unordered_access_view( - &image.resource, - heap.get_cpu_descriptor_handle_at_offset(ix), - ); - ix += 1; - } - Ok(DescriptorSet(heap)) + let mut descriptor_pool = device.descriptor_pool.lock().unwrap(); + let n_descriptors = self.handles.len().try_into()?; + let gpu_ref = descriptor_pool.alloc_gpu(&device.device, n_descriptors)?; + gpu_ref.copy_descriptors(&device.device, &self.handles); + let heap = descriptor_pool.gpu_heap(&gpu_ref).to_owned(); + Ok(DescriptorSet { gpu_ref, heap }) } } @@ -737,6 +798,7 @@ impl Dx12Swapchain { let buffer = self.swapchain.get_buffer(idx as u32); Image { resource: buffer, + cpu_ref: None, size: self.size, } } diff --git a/piet-gpu-hal/src/dx12/descriptor.rs b/piet-gpu-hal/src/dx12/descriptor.rs new file mode 100644 index 0000000..1be203e --- /dev/null +++ b/piet-gpu-hal/src/dx12/descriptor.rs @@ -0,0 +1,295 @@ +// Copyright © 2021 piet-gpu developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those + +//! Descriptor management. + +use std::{ + convert::TryInto, + ops::Deref, + sync::{Arc, Mutex, Weak}, +}; + +use smallvec::SmallVec; +use winapi::um::d3d12::{ + D3D12_CPU_DESCRIPTOR_HANDLE, D3D12_DESCRIPTOR_HEAP_DESC, D3D12_DESCRIPTOR_HEAP_FLAG_NONE, + D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, + D3D12_GPU_DESCRIPTOR_HANDLE, +}; + +use crate::{bestfit::BestFit, Error}; + +use super::wrappers::{DescriptorHeap, Device}; + +const CPU_CHUNK_SIZE: u32 = 256; +const GPU_CHUNK_SIZE: u32 = 4096; + +#[derive(Default)] +pub struct DescriptorPool { + cpu_visible: Vec, + gpu_visible: Vec, + free_list: Arc>, +} + +#[derive(Default)] +pub struct DescriptorFreeList { + cpu_free: Vec>, + gpu_free: Vec, +} + +struct CpuHeap { + // Retained for lifetime reasons. + #[allow(unused)] + dx12_heap: DescriptorHeap, + cpu_handle: D3D12_CPU_DESCRIPTOR_HANDLE, + increment_size: u32, +} + +pub struct CpuHeapRef { + heap_ix: usize, + offset: u32, +} + +/// An owned reference to the CPU heap. +/// +/// When dropped, the corresponding heap range will be freed. +pub struct CpuHeapRefOwned { + heap_ref: CpuHeapRef, + handle: D3D12_CPU_DESCRIPTOR_HANDLE, + free_list: Weak>, +} + +/// A shader-visible descriptor heap. +struct GpuHeap { + dx12_heap: DescriptorHeap, + cpu_handle: D3D12_CPU_DESCRIPTOR_HANDLE, + gpu_handle: D3D12_GPU_DESCRIPTOR_HANDLE, + increment_size: u32, +} + +pub struct GpuHeapRef { + heap_ix: usize, + offset: u32, + n: u32, +} + +/// An owned reference to the GPU heap. +/// +/// When dropped, the corresponding heap range will be freed. +pub struct GpuHeapRefOwned { + heap_ref: GpuHeapRef, + cpu_handle: D3D12_CPU_DESCRIPTOR_HANDLE, + gpu_handle: D3D12_GPU_DESCRIPTOR_HANDLE, + free_list: Weak>, +} + +impl DescriptorPool { + pub fn alloc_cpu(&mut self, device: &Device) -> Result { + let free_list = &self.free_list; + let mk_owned = |heap_ref, handle| CpuHeapRefOwned { + heap_ref, + handle, + free_list: Arc::downgrade(free_list), + }; + let mut free_list = free_list.lock().unwrap(); + for (heap_ix, free) in free_list.cpu_free.iter_mut().enumerate() { + if let Some(offset) = free.pop() { + let handle = self.cpu_visible[heap_ix].cpu_handle(offset); + return Ok(mk_owned(CpuHeapRef { heap_ix, offset }, handle)); + } + } + unsafe { + let heap_type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + let desc = D3D12_DESCRIPTOR_HEAP_DESC { + Type: heap_type, + NumDescriptors: CPU_CHUNK_SIZE, + Flags: D3D12_DESCRIPTOR_HEAP_FLAG_NONE, + NodeMask: 0, + }; + let dx12_heap = device.create_descriptor_heap(&desc)?; + let mut free = (0..CPU_CHUNK_SIZE).rev().collect::>(); + let offset = free.pop().unwrap(); + debug_assert_eq!(offset, 0); + let heap_ref = CpuHeapRef { + heap_ix: self.cpu_visible.len(), + offset, + }; + let cpu_handle = dx12_heap.get_cpu_descriptor_handle_for_heap_start(); + let increment_size = device.get_descriptor_increment_size(heap_type); + let heap = CpuHeap { + dx12_heap, + cpu_handle, + increment_size, + }; + self.cpu_visible.push(heap); + free_list.cpu_free.push(free); + Ok(mk_owned(heap_ref, cpu_handle)) + } + } + + pub fn cpu_handle(&self, cpu_ref: &CpuHeapRef) -> D3D12_CPU_DESCRIPTOR_HANDLE { + self.cpu_visible[cpu_ref.heap_ix].cpu_handle(cpu_ref.offset) + } + + pub fn alloc_gpu(&mut self, device: &Device, n: u32) -> Result { + let free_list = &self.free_list; + let mk_owned = |heap_ref, cpu_handle, gpu_handle| GpuHeapRefOwned { + heap_ref, + cpu_handle, + gpu_handle, + free_list: Arc::downgrade(free_list), + }; + let mut free_list = free_list.lock().unwrap(); + for (heap_ix, free) in free_list.gpu_free.iter_mut().enumerate() { + if let Some(offset) = free.alloc(n) { + let heap = &self.gpu_visible[heap_ix]; + let cpu_handle = heap.cpu_handle(offset); + let gpu_handle = heap.gpu_handle(offset); + return Ok(mk_owned( + GpuHeapRef { heap_ix, offset, n }, + cpu_handle, + gpu_handle, + )); + } + } + unsafe { + let size = n.max(GPU_CHUNK_SIZE).next_power_of_two(); + let heap_type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + let desc = D3D12_DESCRIPTOR_HEAP_DESC { + Type: heap_type, + NumDescriptors: size, + Flags: D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, + NodeMask: 0, + }; + let dx12_heap = device.create_descriptor_heap(&desc)?; + let heap_ix = self.gpu_visible.len(); + let mut free = BestFit::new(size); + let offset = free.alloc(n).unwrap(); + // We assume the first allocation is at 0, to avoid recomputing offsets. + debug_assert_eq!(offset, 0); + let cpu_handle = dx12_heap.get_cpu_descriptor_handle_for_heap_start(); + let gpu_handle = dx12_heap.get_gpu_descriptor_handle_for_heap_start(); + let increment_size = device.get_descriptor_increment_size(heap_type); + let heap = GpuHeap { + dx12_heap, + cpu_handle, + gpu_handle, + increment_size, + }; + self.gpu_visible.push(heap); + free_list.gpu_free.push(free); + Ok(mk_owned( + GpuHeapRef { heap_ix, offset, n }, + cpu_handle, + gpu_handle, + )) + } + } + + pub fn cpu_handle_of_gpu( + &self, + gpu_ref: &GpuHeapRef, + offset: u32, + ) -> D3D12_CPU_DESCRIPTOR_HANDLE { + debug_assert!(offset < gpu_ref.n); + let dx12_heap = &self.gpu_visible[gpu_ref.heap_ix]; + dx12_heap.cpu_handle(gpu_ref.offset + offset) + } + + pub fn gpu_heap(&self, gpu_ref: &GpuHeapRef) -> &DescriptorHeap { + &self.gpu_visible[gpu_ref.heap_ix].dx12_heap + } +} + +impl DescriptorFreeList { + fn free_cpu(&mut self, cpu_ref: &CpuHeapRef) { + self.cpu_free[cpu_ref.heap_ix].push(cpu_ref.offset); + } + + fn free_gpu(&mut self, gpu_ref: &GpuHeapRef) { + self.gpu_free[gpu_ref.heap_ix].free(gpu_ref.offset, gpu_ref.n); + } +} + +impl Drop for CpuHeapRefOwned { + fn drop(&mut self) { + if let Some(a) = self.free_list.upgrade() { + a.lock().unwrap().free_cpu(&self.heap_ref) + } + } +} + +impl CpuHeapRefOwned { + pub fn handle(&self) -> D3D12_CPU_DESCRIPTOR_HANDLE { + self.handle + } +} + +impl GpuHeapRefOwned { + pub fn gpu_handle(&self) -> D3D12_GPU_DESCRIPTOR_HANDLE { + self.gpu_handle + } + + pub unsafe fn copy_descriptors(&self, device: &Device, src: &[D3D12_CPU_DESCRIPTOR_HANDLE]) { + // TODO: optimize a bit (use simple variant where appropriate) + let n = src.len().try_into().unwrap(); + let sizes = (0..n).map(|_| 1).collect::>(); + device.copy_descriptors( + &[self.cpu_handle], + &[n], + src, + &sizes, + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, + ); + } +} + +impl Deref for CpuHeapRefOwned { + type Target = CpuHeapRef; + + fn deref(&self) -> &Self::Target { + &self.heap_ref + } +} + +impl Drop for GpuHeapRefOwned { + fn drop(&mut self) { + if let Some(a) = self.free_list.upgrade() { + a.lock().unwrap().free_gpu(&self.heap_ref) + } + } +} + +impl Deref for GpuHeapRefOwned { + type Target = GpuHeapRef; + + fn deref(&self) -> &Self::Target { + &self.heap_ref + } +} + +impl CpuHeap { + fn cpu_handle(&self, offset: u32) -> D3D12_CPU_DESCRIPTOR_HANDLE { + let mut handle = self.cpu_handle; + handle.ptr += (offset as usize) * (self.increment_size as usize); + handle + } +} + +impl GpuHeap { + fn cpu_handle(&self, offset: u32) -> D3D12_CPU_DESCRIPTOR_HANDLE { + let mut handle = self.cpu_handle; + handle.ptr += (offset as usize) * (self.increment_size as usize); + handle + } + + fn gpu_handle(&self, offset: u32) -> D3D12_GPU_DESCRIPTOR_HANDLE { + let mut handle = self.gpu_handle; + handle.ptr += (offset as u64) * (self.increment_size as u64); + handle + } +} diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs index dd834fa..a8eade7 100644 --- a/piet-gpu-hal/src/dx12/wrappers.rs +++ b/piet-gpu-hal/src/dx12/wrappers.rs @@ -7,6 +7,7 @@ // except according to those terms. use crate::dx12::error::{self, error_if_failed_else_unit, explain_error, Error}; +use smallvec::SmallVec; use std::convert::{TryFrom, TryInto}; use std::sync::atomic::{AtomicPtr, Ordering}; use std::{ffi, mem, ptr}; @@ -51,11 +52,7 @@ pub type CpuDescriptor = d3d12::D3D12_CPU_DESCRIPTOR_HANDLE; pub type GpuDescriptor = d3d12::D3D12_GPU_DESCRIPTOR_HANDLE; #[derive(Clone)] -pub struct DescriptorHeap { - pub heap_type: d3d12::D3D12_DESCRIPTOR_HEAP_TYPE, - pub increment_size: u32, - pub heap: ComPtr, -} +pub struct DescriptorHeap(ComPtr); #[derive(Clone)] pub struct RootSignature(pub ComPtr); @@ -381,11 +378,7 @@ impl Device { "device could not create descriptor heap", )?; - Ok(DescriptorHeap { - heap_type: heap_description.Type, - increment_size: self.get_descriptor_increment_size(heap_description.Type), - heap: ComPtr::from_raw(heap), - }) + Ok(DescriptorHeap(ComPtr::from_raw(heap))) } pub unsafe fn get_descriptor_increment_size( @@ -395,6 +388,31 @@ impl Device { self.0.GetDescriptorHandleIncrementSize(heap_type) } + pub unsafe fn copy_descriptors( + &self, + dst_starts: &[d3d12::D3D12_CPU_DESCRIPTOR_HANDLE], + dst_sizes: &[u32], + src_starts: &[d3d12::D3D12_CPU_DESCRIPTOR_HANDLE], + src_sizes: &[u32], + descriptor_heap_type: d3d12::D3D12_DESCRIPTOR_HEAP_TYPE, + ) { + debug_assert_eq!(dst_starts.len(), dst_sizes.len()); + debug_assert_eq!(src_starts.len(), src_sizes.len()); + debug_assert_eq!( + src_sizes.iter().copied().sum::(), + dst_sizes.iter().copied().sum() + ); + self.0.CopyDescriptors( + dst_starts.len().try_into().unwrap(), + dst_starts.as_ptr(), + dst_sizes.as_ptr(), + src_starts.len().try_into().unwrap(), + src_starts.as_ptr(), + src_sizes.as_ptr(), + descriptor_heap_type, + ); + } + pub unsafe fn create_compute_pipeline_state( &self, compute_pipeline_desc: &d3d12::D3D12_COMPUTE_PIPELINE_STATE_DESC, @@ -564,7 +582,7 @@ impl Device { pub unsafe fn create_buffer( &self, - buffer_size_in_bytes: u32, + buffer_size_in_bytes: u64, heap_type: d3d12::D3D12_HEAP_TYPE, cpu_page: d3d12::D3D12_CPU_PAGE_PROPERTY, memory_pool_preference: d3d12::D3D12_MEMORY_POOL, @@ -581,7 +599,7 @@ impl Device { }; let resource_description = d3d12::D3D12_RESOURCE_DESC { Dimension: d3d12::D3D12_RESOURCE_DIMENSION_BUFFER, - Width: buffer_size_in_bytes as u64, + Width: buffer_size_in_bytes, Height: 1, DepthOrArraySize: 1, MipLevels: 1, @@ -681,26 +699,12 @@ impl Device { } impl DescriptorHeap { - unsafe fn get_cpu_descriptor_handle_for_heap_start(&self) -> CpuDescriptor { - self.heap.GetCPUDescriptorHandleForHeapStart() + pub unsafe fn get_cpu_descriptor_handle_for_heap_start(&self) -> CpuDescriptor { + self.0.GetCPUDescriptorHandleForHeapStart() } - unsafe fn get_gpu_descriptor_handle_for_heap_start(&self) -> GpuDescriptor { - self.heap.GetGPUDescriptorHandleForHeapStart() - } - - pub unsafe fn get_cpu_descriptor_handle_at_offset(&self, offset: u32) -> CpuDescriptor { - let mut descriptor = self.get_cpu_descriptor_handle_for_heap_start(); - descriptor.ptr += (offset as usize) * (self.increment_size as usize); - - descriptor - } - - pub unsafe fn get_gpu_descriptor_handle_at_offset(&self, offset: u32) -> GpuDescriptor { - let mut descriptor = self.get_gpu_descriptor_handle_for_heap_start(); - descriptor.ptr += (offset as u64) * (self.increment_size as u64); - - descriptor + pub unsafe fn get_gpu_descriptor_handle_for_heap_start(&self) -> GpuDescriptor { + self.0.GetGPUDescriptorHandleForHeapStart() } } @@ -923,8 +927,8 @@ impl GraphicsCommandList { } pub unsafe fn set_descriptor_heaps(&self, descriptor_heaps: &[&DescriptorHeap]) { - let mut descriptor_heap_pointers: Vec<_> = - descriptor_heaps.iter().map(|dh| dh.heap.as_raw()).collect(); + let mut descriptor_heap_pointers: SmallVec<[_; 4]> = + descriptor_heaps.iter().map(|dh| dh.0.as_raw()).collect(); self.0.SetDescriptorHeaps( u32::try_from(descriptor_heap_pointers.len()) .expect("could not safely convert descriptor_heap_pointers.len() into u32"), @@ -958,6 +962,38 @@ impl GraphicsCommandList { ); } + pub unsafe fn clear_uav( + &self, + gpu_handle: d3d12::D3D12_GPU_DESCRIPTOR_HANDLE, + cpu_handle: d3d12::D3D12_CPU_DESCRIPTOR_HANDLE, + resource: &Resource, + value: u32, + size: Option, + ) { + // In testing, only the first value seems to be used, but just in case... + let values = [value, value, value, value]; + let mut rect = d3d12::D3D12_RECT { + left: 0, + right: 0, + top: 0, + bottom: 1, + }; + let (num_rects, p_rects) = if let Some(size) = size { + rect.right = (size / 4).try_into().unwrap(); + (1, &rect as *const _) + } else { + (0, std::ptr::null()) + }; + self.0.ClearUnorderedAccessViewUint( + gpu_handle, + cpu_handle, + resource.get_mut(), + &values, + num_rects, + p_rects, + ); + } + /// Copy an entire resource (buffer or image) pub unsafe fn copy_resource(&self, src: &Resource, dst: &Resource) { self.0.CopyResource(dst.get_mut(), src.get_mut()); diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index 2acfee0..6210ead 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -369,7 +369,13 @@ impl Session { } /// Choose shader code from the available choices. - pub fn choose_shader<'a>(&self, spv: &'a [u8], hlsl: &'a str, dxil: &'a [u8], msl: &'a str) -> ShaderCode<'a> { + pub fn choose_shader<'a>( + &self, + spv: &'a [u8], + hlsl: &'a str, + dxil: &'a [u8], + msl: &'a str, + ) -> ShaderCode<'a> { self.0.device.choose_shader(spv, hlsl, dxil, msl) } diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index 05e2394..cd4219b 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -6,6 +6,7 @@ use bitflags::bitflags; mod backend; +mod bestfit; mod hub; #[macro_use] diff --git a/tests/src/clear.rs b/tests/src/clear.rs index d643161..6e46e1f 100644 --- a/tests/src/clear.rs +++ b/tests/src/clear.rs @@ -44,7 +44,7 @@ pub struct ClearBinding { pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("clear buffers"); let n_elements: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 24); - let out_buf = runner.buf_down(n_elements * 4); + let out_buf = runner.buf_down(n_elements * 4, BufferUsage::empty()); let code = ClearCode::new(runner); let stage = ClearStage::new_with_value(runner, n_elements, 0x42); let binding = stage.bind(runner, &code, &out_buf.dev_buf); diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs index fbac6ac..f8ed826 100644 --- a/tests/src/linkedlist.rs +++ b/tests/src/linkedlist.rs @@ -14,7 +14,7 @@ // // Also licensed under MIT license, at your choice. -use piet_gpu_hal::{include_shader, BackendType, BindType, DescriptorSet}; +use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet}; use piet_gpu_hal::{Buffer, Pipeline}; use crate::clear::{ClearBinding, ClearCode, ClearStage}; @@ -41,7 +41,7 @@ struct LinkedListBinding { pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("linked list"); - let mem_buf = runner.buf_down(1024 * N_BUCKETS); + let mem_buf = runner.buf_down(1024 * N_BUCKETS, BufferUsage::CLEAR); let code = LinkedListCode::new(runner); let stage = LinkedListStage::new(runner, &code, N_BUCKETS); let binding = stage.bind(runner, &code, &mem_buf.dev_buf); @@ -77,7 +77,7 @@ impl LinkedListCode { .session .create_compute_pipeline(code, &[BindType::Buffer]) .unwrap(); - let clear_code = if runner.backend_type() != BackendType::Vulkan { + let clear_code = if runner.backend_type() == BackendType::Metal { Some(ClearCode::new(runner)) } else { None diff --git a/tests/src/message_passing.rs b/tests/src/message_passing.rs index 8accc25..c0f85af 100644 --- a/tests/src/message_passing.rs +++ b/tests/src/message_passing.rs @@ -54,7 +54,7 @@ pub unsafe fn run_message_passing_test( variant: Variant, ) -> TestResult { let mut result = TestResult::new(format!("message passing litmus, {:?}", variant)); - let out_buf = runner.buf_down(4); + let out_buf = runner.buf_down(4, BufferUsage::CLEAR); let code = MessagePassingCode::new(runner, variant); let stage = MessagePassingStage::new(runner, &code); let binding = stage.bind(runner, &code, &out_buf.dev_buf); @@ -92,9 +92,9 @@ impl MessagePassingCode { .session .create_compute_pipeline(code, &[BindType::Buffer, BindType::Buffer]) .unwrap(); - // Currently, DX12 and Metal backends don't support buffer clearing, so use a + // Currently, Metal backend doesn't support buffer clearing, so use a // compute shader as a workaround. - let clear_code = if runner.backend_type() != BackendType::Vulkan { + let clear_code = if runner.backend_type() == BackendType::Metal { Some(ClearCode::new(runner)) } else { None @@ -111,7 +111,10 @@ impl MessagePassingStage { let data_buf_size = 8 * N_ELEMENTS; let data_buf = runner .session - .create_buffer(data_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST) + .create_buffer( + data_buf_size, + BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::CLEAR, + ) .unwrap(); let clear_stages = if let Some(clear_code) = &code.clear_code { let stage0 = ClearStage::new(runner, N_ELEMENTS * 2); diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs index 1391c36..bfbc5b6 100644 --- a/tests/src/prefix.rs +++ b/tests/src/prefix.rs @@ -77,7 +77,7 @@ pub unsafe fn run_prefix_test( .session .create_buffer_init(&data, BufferUsage::STORAGE) .unwrap(); - let out_buf = runner.buf_down(data_buf.size()); + let out_buf = runner.buf_down(data_buf.size(), BufferUsage::empty()); let code = PrefixCode::new(runner, variant); let stage = PrefixStage::new(runner, &code, n_elements); let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf); @@ -121,7 +121,7 @@ impl PrefixCode { .unwrap(); // Currently, DX12 and Metal backends don't support buffer clearing, so use a // compute shader as a workaround. - let clear_code = if runner.backend_type() != BackendType::Vulkan { + let clear_code = if runner.backend_type() == BackendType::Metal { Some(ClearCode::new(runner)) } else { None @@ -139,7 +139,10 @@ impl PrefixStage { let state_buf_size = 4 + 12 * n_workgroups; let state_buf = runner .session - .create_buffer(state_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST) + .create_buffer( + state_buf_size, + BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::CLEAR, + ) .unwrap(); let clear_stage = if let Some(clear_code) = &code.clear_code { let stage = ClearStage::new(runner, state_buf_size / 4); diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs index 80a332f..9603385 100644 --- a/tests/src/prefix_tree.rs +++ b/tests/src/prefix_tree.rs @@ -52,7 +52,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul .session .create_buffer_init(&data, BufferUsage::STORAGE) .unwrap(); - let out_buf = runner.buf_down(data_buf.size()); + let out_buf = runner.buf_down(data_buf.size(), BufferUsage::empty()); let code = PrefixTreeCode::new(runner); let stage = PrefixTreeStage::new(runner, n_elements); let binding = stage.bind(runner, &code, &out_buf.dev_buf); diff --git a/tests/src/runner.rs b/tests/src/runner.rs index ed57c29..7677c58 100644 --- a/tests/src/runner.rs +++ b/tests/src/runner.rs @@ -101,19 +101,17 @@ impl Runner { BufUp { stage_buf, dev_buf } } - pub fn buf_down(&self, size: u64) -> BufDown { + /// Create a buffer for download (readback). + /// + /// The `usage` parameter need not include COPY_SRC and STORAGE. + pub fn buf_down(&self, size: u64, usage: BufferUsage) -> BufDown { let stage_buf = self .session .create_buffer(size, BufferUsage::MAP_READ | BufferUsage::COPY_DST) .unwrap(); - // Note: the COPY_DST isn't needed in all use cases, but I don't think - // making this tighter would help. let dev_buf = self .session - .create_buffer( - size, - BufferUsage::COPY_SRC | BufferUsage::COPY_DST | BufferUsage::STORAGE, - ) + .create_buffer(size, usage | BufferUsage::COPY_SRC | BufferUsage::STORAGE) .unwrap(); BufDown { stage_buf, dev_buf } }