diff --git a/piet-gpu-hal/examples/dx12_toy.rs b/piet-gpu-hal/examples/dx12_toy.rs index 084d0e5..33e3c74 100644 --- a/piet-gpu-hal/examples/dx12_toy.rs +++ b/piet-gpu-hal/examples/dx12_toy.rs @@ -2,7 +2,7 @@ //! This will probably go away when it's fully implemented and we can //! just use the hub. -use piet_gpu_hal::{dx12, CmdBuf, Device, Error, MemFlags}; +use piet_gpu_hal::{dx12, BufferUsage, CmdBuf, Device, Error}; const SHADER_CODE: &str = r#"RWByteAddressBuffer _53 : register(u0, space0); @@ -49,8 +49,17 @@ void main(SPIRV_Cross_Input stage_input) fn toy() -> Result<(), Error> { let instance = dx12::Dx12Instance::new()?; let device = instance.device()?; - let buf = device.create_buffer(1024, MemFlags::host_coherent())?; - let dev_buf = device.create_buffer(1024, MemFlags::device_local())?; + let buf = device.create_buffer( + 1024, + BufferUsage::MAP_READ + | BufferUsage::MAP_WRITE + | BufferUsage::COPY_SRC + | BufferUsage::COPY_DST, + )?; + let dev_buf = device.create_buffer( + 1024, + BufferUsage::STORAGE | BufferUsage::COPY_SRC | BufferUsage::COPY_DST, + )?; let data: Vec = (1..257).collect(); let query_pool = device.create_query_pool(2)?; unsafe { diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs index 3c96656..cc39ac5 100644 --- a/piet-gpu-hal/src/dx12.rs +++ b/piet-gpu-hal/src/dx12.rs @@ -6,9 +6,10 @@ mod wrappers; use std::{cell::Cell, convert::TryInto, mem, ptr}; use winapi::shared::dxgi1_3; +use winapi::shared::minwindef::TRUE; use winapi::um::d3d12; -use crate::Error; +use crate::{BufferUsage, Error}; use self::wrappers::{ CommandAllocator, CommandQueue, Device, Factory4, GraphicsCommandList, Resource, ShaderByteCode, @@ -23,6 +24,7 @@ pub struct Dx12Device { command_allocator: CommandAllocator, command_queue: CommandQueue, ts_freq: u64, + memory_arch: MemoryArchitecture, } #[derive(Clone)] @@ -35,15 +37,6 @@ pub struct Image { resource: Resource, } -// TODO: this doesn't make an upload/download distinction. Probably -// we want to move toward webgpu-style usage flags, ie map_read and -// map_write are the main ones that affect buffer creation. -#[derive(Clone, Copy)] -pub enum MemFlags { - DeviceLocal, - HostCoherent, -} - pub struct CmdBuf(GraphicsCommandList); pub struct Pipeline { @@ -59,10 +52,6 @@ pub struct DescriptorSet(wrappers::DescriptorHeap); pub struct QueryPool { heap: wrappers::QueryHeap, buf: Buffer, - // TODO: piet-dx12 manages to do this with one buffer. I think a - // HEAP_TYPE_READBACK will work, but we currently don't have fine - // grained usage flags to express this. - readback_buf: Buffer, n_queries: u32, } @@ -88,6 +77,16 @@ pub struct DescriptorSetBuilder { buffers: Vec, } +#[derive(PartialEq, Eq)] +enum MemoryArchitecture { + /// Integrated graphics + CacheCoherentUMA, + /// Unified memory with no cache coherence (does this happen?) + UMA, + /// Discrete graphics + NUMA, +} + impl Dx12Instance { /// Create a new instance. /// @@ -128,11 +127,20 @@ impl Dx12Instance { )?; let command_allocator = device.create_command_allocator(list_type)?; let ts_freq = command_queue.get_timestamp_frequency()?; + let features_architecture = device.get_features_architecture()?; + let uma = features_architecture.UMA == TRUE; + let cc_uma = features_architecture.CacheCoherentUMA == TRUE; + let memory_arch = match (uma, cc_uma) { + (true, true) => MemoryArchitecture::CacheCoherentUMA, + (true, false) => MemoryArchitecture::UMA, + _ => MemoryArchitecture::NUMA, + }; Ok(Dx12Device { device, command_queue, command_allocator, ts_freq, + memory_arch, }) } } @@ -143,8 +151,6 @@ impl crate::Device for Dx12Device { type Image = Image; - type MemFlags = MemFlags; - type Pipeline = Pipeline; type DescriptorSet = DescriptorSet; @@ -166,16 +172,22 @@ impl crate::Device for Dx12Device { // Currently this is HLSL source, but we'll probably change it to IR. type ShaderSource = str; - fn create_buffer(&self, size: u64, mem_flags: Self::MemFlags) -> Result { + fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result { + // TODO: consider supporting BufferUsage::QUERY_RESOLVE here rather than + // having a separate function. unsafe { - let resource = match mem_flags { - MemFlags::DeviceLocal => self - .device - .create_gpu_only_byte_addressed_buffer(size.try_into()?)?, - MemFlags::HostCoherent => self - .device - .create_uploadable_byte_addressed_buffer(size.try_into()?)?, - }; + let page_property = self.memory_arch.page_property(usage); + let memory_pool = self.memory_arch.memory_pool(usage); + //TODO: consider flag D3D12_HEAP_FLAG_ALLOW_SHADER_ATOMICS? + let flags = d3d12::D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; + let resource = self.device.create_buffer( + size.try_into()?, + d3d12::D3D12_HEAP_TYPE_CUSTOM, + page_property, + memory_pool, + d3d12::D3D12_RESOURCE_STATE_COMMON, + flags, + )?; Ok(Buffer { resource, size }) } } @@ -185,12 +197,7 @@ impl crate::Device for Dx12Device { Ok(()) } - unsafe fn create_image2d( - &self, - width: u32, - height: u32, - mem_flags: Self::MemFlags, - ) -> Result { + unsafe fn create_image2d(&self, width: u32, height: u32) -> Result { todo!() } @@ -217,14 +224,10 @@ impl crate::Device for Dx12Device { let heap = self .device .create_query_heap(d3d12::D3D12_QUERY_HEAP_TYPE_TIMESTAMP, n_queries)?; - let buf = - self.create_buffer((n_queries * 8) as u64, crate::MemFlags::device_local())?; - let readback_buf = - self.create_buffer((n_queries * 8) as u64, crate::MemFlags::host_coherent())?; + let buf = self.create_readback_buffer((n_queries * 8) as u64)?; Ok(QueryPool { heap, buf, - readback_buf, n_queries, }) } @@ -232,7 +235,7 @@ impl crate::Device for Dx12Device { unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error> { let mut buf = vec![0u64; pool.n_queries as usize]; - self.read_buffer(&pool.readback_buf, &mut buf)?; + self.read_buffer(&pool.buf, &mut buf)?; let ts0 = buf[0]; let tsp = (self.ts_freq as f64).recip(); let result = buf[1..] @@ -327,6 +330,22 @@ impl crate::Device for Dx12Device { } } +impl Dx12Device { + fn create_readback_buffer(&self, size: u64) -> Result { + unsafe { + let resource = self.device.create_buffer( + size.try_into()?, + d3d12::D3D12_HEAP_TYPE_READBACK, + d3d12::D3D12_CPU_PAGE_PROPERTY_UNKNOWN, + d3d12::D3D12_MEMORY_POOL_UNKNOWN, + d3d12::D3D12_RESOURCE_STATE_COPY_DEST, + d3d12::D3D12_RESOURCE_FLAG_NONE, + )?; + Ok(Buffer { resource, size }) + } + } +} + impl crate::CmdBuf for CmdBuf { unsafe fn begin(&mut self) {} @@ -406,17 +425,6 @@ impl crate::CmdBuf for CmdBuf { unsafe fn finish_timestamps(&mut self, pool: &QueryPool) { self.0 .resolve_timing_query_data(&pool.heap, 0, pool.n_queries, &pool.buf.resource, 0); - self.copy_buffer(&pool.buf, &pool.readback_buf); - } -} - -impl crate::MemFlags for MemFlags { - fn device_local() -> Self { - MemFlags::DeviceLocal - } - - fn host_coherent() -> Self { - MemFlags::HostCoherent } } @@ -543,3 +551,31 @@ impl crate::DescriptorSetBuilder for DescriptorSetBuilder { Ok(DescriptorSet(heap)) } } + +impl MemoryArchitecture { + // See https://msdn.microsoft.com/de-de/library/windows/desktop/dn788678(v=vs.85).aspx + + fn page_property(&self, usage: BufferUsage) -> d3d12::D3D12_CPU_PAGE_PROPERTY { + if usage.contains(BufferUsage::MAP_READ) { + d3d12::D3D12_CPU_PAGE_PROPERTY_WRITE_BACK + } else if usage.contains(BufferUsage::MAP_WRITE) { + if *self == MemoryArchitecture::CacheCoherentUMA { + d3d12::D3D12_CPU_PAGE_PROPERTY_WRITE_BACK + } else { + d3d12::D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE + } + } else { + d3d12::D3D12_CPU_PAGE_PROPERTY_NOT_AVAILABLE + } + } + + fn memory_pool(&self, usage: BufferUsage) -> d3d12::D3D12_MEMORY_POOL { + if *self == MemoryArchitecture::NUMA + && !usage.intersects(BufferUsage::MAP_READ | BufferUsage::MAP_WRITE) + { + d3d12::D3D12_MEMORY_POOL_L1 + } else { + d3d12::D3D12_MEMORY_POOL_L0 + } + } +} diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs index df5f5aa..cafd69b 100644 --- a/piet-gpu-hal/src/dx12/wrappers.rs +++ b/piet-gpu-hal/src/dx12/wrappers.rs @@ -677,7 +677,6 @@ impl Device { .CreateRenderTargetView(resource.get_mut(), desc, descriptor); } - // TODO: interface not complete pub unsafe fn create_fence(&self, initial: u64) -> Result { let mut fence = ptr::null_mut(); explain_error( @@ -810,57 +809,19 @@ impl Device { (layouts, num_rows, row_size_in_bytes, total_size) } - // TODO: probably combine these and add usage flags - pub unsafe fn create_uploadable_buffer( - &self, - buffer_size_in_bytes: u64, - ) -> Result { - let heap_properties = d3d12::D3D12_HEAP_PROPERTIES { - //for GPU access only - Type: d3d12::D3D12_HEAP_TYPE_UPLOAD, - CPUPageProperty: d3d12::D3D12_CPU_PAGE_PROPERTY_UNKNOWN, - //TODO: what should MemoryPoolPreference flag be? - MemoryPoolPreference: d3d12::D3D12_MEMORY_POOL_UNKNOWN, - //we don't care about multi-adapter operation, so these next two will be zero - CreationNodeMask: 0, - VisibleNodeMask: 0, - }; - let resource_description = d3d12::D3D12_RESOURCE_DESC { - Dimension: d3d12::D3D12_RESOURCE_DIMENSION_BUFFER, - Width: buffer_size_in_bytes, - Height: 1, - DepthOrArraySize: 1, - MipLevels: 1, - SampleDesc: dxgitype::DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Layout: d3d12::D3D12_TEXTURE_LAYOUT_ROW_MAJOR, - Flags: d3d12::D3D12_RESOURCE_FLAG_NONE, - ..mem::zeroed() - }; - - let buffer = self.create_committed_resource( - &heap_properties, - //TODO: is this heap flag ok? - d3d12::D3D12_HEAP_FLAG_NONE, - &resource_description, - d3d12::D3D12_RESOURCE_STATE_GENERIC_READ, - ptr::null(), - )?; - - Ok(buffer) - } - - pub unsafe fn create_uploadable_byte_addressed_buffer( + pub unsafe fn create_buffer( &self, buffer_size_in_bytes: u32, + heap_type: d3d12::D3D12_HEAP_TYPE, + cpu_page: d3d12::D3D12_CPU_PAGE_PROPERTY, + memory_pool_preference: d3d12::D3D12_MEMORY_POOL, + init_resource_state: d3d12::D3D12_RESOURCE_STATES, + resource_flags: d3d12::D3D12_RESOURCE_FLAGS, ) -> Result { let heap_properties = d3d12::D3D12_HEAP_PROPERTIES { - Type: d3d12::D3D12_HEAP_TYPE_UPLOAD, - CPUPageProperty: d3d12::D3D12_CPU_PAGE_PROPERTY_UNKNOWN, - //TODO: what should MemoryPoolPreference flag be? - MemoryPoolPreference: d3d12::D3D12_MEMORY_POOL_UNKNOWN, + Type: heap_type, + CPUPageProperty: cpu_page, + MemoryPoolPreference: memory_pool_preference, //we don't care about multi-adapter operation, so these next two will be zero CreationNodeMask: 0, VisibleNodeMask: 0, @@ -876,50 +837,7 @@ impl Device { Quality: 0, }, Layout: d3d12::D3D12_TEXTURE_LAYOUT_ROW_MAJOR, - Flags: d3d12::D3D12_RESOURCE_FLAG_NONE, - ..mem::zeroed() - }; - - let byte_addressed_buffer = self.create_committed_resource( - &heap_properties, - //TODO: is this heap flag ok? - d3d12::D3D12_HEAP_FLAG_NONE, - &resource_description, - d3d12::D3D12_RESOURCE_STATE_GENERIC_READ, - ptr::null(), - )?; - - Ok(byte_addressed_buffer) - } - - pub unsafe fn create_gpu_only_byte_addressed_buffer( - &self, - buffer_size_in_bytes: u32, - ) -> Result { - //TODO: consider flag D3D12_HEAP_FLAG_ALLOW_SHADER_ATOMICS? - let heap_properties = d3d12::D3D12_HEAP_PROPERTIES { - //for GPU access only - Type: d3d12::D3D12_HEAP_TYPE_DEFAULT, - CPUPageProperty: d3d12::D3D12_CPU_PAGE_PROPERTY_UNKNOWN, - //TODO: what should MemoryPoolPreference flag be? - MemoryPoolPreference: d3d12::D3D12_MEMORY_POOL_UNKNOWN, - //we don't care about multi-adapter operation, so these next two will be zero - CreationNodeMask: 0, - VisibleNodeMask: 0, - }; - let resource_description = d3d12::D3D12_RESOURCE_DESC { - Dimension: d3d12::D3D12_RESOURCE_DIMENSION_BUFFER, - Width: buffer_size_in_bytes as u64, - Height: 1, - DepthOrArraySize: 1, - MipLevels: 1, - SampleDesc: dxgitype::DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - //essentially we're letting the adapter decide the layout - Layout: d3d12::D3D12_TEXTURE_LAYOUT_ROW_MAJOR, - Flags: d3d12::D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, + Flags: resource_flags, ..mem::zeroed() }; @@ -927,7 +845,7 @@ impl Device { &heap_properties, d3d12::D3D12_HEAP_FLAG_NONE, &resource_description, - d3d12::D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + init_resource_state, ptr::null(), )?; @@ -994,6 +912,21 @@ impl Device { Ok(buffer) } + pub unsafe fn get_features_architecture( + &self, + ) -> Result { + let mut features_architecture = mem::zeroed(); + explain_error( + self.0.CheckFeatureSupport( + d3d12::D3D12_FEATURE_ARCHITECTURE, + &mut features_architecture as *mut _ as *mut _, + mem::size_of::() as u32, + ), + "error querying feature architecture", + )?; + Ok(features_architecture) + } + pub unsafe fn get_removal_reason(&self) -> Error { Error::Hresult(self.0.GetDeviceRemovedReason()) } diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index d98108c..2a670e1 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -150,11 +150,7 @@ impl Session { }))) } - pub unsafe fn create_image2d( - &self, - width: u32, - height: u32, - ) -> Result { + pub unsafe fn create_image2d(&self, width: u32, height: u32) -> Result { let image = self.0.device.create_image2d(width, height)?; Ok(Image(Arc::new(ImageInner { image, diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index 9cf121a..a619e8d 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -2,7 +2,6 @@ /// /// This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu. /// In time, it may go away and be replaced by either gfx-hal or wgpu. - use bitflags::bitflags; pub mod hub; @@ -106,11 +105,7 @@ pub trait Device: Sized { /// Maybe doesn't need result return? unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>; - unsafe fn create_image2d( - &self, - width: u32, - height: u32, - ) -> Result; + unsafe fn create_image2d(&self, width: u32, height: u32) -> Result; /// Destroy an image. /// diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 65dc14b..95ed9f5 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -9,7 +9,9 @@ use ash::extensions::{ext::DebugUtils, khr}; use ash::version::{DeviceV1_0, EntryV1_0, InstanceV1_0, InstanceV1_1}; use ash::{vk, Device, Entry, Instance}; -use crate::{BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize}; +use crate::{ + BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, +}; pub struct VkInstance { /// Retain the dynamic lib. @@ -512,11 +514,7 @@ impl crate::Device for VkDevice { Ok(()) } - unsafe fn create_image2d( - &self, - width: u32, - height: u32, - ) -> Result { + unsafe fn create_image2d(&self, width: u32, height: u32) -> Result { let device = &self.device.device; let extent = vk::Extent3D { width,