From e4b16e706ade6fd1bc5821b24fc5021a9a89c6a9 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 21 May 2021 13:19:10 -0700 Subject: [PATCH] Timestamp queries These function, but can use some work. First, the buffer situation is worse than it should be. It should be possible to create a single readback buffer rather then copy from gpu-local to host-coherent. Second, the command buffer `finish_timestamps` call doesn't correlate to anything in Vulkan, so needs plumbing up through the hub in one form or other when that happens. I'm inclined to make it ergonomic by doing a bit of resource tracking that will trigger the appropriate call (and subsequent host barrier) in the `finish` method on the command buffer. --- piet-gpu-hal/examples/dx12_toy.rs | 5 ++++ piet-gpu-hal/src/dx12.rs | 48 ++++++++++++++++++++++++++++--- piet-gpu-hal/src/dx12/wrappers.rs | 29 ++++++++++--------- piet-gpu-hal/src/lib.rs | 4 +++ 4 files changed, 69 insertions(+), 17 deletions(-) diff --git a/piet-gpu-hal/examples/dx12_toy.rs b/piet-gpu-hal/examples/dx12_toy.rs index bd769f5..084d0e5 100644 --- a/piet-gpu-hal/examples/dx12_toy.rs +++ b/piet-gpu-hal/examples/dx12_toy.rs @@ -52,6 +52,7 @@ fn toy() -> Result<(), Error> { let buf = device.create_buffer(1024, MemFlags::host_coherent())?; let dev_buf = device.create_buffer(1024, MemFlags::device_local())?; let data: Vec = (1..257).collect(); + let query_pool = device.create_query_pool(2)?; unsafe { device.write_buffer(&buf, &data)?; let pipeline = device.create_simple_compute_pipeline(SHADER_CODE, 1, 0)?; @@ -61,9 +62,12 @@ fn toy() -> Result<(), Error> { cmd_buf.begin(); cmd_buf.copy_buffer(&buf, &dev_buf); cmd_buf.memory_barrier(); + cmd_buf.write_timestamp(&query_pool, 0); cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1)); + cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); cmd_buf.copy_buffer(&dev_buf, &buf); + cmd_buf.finish_timestamps(&query_pool); cmd_buf.host_barrier(); cmd_buf.finish(); device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?; @@ -71,6 +75,7 @@ fn toy() -> Result<(), Error> { let mut readback: Vec = Vec::new(); device.read_buffer(&buf, &mut readback)?; println!("{:?}", readback); + println!("{:?}", device.fetch_query_pool(&query_pool)); } Ok(()) } diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs index fff2519..3c96656 100644 --- a/piet-gpu-hal/src/dx12.rs +++ b/piet-gpu-hal/src/dx12.rs @@ -22,6 +22,7 @@ pub struct Dx12Device { device: Device, command_allocator: CommandAllocator, command_queue: CommandQueue, + ts_freq: u64, } #[derive(Clone)] @@ -55,7 +56,15 @@ pub struct Pipeline { // gpu-descriptor crate. pub struct DescriptorSet(wrappers::DescriptorHeap); -pub struct QueryPool; +pub struct QueryPool { + heap: wrappers::QueryHeap, + buf: Buffer, + // TODO: piet-dx12 manages to do this with one buffer. I think a + // HEAP_TYPE_READBACK will work, but we currently don't have fine + // grained usage flags to express this. + readback_buf: Buffer, + n_queries: u32, +} pub struct Fence { fence: wrappers::Fence, @@ -118,10 +127,12 @@ impl Dx12Instance { 0, )?; let command_allocator = device.create_command_allocator(list_type)?; + let ts_freq = command_queue.get_timestamp_frequency()?; Ok(Dx12Device { device, command_queue, command_allocator, + ts_freq, }) } } @@ -202,11 +213,33 @@ impl crate::Device for Dx12Device { } fn create_query_pool(&self, n_queries: u32) -> Result { - todo!() + unsafe { + let heap = self + .device + .create_query_heap(d3d12::D3D12_QUERY_HEAP_TYPE_TIMESTAMP, n_queries)?; + let buf = + self.create_buffer((n_queries * 8) as u64, crate::MemFlags::device_local())?; + let readback_buf = + self.create_buffer((n_queries * 8) as u64, crate::MemFlags::host_coherent())?; + Ok(QueryPool { + heap, + buf, + readback_buf, + n_queries, + }) + } } unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error> { - todo!() + let mut buf = vec![0u64; pool.n_queries as usize]; + self.read_buffer(&pool.readback_buf, &mut buf)?; + let ts0 = buf[0]; + let tsp = (self.ts_freq as f64).recip(); + let result = buf[1..] + .iter() + .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) + .collect(); + Ok(result) } unsafe fn run_cmd_buf( @@ -248,6 +281,7 @@ impl crate::Device for Dx12Device { contents: &[T], ) -> Result<(), Error> { let len = buffer.size as usize / std::mem::size_of::(); + assert!(len >= contents.len()); buffer.resource.write_resource(len, contents.as_ptr())?; Ok(()) } @@ -366,7 +400,13 @@ impl crate::CmdBuf for CmdBuf { } unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { - todo!() + self.0.end_timing_query(&pool.heap, query); + } + + unsafe fn finish_timestamps(&mut self, pool: &QueryPool) { + self.0 + .resolve_timing_query_data(&pool.heap, 0, pool.n_queries, &pool.buf.resource, 0); + self.copy_buffer(&pool.buf, &pool.readback_buf); } } diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs index f60f42a..df5f5aa 100644 --- a/piet-gpu-hal/src/dx12/wrappers.rs +++ b/piet-gpu-hal/src/dx12/wrappers.rs @@ -107,7 +107,7 @@ impl Resource { } pub fn get(&self) -> *const d3d12::ID3D12Resource { - self.ptr.load(Ordering::Relaxed) + self.get_mut() } pub fn get_mut(&self) -> *mut d3d12::ID3D12Resource { @@ -491,6 +491,7 @@ impl Device { Ok(RootSignature(ComPtr::from_raw(signature))) } + // This is for indirect command submission and we probably won't use it. pub unsafe fn create_command_signature( &self, root_signature: RootSignature, @@ -722,7 +723,7 @@ impl Device { &self, heap_type: d3d12::D3D12_QUERY_HEAP_TYPE, num_expected_queries: u32, - ) -> QueryHeap { + ) -> Result { let query_heap_desc = d3d12::D3D12_QUERY_HEAP_DESC { Type: heap_type, Count: num_expected_queries, @@ -731,14 +732,16 @@ impl Device { let mut query_heap = ptr::null_mut(); - error_if_failed_else_unit(self.0.CreateQueryHeap( - &query_heap_desc as *const _, - &d3d12::ID3D12QueryHeap::uuidof(), - &mut query_heap as *mut _ as *mut _, - )) - .expect("could not create query heap"); + explain_error( + self.0.CreateQueryHeap( + &query_heap_desc as *const _, + &d3d12::ID3D12QueryHeap::uuidof(), + &mut query_heap as *mut _ as *mut _, + ), + "could not create query heap", + )?; - QueryHeap(ComPtr::from_raw(query_heap)) + Ok(QueryHeap(ComPtr::from_raw(query_heap))) } // based on: https://github.com/microsoft/DirectX-Graphics-Samples/blob/682051ddbe4be820195fffed0bfbdbbde8611a90/Libraries/D3DX12/d3dx12.h#L1875 @@ -1375,9 +1378,9 @@ impl GraphicsCommandList { ); } - pub unsafe fn end_timing_query(&self, query_heap: QueryHeap, index: u32) { + pub unsafe fn end_timing_query(&self, query_heap: &QueryHeap, index: u32) { self.0.EndQuery( - query_heap.0.as_raw() as *mut _, + query_heap.0.as_raw(), d3d12::D3D12_QUERY_TYPE_TIMESTAMP, index, ); @@ -1385,10 +1388,10 @@ impl GraphicsCommandList { pub unsafe fn resolve_timing_query_data( &self, - query_heap: QueryHeap, + query_heap: &QueryHeap, start_index: u32, num_queries: u32, - destination_buffer: Resource, + destination_buffer: &Resource, aligned_destination_buffer_offset: u64, ) { self.0.ResolveQueryData( diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index 1548838..9a8772d 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -244,6 +244,10 @@ pub trait CmdBuf { unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool); unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32); + + /// Prepare the timestamps for reading. This isn't required on Vulkan but + /// is required on (at least) DX12. + unsafe fn finish_timestamps(&mut self, pool: &D::QueryPool) {} } pub trait MemFlags: Sized + Clone + Copy {