From 9980c858b6e57666e849976af0f51ea4a21b0256 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 14 Apr 2022 16:27:28 -0700 Subject: [PATCH] Fix timer queries in Vulkan and DX12 backends Current status: the piet-gpu-hal module (including the collatz example) have the new API (with queries set on compute pass) implemented. The other uses have not yet been updated. On Metal, only M1 is tested. The "command" counter style is partly implemented, but not fully wired up. --- piet-gpu-hal/src/dx12.rs | 35 +++++++++++++++----- piet-gpu-hal/src/dx12/wrappers.rs | 3 -- piet-gpu-hal/src/hub.rs | 15 ++++++++- piet-gpu-hal/src/lib.rs | 6 ++++ piet-gpu-hal/src/vulkan.rs | 55 ++++++++++++++++++++++--------- 5 files changed, 85 insertions(+), 29 deletions(-) diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs index 78ad449..c5e1e04 100644 --- a/piet-gpu-hal/src/dx12.rs +++ b/piet-gpu-hal/src/dx12.rs @@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle}; use smallvec::SmallVec; -use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat}; +use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor}; use self::{ descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned}, @@ -76,6 +76,7 @@ pub struct CmdBuf { c: wrappers::GraphicsCommandList, allocator: CommandAllocator, needs_reset: bool, + end_query: Option<(wrappers::QueryHeap, u32)>, } pub struct Pipeline { @@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device { c, allocator, needs_reset: false, + end_query: None, }) } } @@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device { let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size); self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; - let ts0 = buf[0]; let tsp = (self.ts_freq as f64).recip(); - let result = buf[1..] + let result = buf .iter() - .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) + .map(|ts| *ts as f64 * tsp) .collect(); Ok(result) } @@ -610,6 +611,16 @@ impl crate::backend::CmdBuf for CmdBuf { self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok() } + unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) { + if let Some((pool, start, end)) = &desc.timer_queries { + #[allow(irrefutable_let_patterns)] + if let crate::hub::QueryPool::Dx12(pool) = pool { + self.write_timestamp(pool, *start); + self.end_query = Some((pool.heap.clone(), *end)); + } + } + } + unsafe fn dispatch( &mut self, pipeline: &Pipeline, @@ -628,6 +639,12 @@ impl crate::backend::CmdBuf for CmdBuf { .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2); } + unsafe fn end_compute_pass(&mut self) { + if let Some((heap, end)) = self.end_query.take() { + self.c.end_timing_query(&heap, end); + } + } + unsafe fn memory_barrier(&mut self) { // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend. // The "proper" way to do this would be to name the actual buffers participating @@ -666,7 +683,7 @@ impl crate::backend::CmdBuf for CmdBuf { self.memory_barrier(); } - unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option) { + unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option) { let cpu_ref = buffer.cpu_ref.as_ref().unwrap(); let (gpu_ref, heap) = buffer .gpu_ref @@ -684,23 +701,23 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { + unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) { // TODO: consider using copy_resource here (if sizes match) let size = src.size.min(dst.size); self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size); } - unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { + unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) { self.c .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1); } - unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { + unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) { self.c .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1); } - unsafe fn blit_image(&self, src: &Image, dst: &Image) { + unsafe fn blit_image(&mut self, src: &Image, dst: &Image) { self.c.copy_resource(&src.resource, &dst.resource); } diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs index 4bbb86c..9a3fb90 100644 --- a/piet-gpu-hal/src/dx12/wrappers.rs +++ b/piet-gpu-hal/src/dx12/wrappers.rs @@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr); #[derive(Clone)] pub struct ShaderByteCode { pub bytecode: d3d12::D3D12_SHADER_BYTECODE, - blob: Option, } #[derive(Clone)] @@ -741,7 +740,6 @@ impl ShaderByteCode { BytecodeLength: blob.0.GetBufferSize(), pShaderBytecode: blob.0.GetBufferPointer(), }, - blob: Some(blob), } } @@ -810,7 +808,6 @@ impl ShaderByteCode { BytecodeLength: bytecode.len(), pShaderBytecode: bytecode.as_ptr() as *const _, }, - blob: None, } } } diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index 37c59df..5c7122a 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -375,8 +375,17 @@ impl Session { /// /// This should be called after waiting on the command buffer that wrote the /// timer queries. + /// + /// The returned vector is one shorter than the number of timer queries in the + /// pool; the first value is subtracted off. It would likely be better to return + /// the raw timestamps, but that change should be made consistently. pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result, Error> { - self.0.device.fetch_query_pool(pool) + let result = self.0.device.fetch_query_pool(pool)?; + // Subtract off first timestamp. + Ok(result[1..] + .iter() + .map(|ts| *ts as f64 - result[0]) + .collect()) } #[doc(hidden)] @@ -602,6 +611,10 @@ impl CmdBuf { /// Write a timestamp. /// /// The query index must be less than the size of the query pool on creation. + /// + /// Deprecation: for greater portability, set timestamp queries on compute + /// passes instead. + #[deprecated(note = "use compute pass descriptor instead")] pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { self.cmd_buf().write_timestamp(pool, query); } diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index 241cdfd..18f6390 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -190,9 +190,15 @@ pub struct WorkgroupLimits { pub max_invocations: u32, } +/// Options for creating a compute pass. #[derive(Default)] pub struct ComputePassDescriptor<'a> { // Maybe label should go here? It does in wgpu and wgpu_hal. + /// Timer query parameters. + /// + /// To record timer queries for a compute pass, set the query pool, start + /// query index, and end query index here. The indices must be less than + /// the size of the query pool. timer_queries: Option<(&'a QueryPool, u32, u32)>, } diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 8392899..504d947 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -15,7 +15,7 @@ use smallvec::SmallVec; use crate::backend::Device as DeviceTrait; use crate::{ BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize, - WorkgroupLimits, + WorkgroupLimits, ComputePassDescriptor, }; pub struct VkInstance { @@ -92,6 +92,7 @@ pub struct CmdBuf { cmd_buf: vk::CommandBuffer, cmd_pool: vk::CommandPool, device: Arc, + end_query: Option<(vk::QueryPool, u32)>, } pub struct QueryPool { @@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice { cmd_buf, cmd_pool, device: self.device.clone(), + end_query: None, }) } } @@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice { // results (Windows 10, AMD 5700 XT). let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT; device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?; - let ts0 = buf[0]; let tsp = self.timestamp_period as f64 * 1e-9; - let result = buf[1..] + let result = buf .iter() - .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) + .map(|ts| *ts as f64 * tsp) .collect(); Ok(result) } @@ -902,6 +903,16 @@ impl crate::backend::CmdBuf for CmdBuf { true } + unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) { + if let Some((pool, start, end)) = &desc.timer_queries { + #[allow(irrefutable_let_patterns)] + if let crate::hub::QueryPool::Vk(pool) = pool { + self.write_timestamp_raw(pool.pool, *start); + self.end_query = Some((pool.pool, *end)); + } + } + } + unsafe fn dispatch( &mut self, pipeline: &Pipeline, @@ -931,6 +942,12 @@ impl crate::backend::CmdBuf for CmdBuf { ); } + unsafe fn end_compute_pass(&mut self) { + if let Some((pool, end)) = self.end_query.take() { + self.write_timestamp_raw(pool, end); + } + } + /// Insert a pipeline barrier for all memory accesses. unsafe fn memory_barrier(&mut self) { let device = &self.device.device; @@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option) { + unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option) { let device = &self.device.device; let size = size.unwrap_or(vk::WHOLE_SIZE); device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0); } - unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { + unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) { let device = &self.device.device; let size = src.size.min(dst.size); device.cmd_copy_buffer( @@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { + unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) { let device = &self.device.device; device.cmd_copy_image_to_buffer( self.cmd_buf, @@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { + unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) { let device = &self.device.device; device.cmd_copy_buffer_to_image( self.cmd_buf, @@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn blit_image(&self, src: &Image, dst: &Image) { + unsafe fn blit_image(&mut self, src: &Image, dst: &Image) { let device = &self.device.device; device.cmd_blit_image( self.cmd_buf, @@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf for CmdBuf { } unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { - let device = &self.device.device; - device.cmd_write_timestamp( - self.cmd_buf, - vk::PipelineStageFlags::COMPUTE_SHADER, - pool.pool, - query, - ); + self.write_timestamp_raw(pool.pool, query); } unsafe fn begin_debug_label(&mut self, label: &str) { @@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf for CmdBuf { } } +impl CmdBuf { + unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) { + let device = &self.device.device; + device.cmd_write_timestamp( + self.cmd_buf, + vk::PipelineStageFlags::COMPUTE_SHADER, + pool, + query, + ); + } +} + impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { fn add_buffers(&mut self, buffers: &[&Buffer]) { self.buffers.extend(buffers.iter().map(|b| b.buffer));