Fix timer queries in Vulkan and DX12 backends

Current status: the piet-gpu-hal module (including the collatz example)
has the new API (with queries set on the compute pass) implemented. The
other uses have not yet been updated.

On Metal, only M1 is tested. The "command" counter style is partly
implemented, but not fully wired up.
Raph Levien 2022-04-14 16:27:28 -07:00
parent ba2b27cc3c
commit 9980c858b6
5 changed files with 85 additions and 29 deletions
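
For orientation, here is a minimal sketch of how a caller brackets a dispatch with the new pass-scoped timer queries. It assumes a ComputePassDescriptor::timer(pool, start, end) constructor and that the hub-level CmdBuf mirrors the begin_compute_pass/end_compute_pass/dispatch methods added to the backends below; neither assumption is part of this diff, and the dispatch arguments shown are only illustrative.

// Sketch under the assumptions stated above; not code from this commit.
unsafe fn record_timed_pass(
    cmd_buf: &mut CmdBuf,
    queries: &QueryPool,
    pipeline: &Pipeline,
    descriptors: &DescriptorSet,
) {
    // Queries 0 and 1 bracket the single pass recorded below.
    let desc = ComputePassDescriptor::timer(queries, 0, 1);
    // The backend writes the start timestamp (query 0) here...
    cmd_buf.begin_compute_pass(&desc);
    // Workgroup count and size are placeholders; the exact dispatch signature is assumed.
    cmd_buf.dispatch(pipeline, descriptors, (256, 1, 1), (64, 1, 1));
    // ...and the end timestamp (query 1) here, using the end_query saved at begin.
    cmd_buf.end_compute_pass();
}

After the command buffer has been submitted and waited on, Session::fetch_query_pool returns the recorded times (see the hub change below).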


@@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 use smallvec::SmallVec;
 
-use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
+use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
 
 use self::{
     descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@@ -76,6 +76,7 @@ pub struct CmdBuf {
     c: wrappers::GraphicsCommandList,
     allocator: CommandAllocator,
     needs_reset: bool,
+    end_query: Option<(wrappers::QueryHeap, u32)>,
 }
 
 pub struct Pipeline {
@@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
                 c,
                 allocator,
                 needs_reset: false,
+                end_query: None,
             })
         }
     }
@@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
         let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
         std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
         self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
-        let ts0 = buf[0];
         let tsp = (self.ts_freq as f64).recip();
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
         self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Dx12(pool) = pool {
+                self.write_timestamp(pool, *start);
+                self.end_query = Some((pool.heap.clone(), *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
             .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((heap, end)) = self.end_query.take() {
+            self.c.end_timing_query(&heap, end);
+        }
+    }
+
     unsafe fn memory_barrier(&mut self) {
         // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
         // The "proper" way to do this would be to name the actual buffers participating
@@ -666,7 +683,7 @@
         self.memory_barrier();
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
         let (gpu_ref, heap) = buffer
             .gpu_ref
@@ -684,23 +701,23 @@
         );
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         // TODO: consider using copy_resource here (if sizes match)
         let size = src.size.min(dst.size);
         self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         self.c
             .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         self.c
             .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
        self.c.copy_resource(&src.resource, &dst.resource);
     }


@@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
 #[derive(Clone)]
 pub struct ShaderByteCode {
     pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
-    blob: Option<Blob>,
 }
 
 #[derive(Clone)]
@@ -741,7 +740,6 @@ impl ShaderByteCode {
                 BytecodeLength: blob.0.GetBufferSize(),
                 pShaderBytecode: blob.0.GetBufferPointer(),
             },
-            blob: Some(blob),
         }
     }
@@ -810,7 +808,6 @@ impl ShaderByteCode {
                 BytecodeLength: bytecode.len(),
                 pShaderBytecode: bytecode.as_ptr() as *const _,
             },
-            blob: None,
         }
     }
 }


@@ -375,8 +375,17 @@ impl Session {
     ///
     /// This should be called after waiting on the command buffer that wrote the
     /// timer queries.
+    ///
+    /// The returned vector is one shorter than the number of timer queries in the
+    /// pool; the first value is subtracted off. It would likely be better to return
+    /// the raw timestamps, but that change should be made consistently.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
-        self.0.device.fetch_query_pool(pool)
+        let result = self.0.device.fetch_query_pool(pool)?;
+        // Subtract off first timestamp.
+        Ok(result[1..]
+            .iter()
+            .map(|ts| *ts as f64 - result[0])
+            .collect())
     }
 
     #[doc(hidden)]
@@ -602,6 +611,10 @@ impl CmdBuf {
     /// Write a timestamp.
     ///
     /// The query index must be less than the size of the query pool on creation.
+    ///
+    /// Deprecation: for greater portability, set timestamp queries on compute
+    /// passes instead.
+    #[deprecated(note = "use compute pass descriptor instead")]
     pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
         self.cmd_buf().write_timestamp(pool, query);
     }
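
The hub-level change above means Session::fetch_query_pool now hands back values in seconds relative to the first timestamp, with that first value dropped. A small self-contained helper (illustration only, not part of this commit) that turns the returned vector into the elapsed time between consecutive queries, assuming they were written in increasing order:

/// ts is the vector returned by Session::fetch_query_pool: seconds relative
/// to the first timestamp, which has been subtracted off and dropped.
fn query_intervals(ts: &[f64]) -> Vec<f64> {
    let mut prev = 0.0; // the dropped first timestamp is the zero reference
    ts.iter()
        .map(|&t| {
            let dt = t - prev;
            prev = t;
            dt
        })
        .collect()
}

fn main() {
    // Example: three timestamps were recorded; the fetched vector has two entries.
    let fetched = vec![0.0021, 0.0057];
    println!("{:?}", query_intervals(&fetched)); // roughly [0.0021, 0.0036]
}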


@@ -190,9 +190,15 @@ pub struct WorkgroupLimits {
     pub max_invocations: u32,
 }
 
+/// Options for creating a compute pass.
 #[derive(Default)]
 pub struct ComputePassDescriptor<'a> {
     // Maybe label should go here? It does in wgpu and wgpu_hal.
+    /// Timer query parameters.
+    ///
+    /// To record timer queries for a compute pass, set the query pool, start
+    /// query index, and end query index here. The indices must be less than
+    /// the size of the query pool.
     timer_queries: Option<(&'a QueryPool, u32, u32)>,
 }
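
The timer_queries field is private and the struct only derives Default, so callers presumably go through a constructor. A hedged guess at its shape, consistent with the ComputePassDescriptor::timer name assumed in the sketch near the top; the name, signature, and single-field construction are assumptions, not code from this commit:

impl<'a> ComputePassDescriptor<'a> {
    /// Hypothetical constructor: bracket the pass with timestamps written to
    /// queries start_query and end_query of pool.
    pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor<'a> {
        ComputePassDescriptor {
            // Assumes timer_queries is the only field at this point.
            timer_queries: Some((pool, start_query, end_query)),
        }
    }
}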


@@ -15,7 +15,7 @@ use smallvec::SmallVec;
 use crate::backend::Device as DeviceTrait;
 use crate::{
     BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
-    WorkgroupLimits,
+    WorkgroupLimits, ComputePassDescriptor,
 };
 
 pub struct VkInstance {
@@ -92,6 +92,7 @@ pub struct CmdBuf {
     cmd_buf: vk::CommandBuffer,
     cmd_pool: vk::CommandPool,
     device: Arc<RawDevice>,
+    end_query: Option<(vk::QueryPool, u32)>,
 }
 
 pub struct QueryPool {
@@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
                 cmd_buf,
                 cmd_pool,
                 device: self.device.clone(),
+                end_query: None,
             })
         }
     }
@@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
         // results (Windows 10, AMD 5700 XT).
         let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
         device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
-        let ts0 = buf[0];
         let tsp = self.timestamp_period as f64 * 1e-9;
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
         true
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Vk(pool) = pool {
+                self.write_timestamp_raw(pool.pool, *start);
+                self.end_query = Some((pool.pool, *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -931,6 +942,12 @@
         );
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((pool, end)) = self.end_query.take() {
+            self.write_timestamp_raw(pool, end);
+        }
+    }
+
     /// Insert a pipeline barrier for all memory accesses.
     unsafe fn memory_barrier(&mut self) {
         let device = &self.device.device;
@@ -995,13 +1012,13 @@
         );
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let device = &self.device.device;
         let size = size.unwrap_or(vk::WHOLE_SIZE);
         device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         let device = &self.device.device;
         let size = src.size.min(dst.size);
         device.cmd_copy_buffer(
@@ -1012,7 +1029,7 @@
         );
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         let device = &self.device.device;
         device.cmd_copy_image_to_buffer(
             self.cmd_buf,
@@ -1035,7 +1052,7 @@
         );
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         let device = &self.device.device;
         device.cmd_copy_buffer_to_image(
             self.cmd_buf,
@@ -1058,7 +1075,7 @@
         );
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         let device = &self.device.device;
         device.cmd_blit_image(
             self.cmd_buf,
@@ -1106,13 +1123,7 @@
     }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        let device = &self.device.device;
-        device.cmd_write_timestamp(
-            self.cmd_buf,
-            vk::PipelineStageFlags::COMPUTE_SHADER,
-            pool.pool,
-            query,
-        );
+        self.write_timestamp_raw(pool.pool, query);
     }
 
     unsafe fn begin_debug_label(&mut self, label: &str) {
@@ -1130,6 +1141,18 @@
     }
 }
 
+impl CmdBuf {
+    unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
+        let device = &self.device.device;
+        device.cmd_write_timestamp(
+            self.cmd_buf,
+            vk::PipelineStageFlags::COMPUTE_SHADER,
+            pool,
+            query,
+        );
+    }
+}
+
 impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.buffers.extend(buffers.iter().map(|b| b.buffer));