Merge pull request #164 from linebender/metal_timer

Timer queries on more platforms, particularly Metal
Chad Brokaw 2022-05-05 13:56:18 -04:00 committed by GitHub
commit 05624de19d
29 changed files with 781 additions and 208 deletions

Cargo.lock (generated)
View file

@ -921,6 +921,7 @@ dependencies = [
"block", "block",
"bytemuck", "bytemuck",
"cocoa-foundation", "cocoa-foundation",
"foreign-types",
"metal", "metal",
"objc", "objc",
"raw-window-handle", "raw-window-handle",

View file

@ -28,3 +28,4 @@ metal = "0.22"
objc = "0.2.5" objc = "0.2.5"
block = "0.1.6" block = "0.1.6"
cocoa-foundation = "0.1" cocoa-foundation = "0.1"
foreign-types = "0.3.2"

View file

@ -1,4 +1,4 @@
use piet_gpu_hal::{include_shader, BindType}; use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session}; use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
fn main() { fn main() {
@ -20,9 +20,9 @@ fn main() {
let mut cmd_buf = session.cmd_buf().unwrap(); let mut cmd_buf = session.cmd_buf().unwrap();
cmd_buf.begin(); cmd_buf.begin();
cmd_buf.reset_query_pool(&query_pool); cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0); let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1)); pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
cmd_buf.write_timestamp(&query_pool, 1); pass.end();
cmd_buf.finish_timestamps(&query_pool); cmd_buf.finish_timestamps(&query_pool);
cmd_buf.host_barrier(); cmd_buf.host_barrier();
cmd_buf.finish(); cmd_buf.finish();
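
The example above replaces the two explicit write_timestamp calls with a compute pass that carries its own timer queries. A minimal sketch of the new encoding sequence at the hub level, assuming the pipeline, descriptor set and query pool are created earlier in the example (the function name is illustrative):

    use piet_gpu_hal::{CmdBuf, ComputePassDescriptor, DescriptorSet, Pipeline, QueryPool};

    // Record one timed dispatch; queries 0 and 1 bracket the pass and must be
    // less than the size of the query pool.
    unsafe fn record_timed_dispatch(
        cmd_buf: &mut CmdBuf,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        query_pool: &QueryPool,
    ) {
        cmd_buf.begin();
        cmd_buf.reset_query_pool(query_pool);
        let mut pass =
            cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(query_pool, 0, 1));
        pass.dispatch(pipeline, descriptor_set, (256, 1, 1), (1, 1, 1));
        pass.end();
        cmd_buf.finish_timestamps(query_pool);
        cmd_buf.host_barrier();
        cmd_buf.finish();
    }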

View file

@ -17,7 +17,8 @@
//! The generic trait for backends to implement. //! The generic trait for backends to implement.
use crate::{ use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
MapMode, SamplerParams,
}; };
pub trait Device: Sized { pub trait Device: Sized {
@ -159,14 +160,32 @@ pub trait Device: Sized {
unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>; unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
} }
/// The trait implemented by backend command buffer implementations.
///
/// Valid encoding is represented by a state machine (currently not validated
/// but it is easy to imagine there might be at least debug validation). Most
/// methods are only valid in a particular state, and some move it to another
/// state.
pub trait CmdBuf<D: Device> { pub trait CmdBuf<D: Device> {
/// Begin encoding.
///
/// State: init -> ready
unsafe fn begin(&mut self); unsafe fn begin(&mut self);
/// State: ready -> finished
unsafe fn finish(&mut self); unsafe fn finish(&mut self);
/// Return true if the command buffer is suitable for reuse. /// Return true if the command buffer is suitable for reuse.
unsafe fn reset(&mut self) -> bool; unsafe fn reset(&mut self) -> bool;
/// Begin a compute pass.
///
/// State: ready -> in_compute_pass
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
/// Dispatch
///
/// State: in_compute_pass
unsafe fn dispatch( unsafe fn dispatch(
&mut self, &mut self,
pipeline: &D::Pipeline, pipeline: &D::Pipeline,
@ -175,6 +194,9 @@ pub trait CmdBuf<D: Device> {
workgroup_size: (u32, u32, u32), workgroup_size: (u32, u32, u32),
); );
/// State: in_compute_pass -> ready
unsafe fn end_compute_pass(&mut self);
/// Insert an execution and memory barrier. /// Insert an execution and memory barrier.
/// ///
/// Compute kernels (and other actions) after this barrier may read from buffers /// Compute kernels (and other actions) after this barrier may read from buffers
@ -202,16 +224,16 @@ pub trait CmdBuf<D: Device> {
/// This is readily supported in Vulkan, but for portability it is remarkably /// This is readily supported in Vulkan, but for portability it is remarkably
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
/// kernel, or organize the code not to need it. /// kernel, or organize the code not to need it.
unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>); unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer); unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer); unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image); unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
// low portability, dx12 doesn't support it natively // low portability, dx12 doesn't support it natively
unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image); unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
/// Reset the query pool. /// Reset the query pool.
/// ///
@ -227,7 +249,7 @@ pub trait CmdBuf<D: Device> {
unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {} unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
/// Begin a labeled section for debugging and profiling purposes. /// Begin a labeled section for debugging and profiling purposes.
unsafe fn begin_debug_label(&mut self, label: &str) {} unsafe fn begin_debug_label(&mut self, _label: &str) {}
/// End a section opened by `begin_debug_label`. /// End a section opened by `begin_debug_label`.
unsafe fn end_debug_label(&mut self) {} unsafe fn end_debug_label(&mut self) {}

View file

@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use smallvec::SmallVec; use smallvec::SmallVec;
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat}; use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
use self::{ use self::{
descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned}, descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@ -76,6 +76,7 @@ pub struct CmdBuf {
c: wrappers::GraphicsCommandList, c: wrappers::GraphicsCommandList,
allocator: CommandAllocator, allocator: CommandAllocator,
needs_reset: bool, needs_reset: bool,
end_query: Option<(wrappers::QueryHeap, u32)>,
} }
pub struct Pipeline { pub struct Pipeline {
@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
c, c,
allocator, allocator,
needs_reset: false, needs_reset: false,
end_query: None,
}) })
} }
} }
@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size); std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
let ts0 = buf[0];
let tsp = (self.ts_freq as f64).recip(); let tsp = (self.ts_freq as f64).recip();
let result = buf[1..] let result = buf
.iter() .iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) .map(|ts| *ts as f64 * tsp)
.collect(); .collect();
Ok(result) Ok(result)
} }
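
With this change the DX12 backend converts every raw tick to seconds using the timestamp frequency and no longer subtracts the first sample; the relative adjustment now happens once in the hub (Session::fetch_query_pool, later in this diff). A small sketch of the conversion with made-up numbers:

    fn main() {
        // Made-up values: a 10 MHz timestamp clock and two raw ticks.
        let ts_freq: u64 = 10_000_000;
        let tsp = (ts_freq as f64).recip();
        let raw_ticks: [u64; 2] = [5_000, 30_000];
        let seconds: Vec<f64> = raw_ticks.iter().map(|ts| *ts as f64 * tsp).collect();
        // Roughly [0.0005, 0.003]; the hub later subtracts the first entry.
        println!("{:?}", seconds);
    }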
@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok() self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
} }
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Dx12(pool) = pool {
self.write_timestamp(pool, *start);
self.end_query = Some((pool.heap.clone(), *end));
}
}
}
unsafe fn dispatch( unsafe fn dispatch(
&mut self, &mut self,
pipeline: &Pipeline, pipeline: &Pipeline,
@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
.dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2); .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
} }
unsafe fn end_compute_pass(&mut self) {
if let Some((heap, end)) = self.end_query.take() {
self.c.end_timing_query(&heap, end);
}
}
unsafe fn memory_barrier(&mut self) { unsafe fn memory_barrier(&mut self) {
// See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend. // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
// The "proper" way to do this would be to name the actual buffers participating // The "proper" way to do this would be to name the actual buffers participating
@ -666,7 +683,7 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.memory_barrier(); self.memory_barrier();
} }
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let cpu_ref = buffer.cpu_ref.as_ref().unwrap(); let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
let (gpu_ref, heap) = buffer let (gpu_ref, heap) = buffer
.gpu_ref .gpu_ref
@ -684,23 +701,23 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
); );
} }
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
// TODO: consider using copy_resource here (if sizes match) // TODO: consider using copy_resource here (if sizes match)
let size = src.size.min(dst.size); let size = src.size.min(dst.size);
self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size); self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
} }
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
self.c self.c
.copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1); .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
} }
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
self.c self.c
.copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1); .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
} }
unsafe fn blit_image(&self, src: &Image, dst: &Image) { unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
self.c.copy_resource(&src.resource, &dst.resource); self.c.copy_resource(&src.resource, &dst.resource);
} }

View file

@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
#[derive(Clone)] #[derive(Clone)]
pub struct ShaderByteCode { pub struct ShaderByteCode {
pub bytecode: d3d12::D3D12_SHADER_BYTECODE, pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
blob: Option<Blob>,
} }
#[derive(Clone)] #[derive(Clone)]
@ -741,7 +740,6 @@ impl ShaderByteCode {
BytecodeLength: blob.0.GetBufferSize(), BytecodeLength: blob.0.GetBufferSize(),
pShaderBytecode: blob.0.GetBufferPointer(), pShaderBytecode: blob.0.GetBufferPointer(),
}, },
blob: Some(blob),
} }
} }
@ -810,7 +808,6 @@ impl ShaderByteCode {
BytecodeLength: bytecode.len(), BytecodeLength: bytecode.len(),
pShaderBytecode: bytecode.as_ptr() as *const _, pShaderBytecode: bytecode.as_ptr() as *const _,
}, },
blob: None,
} }
} }
} }

View file

@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak};
use bytemuck::Pod; use bytemuck::Pod;
use smallvec::SmallVec; use smallvec::SmallVec;
use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode}; use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams}; use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> {
size: u64, size: u64,
} }
/// A sub-object of a command buffer for a sequence of compute dispatches.
pub struct ComputePass<'a> {
cmd_buf: &'a mut CmdBuf,
}
impl Session { impl Session {
/// Create a new session, choosing the best backend. /// Create a new session, choosing the best backend.
pub fn new(device: mux::Device) -> Session { pub fn new(device: mux::Device) -> Session {
@ -370,8 +375,17 @@ impl Session {
/// ///
/// This should be called after waiting on the command buffer that wrote the /// This should be called after waiting on the command buffer that wrote the
/// timer queries. /// timer queries.
///
/// The returned vector is one shorter than the number of timer queries in the
/// pool; the first value is subtracted off. It would likely be better to return
/// the raw timestamps, but that change should be made consistently.
pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> { pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
self.0.device.fetch_query_pool(pool) let result = self.0.device.fetch_query_pool(pool)?;
// Subtract off first timestamp.
Ok(result[1..]
.iter()
.map(|ts| *ts as f64 - result[0])
.collect())
} }
#[doc(hidden)] #[doc(hidden)]
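
Concretely, for a pass recorded with ComputePassDescriptor::timer(&pool, 0, 1) against a two-entry pool, the vector returned here has a single element: the elapsed time of the pass in seconds. A hedged sketch of reading it back after the submitted command buffer has been waited on (the function name is illustrative):

    use piet_gpu_hal::{Error, QueryPool, Session};

    unsafe fn report_pass_time(session: &Session, query_pool: &QueryPool) -> Result<(), Error> {
        // One element shorter than the pool; each entry is relative to query 0.
        let ts = session.fetch_query_pool(query_pool)?;
        if let Some(elapsed) = ts.first() {
            println!("pass took {:.3} ms", elapsed * 1e3);
        }
        Ok(())
    }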
@ -471,23 +485,10 @@ impl CmdBuf {
self.cmd_buf().finish(); self.cmd_buf().finish();
} }
/// Dispatch a compute shader. /// Begin a compute pass.
/// pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
/// Request a compute shader to be run, using the pipeline to specify the self.cmd_buf().begin_compute_pass(desc);
/// code, and the descriptor set to address the resources read and written. ComputePass { cmd_buf: self }
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
} }
/// Insert an execution and memory barrier. /// Insert an execution and memory barrier.
@ -582,13 +583,6 @@ impl CmdBuf {
self.cmd_buf().reset_query_pool(pool); self.cmd_buf().reset_query_pool(pool);
} }
/// Write a timestamp.
///
/// The query index must be less than the size of the query pool on creation.
pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
self.cmd_buf().write_timestamp(pool, query);
}
/// Prepare the timestamps for reading. This isn't required on Vulkan but /// Prepare the timestamps for reading. This isn't required on Vulkan but
/// is required on (at least) DX12. /// is required on (at least) DX12.
/// ///
@ -692,6 +686,51 @@ impl Drop for SubmittedCmdBuf {
} }
} }
impl<'a> ComputePass<'a> {
/// Dispatch a compute shader.
///
/// Request a compute shader to be run, using the pipeline to specify the
/// code, and the descriptor set to address the resources read and written.
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf
.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
}
/// Add a memory barrier.
///
/// Inserts a memory barrier in the compute encoder. This is a convenience
/// function for calling the same function on the underlying command buffer,
/// avoiding borrow check issues.
pub unsafe fn memory_barrier(&mut self) {
self.cmd_buf.memory_barrier();
}
/// Begin a labeled section for debugging and profiling purposes.
pub unsafe fn begin_debug_label(&mut self, label: &str) {
self.cmd_buf.begin_debug_label(label);
}
/// End a section opened by `begin_debug_label`.
pub unsafe fn end_debug_label(&mut self) {
self.cmd_buf.end_debug_label();
}
pub unsafe fn end(self) {
self.cmd_buf.cmd_buf().end_compute_pass();
}
}
impl Drop for BufferInner { impl Drop for BufferInner {
fn drop(&mut self) { fn drop(&mut self) {
if let Some(session) = Weak::upgrade(&self.session) { if let Some(session) = Weak::upgrade(&self.session) {
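
Because ComputePass holds a mutable borrow of the command buffer, other CmdBuf methods cannot be called while a pass is open; pass.end() gives the command buffer back. The memory_barrier and debug-label forwarders above exist so those common calls stay available inside a pass. A sketch of the intended shape, assuming the pipeline and descriptor set come from elsewhere and the workgroup counts are arbitrary:

    use piet_gpu_hal::{CmdBuf, ComputePassDescriptor, DescriptorSet, Pipeline, QueryPool};

    unsafe fn two_timed_dispatches(
        cmd_buf: &mut CmdBuf,
        pipeline: &Pipeline,
        ds: &DescriptorSet,
        pool: &QueryPool,
    ) {
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(pool, 0, 1));
        pass.dispatch(pipeline, ds, (64, 1, 1), (256, 1, 1));
        // Barrier between dependent dispatches without leaving the pass.
        pass.memory_barrier();
        pass.dispatch(pipeline, ds, (64, 1, 1), (256, 1, 1));
        pass.end();
        // The command buffer is usable again from here on.
        cmd_buf.memory_barrier();
    }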

View file

@ -21,8 +21,8 @@ pub use crate::mux::{
}; };
pub use bufwrite::BufWrite; pub use bufwrite::BufWrite;
pub use hub::{ pub use hub::{
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource, BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
Session, SubmittedCmdBuf, RetainResource, Session, SubmittedCmdBuf,
}; };
// TODO: because these are conditionally included, "cargo fmt" does not // TODO: because these are conditionally included, "cargo fmt" does not
@ -189,3 +189,23 @@ pub struct WorkgroupLimits {
/// dimension. /// dimension.
pub max_invocations: u32, pub max_invocations: u32,
} }
/// Options for creating a compute pass.
#[derive(Default)]
pub struct ComputePassDescriptor<'a> {
// Maybe label should go here? It does in wgpu and wgpu_hal.
/// Timer query parameters.
///
/// To record timer queries for a compute pass, set the query pool, start
/// query index, and end query index here. The indices must be less than
/// the size of the query pool.
timer_queries: Option<(&'a QueryPool, u32, u32)>,
}
impl<'a> ComputePassDescriptor<'a> {
pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
ComputePassDescriptor {
timer_queries: Some((pool, start_query, end_query)),
}
}
}
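
For illustration, a pass with no timer queries just uses the default descriptor, while the timer constructor fills in the pool and the start/end indices (the indices below are arbitrary):

    use piet_gpu_hal::{ComputePassDescriptor, QueryPool};

    fn descriptors(pool: &QueryPool) -> (ComputePassDescriptor, ComputePassDescriptor) {
        // An untimed pass, and a pass whose start/end land in queries 2 and 3.
        (
            ComputePassDescriptor::default(),
            ComputePassDescriptor::timer(pool, 2, 3),
        )
    }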

View file

@ -15,25 +15,32 @@
// Also licensed under MIT license, at your choice. // Also licensed under MIT license, at your choice.
mod clear; mod clear;
mod timer;
mod util; mod util;
use std::mem; use std::mem;
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use block::Block;
use cocoa_foundation::base::id; use cocoa_foundation::base::id;
use cocoa_foundation::foundation::{NSInteger, NSUInteger}; use cocoa_foundation::foundation::{NSInteger, NSUInteger};
use foreign_types::ForeignType;
use objc::rc::autoreleasepool; use objc::rc::autoreleasepool;
use objc::runtime::{Object, BOOL, YES}; use objc::runtime::{Object, BOOL, YES};
use objc::{class, msg_send, sel, sel_impl}; use objc::{class, msg_send, sel, sel_impl};
use metal::{CGFloat, MTLFeatureSet}; use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle}; use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits}; use crate::{
BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
};
use util::*; use util::*;
use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
pub struct MtlInstance; pub struct MtlInstance;
pub struct MtlDevice { pub struct MtlDevice {
@ -41,6 +48,18 @@ pub struct MtlDevice {
cmd_queue: Arc<Mutex<metal::CommandQueue>>, cmd_queue: Arc<Mutex<metal::CommandQueue>>,
gpu_info: GpuInfo, gpu_info: GpuInfo,
helpers: Arc<Helpers>, helpers: Arc<Helpers>,
timer_set: Option<CounterSet>,
counter_style: CounterStyle,
}
/// Type of counter sampling.
///
/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum CounterStyle {
None,
Stage,
Command,
} }
pub struct MtlSurface { pub struct MtlSurface {
@ -81,9 +100,22 @@ pub struct Semaphore;
pub struct CmdBuf { pub struct CmdBuf {
cmd_buf: metal::CommandBuffer, cmd_buf: metal::CommandBuffer,
helpers: Arc<Helpers>, helpers: Arc<Helpers>,
cur_encoder: Encoder,
time_calibration: Arc<Mutex<TimeCalibration>>,
counter_style: CounterStyle,
} }
pub struct QueryPool; enum Encoder {
None,
Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
Blit(metal::BlitCommandEncoder),
}
#[derive(Default)]
pub struct QueryPool {
counter_sample_buf: Option<CounterSampleBuffer>,
calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
}
pub struct Pipeline(metal::ComputePipelineState); pub struct Pipeline(metal::ComputePipelineState);
@ -209,18 +241,43 @@ impl MtlDevice {
let helpers = Arc::new(Helpers { let helpers = Arc::new(Helpers {
clear_pipeline: clear::make_clear_pipeline(&device), clear_pipeline: clear::make_clear_pipeline(&device),
}); });
// Timer stuff
let timer_set = CounterSet::get_timer_counter_set(&device);
let counter_style = if timer_set.is_some() {
if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
CounterStyle::Stage
} else if device
.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
{
CounterStyle::Command
} else {
CounterStyle::None
}
} else {
CounterStyle::None
};
MtlDevice { MtlDevice {
device, device,
cmd_queue: Arc::new(Mutex::new(cmd_queue)), cmd_queue: Arc::new(Mutex::new(cmd_queue)),
gpu_info, gpu_info,
helpers, helpers,
timer_set,
counter_style,
} }
} }
pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf { pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
let cmd_buf = raw_cmd_buf; let cmd_buf = raw_cmd_buf;
let helpers = self.helpers.clone(); let helpers = self.helpers.clone();
CmdBuf { cmd_buf, helpers } let cur_encoder = Encoder::None;
let time_calibration = Default::default();
CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
}
} }
pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image { pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@ -330,11 +387,35 @@ impl crate::backend::Device for MtlDevice {
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> { fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
let cmd_queue = self.cmd_queue.lock().unwrap(); let cmd_queue = self.cmd_queue.lock().unwrap();
// A discussion about autorelease pools.
//
// Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
// you can have any two of correctness, ergonomics, and performance. Here we've
// chosen the first two, using the pattern of a fine grained autorelease pool
// to give the Obj-C object Rust-like lifetime semantics whenever objects are
// created as autorelease (by convention, this is any object creation with an
// Obj-C method name that doesn't begin with "new" or "alloc").
//
// To gain back some of the performance, we'd need a way to wrap an autorelease
// pool over a chunk of work - that could be one frame of rendering, but for
// tests that iterate a number of command buffer submissions, it would need to
// be around that. On non-mac platforms, it would be a no-op.
//
// In any case, this way, the caller doesn't need to worry, and the performance
// hit might not be so bad (perhaps we should measure).
// consider new_command_buffer_with_unretained_references for performance // consider new_command_buffer_with_unretained_references for performance
let cmd_buf = cmd_queue.new_command_buffer(); let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
let helpers = self.helpers.clone(); let helpers = self.helpers.clone();
Ok(CmdBuf { cmd_buf, helpers }) let cur_encoder = Encoder::None;
let time_calibration = Default::default();
Ok(CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
})
} }
unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> { unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
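
The pattern the comment describes is the single line that creates the command buffer: the autoreleased Obj-C object is promoted to a retained, owned object inside a fine-grained pool, so its lifetime is no longer tied to whatever pool happens to be on the stack. The same pattern in isolation, as a sketch (assuming a metal::CommandQueue is at hand):

    use objc::rc::autoreleasepool;

    fn owned_cmd_buf(cmd_queue: &metal::CommandQueue) -> metal::CommandBuffer {
        // new_command_buffer returns an autoreleased reference; to_owned retains it
        // before the surrounding pool drains.
        autoreleasepool(|| cmd_queue.new_command_buffer().to_owned())
    }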
@ -342,12 +423,31 @@ impl crate::backend::Device for MtlDevice {
} }
fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> { fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
// TODO if let Some(timer_set) = &self.timer_set {
Ok(QueryPool) let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
.ok_or("error creating timer query pool")?;
return Ok(QueryPool {
counter_sample_buf: Some(pool),
calibration: Default::default(),
});
}
Ok(QueryPool::default())
} }
unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> { unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
// TODO if let Some(raw) = &pool.counter_sample_buf {
let resolved = raw.resolve();
let calibration = pool.calibration.lock().unwrap();
if let Some(calibration) = &*calibration {
let calibration = calibration.lock().unwrap();
let result = resolved
.iter()
.map(|time_ns| calibration.correlate(*time_ns))
.collect();
return Ok(result);
}
}
// Maybe should return None indicating it wasn't successful? But that might break.
Ok(Vec::new()) Ok(Vec::new())
} }
@ -358,7 +458,37 @@ impl crate::backend::Device for MtlDevice {
_signal_semaphores: &[&Self::Semaphore], _signal_semaphores: &[&Self::Semaphore],
fence: Option<&mut Self::Fence>, fence: Option<&mut Self::Fence>,
) -> Result<(), Error> { ) -> Result<(), Error> {
unsafe fn add_scheduled_handler(
cmd_buf: &metal::CommandBufferRef,
block: &Block<(&CommandBufferRef,), ()>,
) {
msg_send![cmd_buf, addScheduledHandler: block]
}
for cmd_buf in cmd_bufs { for cmd_buf in cmd_bufs {
let time_calibration = cmd_buf.time_calibration.clone();
let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
// TODO: only do this if supported.
let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
let time_calibration = cmd_buf.time_calibration.clone();
let completed_block =
block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
// TODO: only do this if supported.
let () =
msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
cmd_buf.cmd_buf.add_completed_handler(&completed_block);
cmd_buf.cmd_buf.commit(); cmd_buf.cmd_buf.commit();
} }
if let Some(last_cmd_buf) = cmd_bufs.last() { if let Some(last_cmd_buf) = cmd_bufs.last() {
@ -439,12 +569,70 @@ impl crate::backend::Device for MtlDevice {
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf { impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
unsafe fn begin(&mut self) {} unsafe fn begin(&mut self) {}
unsafe fn finish(&mut self) {} unsafe fn finish(&mut self) {
self.flush_encoder();
}
unsafe fn reset(&mut self) -> bool { unsafe fn reset(&mut self) -> bool {
false false
} }
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
// TODO: we might want to get better about validation but the following
// assert is likely to trigger, and also a case can be made that
// validation should be done at the hub level, for consistency.
//debug_assert!(matches!(self.cur_encoder, Encoder::None));
self.flush_encoder();
autoreleasepool(|| {
let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
(Some(queries), CounterStyle::Stage) => {
let descriptor: id =
msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
let attachments: id = msg_send![descriptor, sampleBufferAttachments];
let index: NSUInteger = 0;
let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
// Here we break the hub/mux separation a bit, for expedience
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
}
}
let start_index = queries.1 as NSUInteger;
let end_index = queries.2 as NSInteger;
let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
(
msg_send![
self.cmd_buf,
computeCommandEncoderWithDescriptor: descriptor
],
None,
)
}
(Some(queries), CounterStyle::Command) => {
let encoder = self.cmd_buf.new_compute_command_encoder();
#[allow(irrefutable_let_patterns)]
let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let sample_index = queries.1 as NSUInteger;
let sample_buf = sample_buf.id();
let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
Some((sample_buf, queries.2))
} else {
None
}
} else {
None
};
(encoder, end_query)
}
_ => (self.cmd_buf.new_compute_command_encoder(), None),
};
self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
});
}
unsafe fn dispatch( unsafe fn dispatch(
&mut self, &mut self,
pipeline: &Pipeline, pipeline: &Pipeline,
@ -452,7 +640,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
workgroup_count: (u32, u32, u32), workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32), workgroup_size: (u32, u32, u32),
) { ) {
let encoder = self.cmd_buf.new_compute_command_encoder(); let encoder = self.compute_command_encoder();
encoder.set_compute_pipeline_state(&pipeline.0); encoder.set_compute_pipeline_state(&pipeline.0);
let mut buf_ix = 0; let mut buf_ix = 0;
for buffer in &descriptor_set.buffers { for buffer in &descriptor_set.buffers {
@ -475,7 +663,11 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
depth: workgroup_size.2 as u64, depth: workgroup_size.2 as u64,
}; };
encoder.dispatch_thread_groups(workgroup_count, workgroup_size); encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
encoder.end_encoding(); }
unsafe fn end_compute_pass(&mut self) {
// TODO: might validate that we are in a compute encoder state
self.flush_encoder();
} }
unsafe fn memory_barrier(&mut self) { unsafe fn memory_barrier(&mut self) {
@ -494,22 +686,23 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
// I think these are being tracked. // I think these are being tracked.
} }
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let size = size.unwrap_or(buffer.size); let size = size.unwrap_or(buffer.size);
let encoder = self.cmd_buf.new_compute_command_encoder(); let _ = self.compute_command_encoder();
clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size); // Getting this directly is a workaround for a borrow checker issue.
encoder.end_encoding() if let Encoder::Compute(e, _) = &self.cur_encoder {
clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
}
} }
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
let size = src.size.min(dst.size); let size = src.size.min(dst.size);
encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size); encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
encoder.end_encoding();
} }
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4); assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
let bytes_per_row = (src.width * 4) as NSUInteger; let bytes_per_row = (src.width * 4) as NSUInteger;
let src_size = metal::MTLSize { let src_size = metal::MTLSize {
@ -530,11 +723,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
bytes_per_row * src.height as NSUInteger, bytes_per_row * src.height as NSUInteger,
metal::MTLBlitOption::empty(), metal::MTLBlitOption::empty(),
); );
encoder.end_encoding();
} }
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4); assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
let bytes_per_row = (dst.width * 4) as NSUInteger; let bytes_per_row = (dst.width * 4) as NSUInteger;
let src_size = metal::MTLSize { let src_size = metal::MTLSize {
@ -555,11 +747,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
origin, origin,
metal::MTLBlitOption::empty(), metal::MTLBlitOption::empty(),
); );
encoder.end_encoding();
} }
unsafe fn blit_image(&self, src: &Image, dst: &Image) { unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
let src_size = metal::MTLSize { let src_size = metal::MTLSize {
width: src.width.min(dst.width) as NSUInteger, width: src.width.min(dst.width) as NSUInteger,
height: src.width.min(dst.height) as NSUInteger, height: src.width.min(dst.height) as NSUInteger,
@ -577,15 +768,79 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
0, 0,
origin, origin,
); );
encoder.end_encoding();
} }
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {} unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
let mut calibration = pool.calibration.lock().unwrap();
*calibration = Some(self.time_calibration.clone());
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
// TODO if let Some(buf) = &pool.counter_sample_buf {
// This really a PITA because it's pretty different than Vulkan. if matches!(self.cur_encoder, Encoder::None) {
// See https://developer.apple.com/documentation/metal/counter_sampling self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
let sample_index = query as NSUInteger;
if self.counter_style == CounterStyle::Command {
match &self.cur_encoder {
Encoder::Compute(e, _) => {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
}
Encoder::None => unreachable!(),
_ => todo!(),
}
} else if self.counter_style == CounterStyle::Stage {
match &self.cur_encoder {
Encoder::Compute(_e, _) => {
println!("write_timestamp is not supported for stage-style encoders");
}
_ => (),
}
}
}
}
}
impl CmdBuf {
fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Compute(..)) {
self.flush_encoder();
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
if let Encoder::Compute(e, _) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Blit(_)) {
self.flush_encoder();
self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
}
if let Encoder::Blit(e) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn flush_encoder(&mut self) {
match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
Encoder::Compute(e, Some((sample_buf, end_query))) => {
let sample_index = end_query as NSUInteger;
unsafe {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
}
e.end_encoding();
}
Encoder::Compute(e, None) => e.end_encoding(),
Encoder::Blit(e) => e.end_encoding(),
Encoder::None => (),
}
} }
} }

View file

@ -0,0 +1,172 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Support for timer queries.
//!
//! Likely some of this should be upstreamed into metal-rs.
use std::{ffi::CStr, ptr::null_mut};
use cocoa_foundation::{
base::id,
foundation::{NSRange, NSUInteger},
};
use metal::{DeviceRef, MTLStorageMode};
use objc::{class, msg_send, sel, sel_impl};
pub struct CounterSampleBuffer {
id: id,
count: u64,
}
pub struct CounterSet {
id: id,
}
#[derive(Default)]
pub struct TimeCalibration {
pub cpu_start_ts: u64,
pub gpu_start_ts: u64,
pub cpu_end_ts: u64,
pub gpu_end_ts: u64,
}
impl Drop for CounterSampleBuffer {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl Clone for CounterSampleBuffer {
fn clone(&self) -> CounterSampleBuffer {
unsafe {
CounterSampleBuffer {
id: msg_send![self.id, retain],
count: self.count,
}
}
}
}
impl CounterSampleBuffer {
pub fn id(&self) -> id {
self.id
}
}
impl Drop for CounterSet {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl CounterSet {
pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
unsafe {
// TODO: version check
let sets: id = msg_send!(device, counterSets);
let count: NSUInteger = msg_send![sets, count];
for i in 0..count {
let set: id = msg_send![sets, objectAtIndex: i];
let name: id = msg_send![set, name];
let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
if name_cstr.to_bytes() == b"timestamp" {
return Some(CounterSet { id: set });
}
}
None
}
}
}
// copied from metal-rs; should be in common utilities maybe?
fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
let bytes = unsafe {
let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
bytes as *const u8
};
let len: NSUInteger = unsafe { msg_send![nsstr, length] };
unsafe {
let bytes = std::slice::from_raw_parts(bytes, len as usize);
std::str::from_utf8(bytes).unwrap()
}
}
impl CounterSampleBuffer {
pub fn new(
device: &DeviceRef,
count: u64,
counter_set: &CounterSet,
) -> Option<CounterSampleBuffer> {
unsafe {
let desc_cls = class!(MTLCounterSampleBufferDescriptor);
let descriptor: id = msg_send![desc_cls, alloc];
let _: id = msg_send![descriptor, init];
let count = count as NSUInteger;
let () = msg_send![descriptor, setSampleCount: count];
let () = msg_send![descriptor, setCounterSet: counter_set.id];
let () = msg_send![
descriptor,
setStorageMode: MTLStorageMode::Shared as NSUInteger
];
let mut error: id = null_mut();
let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
let () = msg_send![descriptor, release];
if !error.is_null() {
let description = msg_send![error, localizedDescription];
println!(
"error allocating sample buffer, code = {}",
nsstring_as_str(description)
);
let () = msg_send![error, release];
return None;
}
Some(CounterSampleBuffer { id: buf, count })
}
}
// Read the timestamps.
//
// Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
pub unsafe fn resolve(&self) -> &[u64] {
let range = NSRange::new(0, self.count);
let data: id = msg_send![self.id, resolveCounterRange: range];
if data.is_null() {
&[]
} else {
let bytes: *const u64 = msg_send![data, bytes];
std::slice::from_raw_parts(bytes, self.count as usize)
}
}
}
impl TimeCalibration {
/// Convert GPU timestamp into CPU time base.
///
/// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
pub fn correlate(&self, raw_ts: u64) -> f64 {
let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
let adj_ts = if delta_gpu > 0 {
let scale = delta_cpu as f64 / delta_gpu as f64;
self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
} else {
// Default is ns on Apple Silicon; on other hardware this will be wrong
raw_ts as f64
};
adj_ts * 1e-9
}
}
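
correlate maps a raw GPU timestamp onto the CPU timeline by linear interpolation between the two sample pairs taken when the command buffer is scheduled and when it completes, then scales nanoseconds to seconds. The arithmetic with made-up calibration values:

    fn main() {
        // Made-up samples purely for illustration.
        let (cpu_start_ts, cpu_end_ts) = (1_000_000_000u64, 1_010_000_000u64); // ns, 10 ms apart
        let (gpu_start_ts, gpu_end_ts) = (0u64, 1_000_000u64); // GPU ticks over the same interval
        let raw_ts = 500_000u64; // a GPU timestamp halfway through the interval
        let scale = (cpu_end_ts - cpu_start_ts) as f64 / (gpu_end_ts - gpu_start_ts) as f64;
        let adj_ts = cpu_start_ts as f64 + (raw_ts - gpu_start_ts) as f64 * scale;
        println!("{} s on the CPU time base", adj_ts * 1e-9); // prints 1.005
    }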

View file

@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::backend::Device as DeviceTrait; use crate::backend::Device as DeviceTrait;
use crate::BackendType; use crate::BackendType;
use crate::BindType; use crate::BindType;
use crate::ComputePassDescriptor;
use crate::ImageFormat; use crate::ImageFormat;
use crate::MapMode; use crate::MapMode;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags}; use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@ -658,6 +659,14 @@ impl CmdBuf {
} }
} }
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
mux_match! { self;
CmdBuf::Vk(c) => c.begin_compute_pass(desc),
CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
}
}
/// Dispatch a compute shader. /// Dispatch a compute shader.
/// ///
/// Note that both the number of workgroups (`workgroup_count`) and the number of /// Note that both the number of workgroups (`workgroup_count`) and the number of
@ -680,6 +689,14 @@ impl CmdBuf {
} }
} }
pub unsafe fn end_compute_pass(&mut self) {
mux_match! { self;
CmdBuf::Vk(c) => c.end_compute_pass(),
CmdBuf::Dx12(c) => c.end_compute_pass(),
CmdBuf::Mtl(c) => c.end_compute_pass(),
}
}
pub unsafe fn memory_barrier(&mut self) { pub unsafe fn memory_barrier(&mut self) {
mux_match! { self; mux_match! { self;
CmdBuf::Vk(c) => c.memory_barrier(), CmdBuf::Vk(c) => c.memory_barrier(),

View file

@ -15,7 +15,7 @@ use smallvec::SmallVec;
use crate::backend::Device as DeviceTrait; use crate::backend::Device as DeviceTrait;
use crate::{ use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize, BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
WorkgroupLimits, WorkgroupLimits, ComputePassDescriptor,
}; };
pub struct VkInstance { pub struct VkInstance {
@ -92,6 +92,7 @@ pub struct CmdBuf {
cmd_buf: vk::CommandBuffer, cmd_buf: vk::CommandBuffer,
cmd_pool: vk::CommandPool, cmd_pool: vk::CommandPool,
device: Arc<RawDevice>, device: Arc<RawDevice>,
end_query: Option<(vk::QueryPool, u32)>,
} }
pub struct QueryPool { pub struct QueryPool {
@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
cmd_buf, cmd_buf,
cmd_pool, cmd_pool,
device: self.device.clone(), device: self.device.clone(),
end_query: None,
}) })
} }
} }
@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
// results (Windows 10, AMD 5700 XT). // results (Windows 10, AMD 5700 XT).
let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT; let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?; device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
let ts0 = buf[0];
let tsp = self.timestamp_period as f64 * 1e-9; let tsp = self.timestamp_period as f64 * 1e-9;
let result = buf[1..] let result = buf
.iter() .iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) .map(|ts| *ts as f64 * tsp)
.collect(); .collect();
Ok(result) Ok(result)
} }
@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
true true
} }
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Vk(pool) = pool {
self.write_timestamp_raw(pool.pool, *start);
self.end_query = Some((pool.pool, *end));
}
}
}
unsafe fn dispatch( unsafe fn dispatch(
&mut self, &mut self,
pipeline: &Pipeline, pipeline: &Pipeline,
@ -931,6 +942,12 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn end_compute_pass(&mut self) {
if let Some((pool, end)) = self.end_query.take() {
self.write_timestamp_raw(pool, end);
}
}
/// Insert a pipeline barrier for all memory accesses. /// Insert a pipeline barrier for all memory accesses.
unsafe fn memory_barrier(&mut self) { unsafe fn memory_barrier(&mut self) {
let device = &self.device.device; let device = &self.device.device;
@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device; let device = &self.device.device;
let size = size.unwrap_or(vk::WHOLE_SIZE); let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0); device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
} }
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let device = &self.device.device; let device = &self.device.device;
let size = src.size.min(dst.size); let size = src.size.min(dst.size);
device.cmd_copy_buffer( device.cmd_copy_buffer(
@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let device = &self.device.device; let device = &self.device.device;
device.cmd_copy_image_to_buffer( device.cmd_copy_image_to_buffer(
self.cmd_buf, self.cmd_buf,
@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let device = &self.device.device; let device = &self.device.device;
device.cmd_copy_buffer_to_image( device.cmd_copy_buffer_to_image(
self.cmd_buf, self.cmd_buf,
@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn blit_image(&self, src: &Image, dst: &Image) { unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let device = &self.device.device; let device = &self.device.device;
device.cmd_blit_image( device.cmd_blit_image(
self.cmd_buf, self.cmd_buf,
@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
} }
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
let device = &self.device.device; self.write_timestamp_raw(pool.pool, query);
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool.pool,
query,
);
} }
unsafe fn begin_debug_label(&mut self, label: &str) { unsafe fn begin_debug_label(&mut self, label: &str) {
@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
} }
} }
impl CmdBuf {
unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool,
query,
);
}
}
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder { impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) { fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.buffers.extend(buffers.iter().map(|b| b.buffer)); self.buffers.extend(buffers.iter().map(|b| b.buffer));

View file

@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
.map(|_| session.create_semaphore()) .map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?; .collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES) let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(8)) .map(|_| session.create_query_pool(12))
.collect::<Result<Vec<_>, Error>>()?; .collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default(); let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default(); let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@ -112,22 +112,23 @@ fn main() -> Result<(), Error> {
if !ts.is_empty() { if !ts.is_empty() {
info_string = format!( info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms", "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[6] * 1e3, ts[10] * 1e3,
ts[0] * 1e3, ts[0] * 1e3,
(ts[1] - ts[0]) * 1e3, (ts[1] - ts[0]) * 1e3,
(ts[2] - ts[1]) * 1e3, (ts[2] - ts[1]) * 1e3,
(ts[3] - ts[2]) * 1e3,
(ts[4] - ts[3]) * 1e3, (ts[4] - ts[3]) * 1e3,
(ts[5] - ts[4]) * 1e3,
(ts[6] - ts[5]) * 1e3, (ts[6] - ts[5]) * 1e3,
(ts[8] - ts[7]) * 1e3,
(ts[10] - ts[9]) * 1e3,
); );
} }
} }
let mut ctx = PietGpuRenderContext::new(); let mut ctx = PietGpuRenderContext::new();
let test_blend = false;
if let Some(svg) = &svg { if let Some(svg) = &svg {
test_scenes::render_svg(&mut ctx, svg); test_scenes::render_svg(&mut ctx, svg);
} else { } else if test_blend {
use piet_gpu::{Blend, BlendMode::*, CompositionMode::*}; use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
let blends = [ let blends = [
Blend::new(Normal, SrcOver), Blend::new(Normal, SrcOver),
@ -163,6 +164,8 @@ fn main() -> Result<(), Error> {
let blend = blends[mode % blends.len()]; let blend = blends[mode % blends.len()];
test_scenes::render_blend_test(&mut ctx, current_frame, blend); test_scenes::render_blend_test(&mut ctx, current_frame, blend);
info_string = format!("{:?}", blend); info_string = format!("{:?}", blend);
} else {
test_scenes::render_anim_frame(&mut ctx, current_frame);
} }
render_info_string(&mut ctx, &info_string); render_info_string(&mut ctx, &info_string);
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) { if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {

View file

@ -18,8 +18,8 @@ use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext}; use piet::{ImageFormat, RenderContext};
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
ImageLayout, Pipeline, QueryPool, Session, Error, Image, ImageLayout, Pipeline, QueryPool, Session,
}; };
pub use pico_svg::PicoSvg; pub use pico_svg::PicoSvg;
@ -424,10 +424,10 @@ impl Renderer {
cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients); cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General); cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
cmd_buf.reset_query_pool(&query_pool); cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.begin_debug_label("Element bounding box calculation"); cmd_buf.begin_debug_label("Element bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
self.element_stage.record( self.element_stage.record(
cmd_buf, &mut pass,
&self.element_code, &self.element_code,
&self.element_bindings[buf_ix], &self.element_bindings[buf_ix],
self.n_transform as u64, self.n_transform as u64,
@ -435,56 +435,59 @@ impl Renderer {
self.n_pathtag as u32, self.n_pathtag as u32,
self.n_drawobj as u64, self.n_drawobj as u64,
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Clip bounding box calculation"); let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
pass.begin_debug_label("Clip bounding box calculation");
self.clip_binding self.clip_binding
.record(cmd_buf, &self.clip_code, self.n_clip as u32); .record(&mut pass, &self.clip_code, self.n_clip as u32);
cmd_buf.end_debug_label(); pass.end_debug_label();
cmd_buf.begin_debug_label("Element binning"); pass.begin_debug_label("Element binning");
cmd_buf.dispatch( pass.dispatch(
&self.bin_pipeline, &self.bin_pipeline,
&self.bin_ds, &self.bin_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1), (((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1), (256, 1, 1),
); );
cmd_buf.end_debug_label(); pass.end_debug_label();
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.begin_debug_label("Tile allocation"); pass.begin_debug_label("Tile allocation");
cmd_buf.dispatch( pass.dispatch(
&self.tile_pipeline, &self.tile_pipeline,
&self.tile_ds[buf_ix], &self.tile_ds[buf_ix],
(((self.n_paths + 255) / 256) as u32, 1, 1), (((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1), (256, 1, 1),
); );
cmd_buf.end_debug_label(); pass.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 2); pass.end();
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Path flattening"); cmd_buf.begin_debug_label("Path flattening");
cmd_buf.dispatch( cmd_buf.memory_barrier();
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
pass.dispatch(
&self.path_pipeline, &self.path_pipeline,
&self.path_ds, &self.path_ds,
(((self.n_pathseg + 31) / 32) as u32, 1, 1), (((self.n_pathseg + 31) / 32) as u32, 1, 1),
(32, 1, 1), (32, 1, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Backdrop propagation"); cmd_buf.begin_debug_label("Backdrop propagation");
cmd_buf.dispatch( let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
pass.dispatch(
&self.backdrop_pipeline, &self.backdrop_pipeline,
&self.backdrop_ds, &self.backdrop_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1), (((self.n_paths + 255) / 256) as u32, 1, 1),
(256, self.backdrop_y, 1), (256, self.backdrop_y, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 4);
// TODO: redo query accounting // TODO: redo query accounting
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Coarse raster"); cmd_buf.begin_debug_label("Coarse raster");
cmd_buf.dispatch( let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
pass.dispatch(
&self.coarse_pipeline, &self.coarse_pipeline,
&self.coarse_ds[buf_ix], &self.coarse_ds[buf_ix],
( (
@ -494,11 +497,13 @@ impl Renderer {
), ),
(256, 1, 1), (256, 1, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Fine raster"); cmd_buf.begin_debug_label("Fine raster");
cmd_buf.dispatch( let mut pass =
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
pass.dispatch(
&self.k4_pipeline, &self.k4_pipeline,
&self.k4_ds, &self.k4_ds,
( (
@ -508,8 +513,8 @@ impl Renderer {
), ),
(8, 4, 1), (8, 4, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 7);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
} }

View file

@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable};
pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE}; pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE}; pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE}; pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
use piet_gpu_hal::{Buffer, CmdBuf, Session}; use piet_gpu_hal::{Buffer, ComputePass, Session};
pub use transform::{ pub use transform::{
Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE, Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
}; };
@ -140,7 +140,7 @@ impl ElementStage {
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &ElementCode, code: &ElementCode,
binding: &ElementBinding, binding: &ElementBinding,
n_transform: u64, n_transform: u64,
@ -149,14 +149,14 @@ impl ElementStage {
n_drawobj: u64, n_drawobj: u64,
) { ) {
self.transform_stage.record( self.transform_stage.record(
cmd_buf, pass,
&code.transform_code, &code.transform_code,
&binding.transform_binding, &binding.transform_binding,
n_transform, n_transform,
); );
// No memory barrier needed here; path has at least one before pathseg // No memory barrier needed here; path has at least one before pathseg
self.path_stage.record( self.path_stage.record(
cmd_buf, pass,
&code.path_code, &code.path_code,
&binding.path_binding, &binding.path_binding,
n_paths, n_paths,
@ -164,6 +164,6 @@ impl ElementStage {
); );
// No memory barrier needed here; draw has at least one before draw_leaf // No memory barrier needed here; draw has at least one before draw_leaf
self.draw_stage self.draw_stage
.record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj); .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
} }
} }
View file
@ -16,7 +16,7 @@
//! The clip processing stage (includes substages). //! The clip processing stage (includes substages).
use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session}; use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
// Note that this isn't the code/stage/binding pattern of most of the other stages // Note that this isn't the code/stage/binding pattern of most of the other stages
// in the new element processing pipeline. We want to move those temporary buffers // in the new element processing pipeline. We want to move those temporary buffers
@ -69,26 +69,26 @@ impl ClipBinding {
/// Record the clip dispatches. /// Record the clip dispatches.
/// ///
/// Assumes memory barrier on entry. Provides memory barrier on exit. /// Assumes memory barrier on entry. Provides memory barrier on exit.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) { pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE; let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
if n_wg_reduce > 0 { if n_wg_reduce > 0 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&self.reduce_ds, &self.reduce_ds,
(n_wg_reduce, 1, 1), (n_wg_reduce, 1, 1),
(CLIP_PART_SIZE, 1, 1), (CLIP_PART_SIZE, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
} }
let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE; let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
if n_wg > 0 { if n_wg > 0 {
cmd_buf.dispatch( pass.dispatch(
&code.leaf_pipeline, &code.leaf_pipeline,
&self.leaf_ds, &self.leaf_ds,
(n_wg, 1, 1), (n_wg, 1, 1),
(CLIP_PART_SIZE, 1, 1), (CLIP_PART_SIZE, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
} }
} }
} }
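
As a worked example of the reduce/leaf workgroup math above, here is a minimal sketch that assumes CLIP_PART_SIZE is 256 (the constant's real value is defined elsewhere in the crate and may differ):

const CLIP_PART_SIZE: u32 = 256; // assumed value for this example

fn clip_workgroups(n_clip: u32) -> (u32, u32) {
    // Same formulas as in record() above.
    let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
    let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
    (n_wg_reduce, n_wg)
}

// clip_workgroups(1000) == (3, 4): three reduce workgroups, then four leaf workgroups.
// clip_workgroups(200) == (0, 1): the reduce dispatch (and its barrier) is skipped.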
View file
@ -19,7 +19,7 @@
use bytemuck::{Pod, Zeroable}; use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
}; };
/// The output element of the draw object stage. /// The output element of the draw object stage.
@ -130,7 +130,7 @@ impl DrawStage {
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &DrawCode, code: &DrawCode,
binding: &DrawBinding, binding: &DrawBinding,
size: u64, size: u64,
@ -140,22 +140,22 @@ impl DrawStage {
} }
let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE; let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
if n_workgroups > 1 { if n_workgroups > 1 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&binding.reduce_ds, &binding.reduce_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(DRAW_WG as u32, 1, 1), (DRAW_WG as u32, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.root_pipeline, &code.root_pipeline,
&self.root_ds, &self.root_ds,
(1, 1, 1), (1, 1, 1),
(DRAW_WG as u32, 1, 1), (DRAW_WG as u32, 1, 1),
); );
} }
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.leaf_pipeline, &code.leaf_pipeline,
&binding.leaf_ds, &binding.leaf_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
View file
@ -17,7 +17,7 @@
//! The path stage (includes substages). //! The path stage (includes substages).
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
}; };
pub struct PathCode { pub struct PathCode {
@ -148,7 +148,7 @@ impl PathStage {
/// those are consumed. Result is written without barrier. /// those are consumed. Result is written without barrier.
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &PathCode, code: &PathCode,
binding: &PathBinding, binding: &PathBinding,
n_paths: u32, n_paths: u32,
@ -166,15 +166,15 @@ impl PathStage {
let reduce_part_tags = REDUCE_PART_SIZE * 4; let reduce_part_tags = REDUCE_PART_SIZE * 4;
let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags; let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
if n_wg_tag_reduce > 1 { if n_wg_tag_reduce > 1 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&binding.reduce_ds, &binding.reduce_ds,
(n_wg_tag_reduce, 1, 1), (n_wg_tag_reduce, 1, 1),
(REDUCE_WG, 1, 1), (REDUCE_WG, 1, 1),
); );
// I think we can skip root if n_wg_tag_reduce == 2 // I think we can skip root if n_wg_tag_reduce == 2
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.tag_root_pipeline, &code.tag_root_pipeline,
&self.tag_root_ds, &self.tag_root_ds,
(1, 1, 1), (1, 1, 1),
@ -183,15 +183,15 @@ impl PathStage {
// No barrier needed here; clear doesn't depend on path tags // No barrier needed here; clear doesn't depend on path tags
} }
let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG; let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
cmd_buf.dispatch( pass.dispatch(
&code.clear_pipeline, &code.clear_pipeline,
&binding.clear_ds, &binding.clear_ds,
(n_wg_clear, 1, 1), (n_wg_clear, 1, 1),
(CLEAR_WG, 1, 1), (CLEAR_WG, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE; let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
cmd_buf.dispatch( pass.dispatch(
&code.pathseg_pipeline, &code.pathseg_pipeline,
&binding.path_ds, &binding.path_ds,
(n_wg_pathseg, 1, 1), (n_wg_pathseg, 1, 1),
View file
@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable};
use piet::kurbo::Affine; use piet::kurbo::Affine;
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
}; };
/// An affine transform. /// An affine transform.
@ -132,7 +132,7 @@ impl TransformStage {
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &TransformCode, code: &TransformCode,
binding: &TransformBinding, binding: &TransformBinding,
size: u64, size: u64,
@ -142,22 +142,22 @@ impl TransformStage {
} }
let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE; let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
if n_workgroups > 1 { if n_workgroups > 1 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&binding.reduce_ds, &binding.reduce_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(TRANSFORM_WG as u32, 1, 1), (TRANSFORM_WG as u32, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.root_pipeline, &code.root_pipeline,
&self.root_ds, &self.root_ds,
(1, 1, 1), (1, 1, 1),
(TRANSFORM_WG as u32, 1, 1), (TRANSFORM_WG as u32, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
} }
cmd_buf.dispatch( pass.dispatch(
&code.leaf_pipeline, &code.leaf_pipeline,
&binding.leaf_ds, &binding.leaf_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
View file
@ -16,11 +16,11 @@
//! Utilities (and a benchmark) for clearing buffers with compute shaders. //! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet}; use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline}; use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config; use crate::config::Config;
use crate::runner::{Commands, Runner}; use crate::runner::Runner;
use crate::test_result::TestResult; use crate::test_result::TestResult;
const WG_SIZE: u64 = 256; const WG_SIZE: u64 = 256;
@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
let mut total_elapsed = 0.0; let mut total_elapsed = 0.0;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record(&mut commands, &code, &binding); stage.record(&mut pass, &code, &binding);
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
@ -108,17 +108,12 @@ impl ClearStage {
ClearBinding { descriptor_set } ClearBinding { descriptor_set }
} }
pub unsafe fn record( pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
&self,
commands: &mut Commands,
code: &ClearCode,
bindings: &ClearBinding,
) {
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE; let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
// An issue: for clearing large buffers (>16M), we need to check the // An issue: for clearing large buffers (>16M), we need to check the
// number of workgroups against the (dynamically detected) limit, and // number of workgroups against the (dynamically detected) limit, and
// potentially issue multiple dispatches. // potentially issue multiple dispatches.
commands.cmd_buf.dispatch( pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
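
The comment above notes that clearing very large buffers may exceed the device's workgroup-count limit and would need to be split into several dispatches. A hedged sketch of that splitting, where the limit and the per-chunk descriptor sets (which would carry each chunk's offset) are assumptions rather than existing crate API:

use piet_gpu_hal::{ComputePass, DescriptorSet, Pipeline};

const WG_SIZE: u64 = 256;

// Sketch only: issue one dispatch per chunk, never exceeding max_wg workgroups.
unsafe fn clear_chunked(
    pass: &mut ComputePass,
    pipeline: &Pipeline,
    chunk_descriptor_sets: &[DescriptorSet],
    n_elements: u64,
    max_wg: u64,
) {
    let mut remaining = (n_elements + WG_SIZE - 1) / WG_SIZE;
    for descriptor_set in chunk_descriptor_sets {
        if remaining == 0 {
            break;
        }
        let wg = remaining.min(max_wg);
        pass.dispatch(pipeline, descriptor_set, (wg as u32, 1, 1), (WG_SIZE as u32, 1, 1));
        remaining -= wg;
    }
}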
View file
@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf); let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0);
commands.upload(&memory); commands.upload(&memory);
binding.record(&mut commands.cmd_buf, &code, n_clip as u32); let mut pass = commands.compute_pass(0, 1);
binding.record(&mut pass, &code, n_clip as u32);
pass.end();
commands.download(&memory); commands.download(&memory);
commands.write_timestamp(1);
runner.submit(commands); runner.submit(commands);
let dst = memory.map_read(..); let dst = memory.map_read(..);
if let Some(failure) = data.verify(&dst) { if let Some(failure) = data.verify(&dst) {
View file
@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
let n_iter = config.n_iter; let n_iter = config.n_iter;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record(&mut commands.cmd_buf, &code, &binding, n_tag); stage.record(&mut pass, &code, &binding, n_tag);
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&memory); commands.download(&memory);
View file
@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
// Might clear only buckets to save time. // Might clear only buckets to save time.
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf); stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
commands.write_timestamp(1);
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&mem_buf); commands.download(&mem_buf);
@ -107,12 +105,14 @@ impl LinkedListStage {
commands.cmd_buf.clear_buffer(out_buf, None); commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
let n_workgroups = N_BUCKETS / WG_SIZE; let n_workgroups = N_BUCKETS / WG_SIZE;
commands.cmd_buf.dispatch( let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
pass.end();
} }
} }
View file
@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test(
let mut failures = 0; let mut failures = 0;
for _ in 0..n_iter { for _ in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &out_buf.dev_buf); stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
commands.write_timestamp(1);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
total_elapsed += runner.submit(commands); total_elapsed += runner.submit(commands);
@ -128,11 +126,13 @@ impl MessagePassingStage {
commands.cmd_buf.clear_buffer(&self.data_buf, None); commands.cmd_buf.clear_buffer(&self.data_buf, None);
commands.cmd_buf.clear_buffer(out_buf, None); commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch( let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(256, 1, 1), (256, 1, 1),
(256, 1, 1), (256, 1, 1),
); );
pass.end();
} }
} }
View file
@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf); commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record( stage.record(
&mut commands.cmd_buf, &mut pass,
&code, &code,
&binding, &binding,
path_data.n_path, path_data.n_path,
path_data.tags.len() as u32, path_data.tags.len() as u32,
); );
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&memory); commands.download(&memory);
View file
@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test(
let mut total_elapsed = 0.0; let mut total_elapsed = 0.0;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding); stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
@ -159,12 +157,14 @@ impl PrefixStage {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG; let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
commands.cmd_buf.clear_buffer(&self.state_buf, None); commands.cmd_buf.clear_buffer(&self.state_buf, None);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch( let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
pass.end();
// One thing that's missing here is registering the buffers so // One thing that's missing here is registering the buffers so
// they can be safely dropped by Rust code before the execution // they can be safely dropped by Rust code before the execution
// of the command buffer completes. // of the command buffer completes.
View file
@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf); commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding); stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
@ -175,33 +173,35 @@ impl PrefixTreeStage {
code: &PrefixTreeCode, code: &PrefixTreeCode,
bindings: &PrefixTreeBinding, bindings: &PrefixTreeBinding,
) { ) {
let mut pass = commands.compute_pass(0, 1);
let n = self.tmp_bufs.len(); let n = self.tmp_bufs.len();
for i in 0..n { for i in 0..n {
let n_workgroups = self.sizes[i + 1]; let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&bindings.descriptor_sets[i], &bindings.descriptor_sets[i],
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
commands.cmd_buf.memory_barrier(); pass.memory_barrier();
} }
commands.cmd_buf.dispatch( pass.dispatch(
&code.root_pipeline, &code.root_pipeline,
&bindings.descriptor_sets[n], &bindings.descriptor_sets[n],
(1, 1, 1), (1, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
for i in (0..n).rev() { for i in (0..n).rev() {
commands.cmd_buf.memory_barrier(); pass.memory_barrier();
let n_workgroups = self.sizes[i + 1]; let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch( pass.dispatch(
&code.scan_pipeline, &code.scan_pipeline,
&bindings.descriptor_sets[2 * n - i], &bindings.descriptor_sets[2 * n - i],
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
} }
pass.end();
} }
} }
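
The record method above encodes a tree-structured prefix scan as a single compute pass: an up-sweep of reduce dispatches, a root dispatch, then a down-sweep of scan dispatches, with pass-level memory barriers between levels. The per-level workgroup counts come from self.sizes; a hedged sketch of how such level sizes are typically derived (the actual construction is not shown in this diff), where each level shrinks by the partition size:

// Sketch: sizes[0] is the element count, and each subsequent level holds the
// number of partitions (workgroups) needed to cover the level below it.
fn level_sizes(n_elements: u64, elements_per_wg: u64) -> Vec<u64> {
    let mut sizes = vec![n_elements];
    let mut n = n_elements;
    while n > 1 {
        n = (n + elements_per_wg - 1) / elements_per_wg;
        sizes.push(n);
    }
    sizes
}

// For example, level_sizes(1 << 20, 4096) == [1048576, 256, 1]: one reduce level
// of 256 workgroups, a single root workgroup, then one scan level back down.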
View file
@ -20,8 +20,8 @@ use std::ops::RangeBounds;
use bytemuck::Pod; use bytemuck::Pod;
use piet_gpu_hal::{ use piet_gpu_hal::{
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
QueryPool, Session, ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
}; };
pub struct Runner { pub struct Runner {
@ -118,8 +118,14 @@ impl Runner {
} }
impl Commands { impl Commands {
pub unsafe fn write_timestamp(&mut self, query: u32) { /// Start a compute pass with timer queries.
self.cmd_buf.write_timestamp(&self.query_pool, query); pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
self.cmd_buf
.begin_compute_pass(&ComputePassDescriptor::timer(
&self.query_pool,
start_query,
end_query,
))
} }
pub unsafe fn upload(&mut self, buf: &BufStage) { pub unsafe fn upload(&mut self, buf: &BufStage) {
View file
@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult
let n_iter = config.n_iter; let n_iter = config.n_iter;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record(&mut commands.cmd_buf, &code, &binding, n_elements); stage.record(&mut pass, &code, &binding, n_elements);
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&memory); commands.download(&memory);