Merge pull request #164 from linebender/metal_timer

Timer queries on more platforms, particularly Metal
Chad Brokaw 2022-05-05 13:56:18 -04:00 committed by GitHub
commit 05624de19d
29 changed files with 781 additions and 208 deletions
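
The diffs below replace free-standing write_timestamp calls with timer queries attached to compute passes. As a quick orientation, here is a minimal sketch of the new recording flow, assuming a pipeline, descriptor set, and session have already been created as in the collatz example further down:

    let query_pool = session.create_query_pool(2).unwrap();
    let mut cmd_buf = session.cmd_buf().unwrap();
    cmd_buf.begin();
    cmd_buf.reset_query_pool(&query_pool);
    // The pass now owns its timer queries: start timestamp in slot 0, end in slot 1.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    pass.end();
    cmd_buf.finish_timestamps(&query_pool);
    cmd_buf.host_barrier();
    cmd_buf.finish();
    // After submitting and waiting, fetch_query_pool returns times relative to the first query.
    // let timestamps = session.fetch_query_pool(&query_pool).unwrap();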

Cargo.lock generated
View file

@ -921,6 +921,7 @@ dependencies = [
"block",
"bytemuck",
"cocoa-foundation",
"foreign-types",
"metal",
"objc",
"raw-window-handle",

View file

@ -28,3 +28,4 @@ metal = "0.22"
objc = "0.2.5"
block = "0.1.6"
cocoa-foundation = "0.1"
foreign-types = "0.3.2"

View file

@ -1,4 +1,4 @@
use piet_gpu_hal::{include_shader, BindType};
use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
fn main() {
@ -20,9 +20,9 @@ fn main() {
let mut cmd_buf = session.cmd_buf().unwrap();
cmd_buf.begin();
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
cmd_buf.write_timestamp(&query_pool, 1);
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
pass.end();
cmd_buf.finish_timestamps(&query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();

View file

@ -17,7 +17,8 @@
//! The generic trait for backends to implement.
use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
MapMode, SamplerParams,
};
pub trait Device: Sized {
@ -159,14 +160,32 @@ pub trait Device: Sized {
unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
}
/// The trait implemented by backend command buffer implementations.
///
/// Valid encoding is represented by a state machine (currently not validated
/// but it is easy to imagine there might be at least debug validation). Most
/// methods are only valid in a particular state, and some move it to another
/// state.
pub trait CmdBuf<D: Device> {
/// Begin encoding.
///
/// State: init -> ready
unsafe fn begin(&mut self);
/// State: ready -> finished
unsafe fn finish(&mut self);
/// Return true if the command buffer is suitable for reuse.
unsafe fn reset(&mut self) -> bool;
/// Begin a compute pass.
///
/// State: ready -> in_compute_pass
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
/// Dispatch
///
/// State: in_compute_pass
unsafe fn dispatch(
&mut self,
pipeline: &D::Pipeline,
@ -175,6 +194,9 @@ pub trait CmdBuf<D: Device> {
workgroup_size: (u32, u32, u32),
);
/// State: in_compute_pass -> ready
unsafe fn end_compute_pass(&mut self);
/// Insert an execution and memory barrier.
///
/// Compute kernels (and other actions) after this barrier may read from buffers
@ -202,16 +224,16 @@ pub trait CmdBuf<D: Device> {
/// This is readily supported in Vulkan, but for portability it is remarkably
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
/// kernel, or organize the code not to need it.
unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
// low portability, dx12 doesn't support it natively
unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
/// Reset the query pool.
///
@ -227,7 +249,7 @@ pub trait CmdBuf<D: Device> {
unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
/// Begin a labeled section for debugging and profiling purposes.
unsafe fn begin_debug_label(&mut self, label: &str) {}
unsafe fn begin_debug_label(&mut self, _label: &str) {}
/// End a section opened by `begin_debug_label`.
unsafe fn end_debug_label(&mut self) {}
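
The state-machine comment above notes that encoding is not currently validated. A purely hypothetical sketch of what debug validation could look like (the EncoderState and StateTracker types below are illustration only, not part of this PR):

    #[derive(Debug, PartialEq)]
    enum EncoderState {
        Init,
        Ready,
        InComputePass,
        Finished,
    }

    // Wrap the transitions named in the doc comments with debug assertions.
    struct StateTracker {
        state: EncoderState,
    }

    impl StateTracker {
        fn begin(&mut self) {
            debug_assert_eq!(self.state, EncoderState::Init);
            self.state = EncoderState::Ready;
        }
        fn begin_compute_pass(&mut self) {
            debug_assert_eq!(self.state, EncoderState::Ready);
            self.state = EncoderState::InComputePass;
        }
        fn dispatch(&mut self) {
            debug_assert_eq!(self.state, EncoderState::InComputePass);
        }
        fn end_compute_pass(&mut self) {
            debug_assert_eq!(self.state, EncoderState::InComputePass);
            self.state = EncoderState::Ready;
        }
        fn finish(&mut self) {
            debug_assert_eq!(self.state, EncoderState::Ready);
            self.state = EncoderState::Finished;
        }
    }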

View file

@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use smallvec::SmallVec;
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
use self::{
descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@ -76,6 +76,7 @@ pub struct CmdBuf {
c: wrappers::GraphicsCommandList,
allocator: CommandAllocator,
needs_reset: bool,
end_query: Option<(wrappers::QueryHeap, u32)>,
}
pub struct Pipeline {
@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
c,
allocator,
needs_reset: false,
end_query: None,
})
}
}
@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
let ts0 = buf[0];
let tsp = (self.ts_freq as f64).recip();
let result = buf[1..]
let result = buf
.iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
.map(|ts| *ts as f64 * tsp)
.collect();
Ok(result)
}
@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Dx12(pool) = pool {
self.write_timestamp(pool, *start);
self.end_query = Some((pool.heap.clone(), *end));
}
}
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
.dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
}
unsafe fn end_compute_pass(&mut self) {
if let Some((heap, end)) = self.end_query.take() {
self.c.end_timing_query(&heap, end);
}
}
unsafe fn memory_barrier(&mut self) {
// See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
// The "proper" way to do this would be to name the actual buffers participating
@ -666,7 +683,7 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.memory_barrier();
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
let (gpu_ref, heap) = buffer
.gpu_ref
@ -684,23 +701,23 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
// TODO: consider using copy_resource here (if sizes match)
let size = src.size.min(dst.size);
self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
self.c
.copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
self.c
.copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
self.c.copy_resource(&src.resource, &dst.resource);
}

View file

@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
#[derive(Clone)]
pub struct ShaderByteCode {
pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
blob: Option<Blob>,
}
#[derive(Clone)]
@ -741,7 +740,6 @@ impl ShaderByteCode {
BytecodeLength: blob.0.GetBufferSize(),
pShaderBytecode: blob.0.GetBufferPointer(),
},
blob: Some(blob),
}
}
@ -810,7 +808,6 @@ impl ShaderByteCode {
BytecodeLength: bytecode.len(),
pShaderBytecode: bytecode.as_ptr() as *const _,
},
blob: None,
}
}
}

View file

@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak};
use bytemuck::Pod;
use smallvec::SmallVec;
use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> {
size: u64,
}
/// A sub-object of a command buffer for a sequence of compute dispatches.
pub struct ComputePass<'a> {
cmd_buf: &'a mut CmdBuf,
}
impl Session {
/// Create a new session, choosing the best backend.
pub fn new(device: mux::Device) -> Session {
@ -370,8 +375,17 @@ impl Session {
///
/// This should be called after waiting on the command buffer that wrote the
/// timer queries.
///
/// The returned vector is one shorter than the number of timer queries in the
/// pool; the first value is subtracted off. It would likely be better to return
/// the raw timestamps, but that change should be made consistently.
pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
self.0.device.fetch_query_pool(pool)
let result = self.0.device.fetch_query_pool(pool)?;
// Subtract off first timestamp.
Ok(result[1..]
.iter()
.map(|ts| *ts as f64 - result[0])
.collect())
}
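
A small worked example of the post-processing above, with illustrative timestamps already scaled to seconds:

    let raw = vec![10.0_f64, 10.5, 12.0];  // what the backend returns
    let relative: Vec<f64> = raw[1..].iter().map(|ts| *ts - raw[0]).collect();
    assert_eq!(relative, vec![0.5, 2.0]);  // one shorter, relative to the first query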
#[doc(hidden)]
@ -471,23 +485,10 @@ impl CmdBuf {
self.cmd_buf().finish();
}
/// Dispatch a compute shader.
///
/// Request a compute shader to be run, using the pipeline to specify the
/// code, and the descriptor set to address the resources read and written.
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
/// Begin a compute pass.
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
self.cmd_buf().begin_compute_pass(desc);
ComputePass { cmd_buf: self }
}
/// Insert an execution and memory barrier.
@ -582,13 +583,6 @@ impl CmdBuf {
self.cmd_buf().reset_query_pool(pool);
}
/// Write a timestamp.
///
/// The query index must be less than the size of the query pool on creation.
pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
self.cmd_buf().write_timestamp(pool, query);
}
/// Prepare the timestamps for reading. This isn't required on Vulkan but
/// is required on (at least) DX12.
///
@ -692,6 +686,51 @@ impl Drop for SubmittedCmdBuf {
}
}
impl<'a> ComputePass<'a> {
/// Dispatch a compute shader.
///
/// Request a compute shader to be run, using the pipeline to specify the
/// code, and the descriptor set to address the resources read and written.
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf
.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
}
/// Add a memory barrier.
///
/// Inserts a memory barrier in the compute encoder. This is a convenience
/// function for calling the same function on the underlying command buffer,
/// avoiding borrow check issues.
pub unsafe fn memory_barrier(&mut self) {
self.cmd_buf.memory_barrier();
}
/// Begin a labeled section for debugging and profiling purposes.
pub unsafe fn begin_debug_label(&mut self, label: &str) {
self.cmd_buf.begin_debug_label(label);
}
/// End a section opened by `begin_debug_label`.
pub unsafe fn end_debug_label(&mut self) {
self.cmd_buf.end_debug_label();
}
pub unsafe fn end(self) {
self.cmd_buf.cmd_buf().end_compute_pass();
}
}
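
As the comments above note, memory_barrier and the debug-label methods are forwarded so callers don't have to fight the borrow checker while a pass mutably borrows the command buffer. A short sketch of recording inside a pass (the pipeline, descriptor-set, and workgroup-count names are placeholders):

    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.begin_debug_label("binning");
    pass.dispatch(&bin_pipeline, &bin_ds, (n_wg, 1, 1), (256, 1, 1));
    pass.end_debug_label();
    pass.memory_barrier(); // forwarded to the underlying CmdBuf
    pass.dispatch(&tile_pipeline, &tile_ds, (n_wg, 1, 1), (256, 1, 1));
    pass.end(); // consumes the pass and ends the backend encoder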
impl Drop for BufferInner {
fn drop(&mut self) {
if let Some(session) = Weak::upgrade(&self.session) {

View file

@ -21,8 +21,8 @@ pub use crate::mux::{
};
pub use bufwrite::BufWrite;
pub use hub::{
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
Session, SubmittedCmdBuf,
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
RetainResource, Session, SubmittedCmdBuf,
};
// TODO: because these are conditionally included, "cargo fmt" does not
@ -189,3 +189,23 @@ pub struct WorkgroupLimits {
/// dimension.
pub max_invocations: u32,
}
/// Options for creating a compute pass.
#[derive(Default)]
pub struct ComputePassDescriptor<'a> {
// Maybe label should go here? It does in wgpu and wgpu_hal.
/// Timer query parameters.
///
/// To record timer queries for a compute pass, set the query pool, start
/// query index, and end query index here. The indices must be less than
/// the size of the query pool.
timer_queries: Option<(&'a QueryPool, u32, u32)>,
}
impl<'a> ComputePassDescriptor<'a> {
pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
ComputePassDescriptor {
timer_queries: Some((pool, start_query, end_query)),
}
}
}
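
Construction is straightforward: a pass without timer queries uses the default descriptor, while timer() attaches a pool plus a start and end slot (both indices must be less than the size passed to create_query_pool). A minimal sketch, assuming a query pool is in scope:

    let plain = ComputePassDescriptor::default();
    let timed = ComputePassDescriptor::timer(&query_pool, 4, 5);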

View file

@ -15,25 +15,32 @@
// Also licensed under MIT license, at your choice.
mod clear;
mod timer;
mod util;
use std::mem;
use std::sync::{Arc, Mutex};
use block::Block;
use cocoa_foundation::base::id;
use cocoa_foundation::foundation::{NSInteger, NSUInteger};
use foreign_types::ForeignType;
use objc::rc::autoreleasepool;
use objc::runtime::{Object, BOOL, YES};
use objc::{class, msg_send, sel, sel_impl};
use metal::{CGFloat, MTLFeatureSet};
use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
use crate::{
BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
};
use util::*;
use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
pub struct MtlInstance;
pub struct MtlDevice {
@ -41,6 +48,18 @@ pub struct MtlDevice {
cmd_queue: Arc<Mutex<metal::CommandQueue>>,
gpu_info: GpuInfo,
helpers: Arc<Helpers>,
timer_set: Option<CounterSet>,
counter_style: CounterStyle,
}
/// Type of counter sampling.
///
/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum CounterStyle {
None,
Stage,
Command,
}
pub struct MtlSurface {
@ -81,9 +100,22 @@ pub struct Semaphore;
pub struct CmdBuf {
cmd_buf: metal::CommandBuffer,
helpers: Arc<Helpers>,
cur_encoder: Encoder,
time_calibration: Arc<Mutex<TimeCalibration>>,
counter_style: CounterStyle,
}
pub struct QueryPool;
enum Encoder {
None,
Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
Blit(metal::BlitCommandEncoder),
}
#[derive(Default)]
pub struct QueryPool {
counter_sample_buf: Option<CounterSampleBuffer>,
calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
}
pub struct Pipeline(metal::ComputePipelineState);
@ -209,18 +241,43 @@ impl MtlDevice {
let helpers = Arc::new(Helpers {
clear_pipeline: clear::make_clear_pipeline(&device),
});
// Timer stuff
let timer_set = CounterSet::get_timer_counter_set(&device);
let counter_style = if timer_set.is_some() {
if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
CounterStyle::Stage
} else if device
.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
{
CounterStyle::Command
} else {
CounterStyle::None
}
} else {
CounterStyle::None
};
MtlDevice {
device,
cmd_queue: Arc::new(Mutex::new(cmd_queue)),
gpu_info,
helpers,
timer_set,
counter_style,
}
}
pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
let cmd_buf = raw_cmd_buf;
let helpers = self.helpers.clone();
CmdBuf { cmd_buf, helpers }
let cur_encoder = Encoder::None;
let time_calibration = Default::default();
CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
}
}
pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@ -330,11 +387,35 @@ impl crate::backend::Device for MtlDevice {
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
let cmd_queue = self.cmd_queue.lock().unwrap();
// A discussion about autorelease pools.
//
// Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
// you can have any two of correctness, ergonomics, and performance. Here we've
// chosen the first two, using the pattern of a fine grained autorelease pool
// to give the Obj-C object Rust-like lifetime semantics whenever objects are
// created as autorelease (by convention, this is any object creation with an
// Obj-C method name that doesn't begin with "new" or "alloc").
//
// To gain back some of the performance, we'd need a way to wrap an autorelease
// pool over a chunk of work - that could be one frame of rendering, but for
// tests that iterate a number of command buffer submissions, it would need to
// be around that. On non-mac platforms, it would be a no-op.
//
// In any case, this way, the caller doesn't need to worry, and the performance
// hit might not be so bad (perhaps we should measure).
// consider new_command_buffer_with_unretained_references for performance
let cmd_buf = cmd_queue.new_command_buffer();
let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
let helpers = self.helpers.clone();
Ok(CmdBuf { cmd_buf, helpers })
let cur_encoder = Encoder::None;
let time_calibration = Default::default();
Ok(CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
})
}
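
The long comment above sketches a possible follow-up: wrapping one autorelease pool around a larger chunk of work instead of each object creation. A purely illustrative sketch of that idea (num_frames and record_and_commit are placeholders, not part of this PR):

    autoreleasepool(|| {
        for _ in 0..num_frames {
            // Create, record, and commit one command buffer per frame; every
            // autoreleased Obj-C object drains when the closure returns.
            let cmd_buf = cmd_queue.new_command_buffer().to_owned();
            record_and_commit(&cmd_buf); // placeholder for the per-frame work
        }
    });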
unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
@ -342,12 +423,31 @@ impl crate::backend::Device for MtlDevice {
}
fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
// TODO
Ok(QueryPool)
if let Some(timer_set) = &self.timer_set {
let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
.ok_or("error creating timer query pool")?;
return Ok(QueryPool {
counter_sample_buf: Some(pool),
calibration: Default::default(),
});
}
Ok(QueryPool::default())
}
unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
// TODO
if let Some(raw) = &pool.counter_sample_buf {
let resolved = raw.resolve();
let calibration = pool.calibration.lock().unwrap();
if let Some(calibration) = &*calibration {
let calibration = calibration.lock().unwrap();
let result = resolved
.iter()
.map(|time_ns| calibration.correlate(*time_ns))
.collect();
return Ok(result);
}
}
// Maybe this should return None to indicate it wasn't successful? But that might break callers.
Ok(Vec::new())
}
@ -358,7 +458,37 @@ impl crate::backend::Device for MtlDevice {
_signal_semaphores: &[&Self::Semaphore],
fence: Option<&mut Self::Fence>,
) -> Result<(), Error> {
unsafe fn add_scheduled_handler(
cmd_buf: &metal::CommandBufferRef,
block: &Block<(&CommandBufferRef,), ()>,
) {
msg_send![cmd_buf, addScheduledHandler: block]
}
for cmd_buf in cmd_bufs {
let time_calibration = cmd_buf.time_calibration.clone();
let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
// TODO: only do this if supported.
let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
let time_calibration = cmd_buf.time_calibration.clone();
let completed_block =
block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
// TODO: only do this if supported.
let () =
msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
cmd_buf.cmd_buf.add_completed_handler(&completed_block);
cmd_buf.cmd_buf.commit();
}
if let Some(last_cmd_buf) = cmd_bufs.last() {
@ -439,12 +569,70 @@ impl crate::backend::Device for MtlDevice {
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
unsafe fn begin(&mut self) {}
unsafe fn finish(&mut self) {}
unsafe fn finish(&mut self) {
self.flush_encoder();
}
unsafe fn reset(&mut self) -> bool {
false
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
// TODO: we might want to get better about validation but the following
// assert is likely to trigger, and also a case can be made that
// validation should be done at the hub level, for consistency.
//debug_assert!(matches!(self.cur_encoder, Encoder::None));
self.flush_encoder();
autoreleasepool(|| {
let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
(Some(queries), CounterStyle::Stage) => {
let descriptor: id =
msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
let attachments: id = msg_send![descriptor, sampleBufferAttachments];
let index: NSUInteger = 0;
let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
// Here we break the hub/mux separation a bit, for expedience
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
}
}
let start_index = queries.1 as NSUInteger;
let end_index = queries.2 as NSInteger;
let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
(
msg_send![
self.cmd_buf,
computeCommandEncoderWithDescriptor: descriptor
],
None,
)
}
(Some(queries), CounterStyle::Command) => {
let encoder = self.cmd_buf.new_compute_command_encoder();
#[allow(irrefutable_let_patterns)]
let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let sample_index = queries.1 as NSUInteger;
let sample_buf = sample_buf.id();
let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
Some((sample_buf, queries.2))
} else {
None
}
} else {
None
};
(encoder, end_query)
}
_ => (self.cmd_buf.new_compute_command_encoder(), None),
};
self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
});
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -452,7 +640,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
let encoder = self.cmd_buf.new_compute_command_encoder();
let encoder = self.compute_command_encoder();
encoder.set_compute_pipeline_state(&pipeline.0);
let mut buf_ix = 0;
for buffer in &descriptor_set.buffers {
@ -475,7 +663,11 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
depth: workgroup_size.2 as u64,
};
encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
encoder.end_encoding();
}
unsafe fn end_compute_pass(&mut self) {
// TODO: might validate that we are in a compute encoder state
self.flush_encoder();
}
unsafe fn memory_barrier(&mut self) {
@ -494,22 +686,23 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
// I think these are being tracked.
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let size = size.unwrap_or(buffer.size);
let encoder = self.cmd_buf.new_compute_command_encoder();
clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size);
encoder.end_encoding()
let _ = self.compute_command_encoder();
// Getting this directly is a workaround for a borrow checker issue.
if let Encoder::Compute(e, _) = &self.cur_encoder {
clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
}
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let encoder = self.blit_command_encoder();
let size = src.size.min(dst.size);
encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
encoder.end_encoding();
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let encoder = self.blit_command_encoder();
assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
let bytes_per_row = (src.width * 4) as NSUInteger;
let src_size = metal::MTLSize {
@ -530,11 +723,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
bytes_per_row * src.height as NSUInteger,
metal::MTLBlitOption::empty(),
);
encoder.end_encoding();
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let encoder = self.blit_command_encoder();
assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
let bytes_per_row = (dst.width * 4) as NSUInteger;
let src_size = metal::MTLSize {
@ -555,11 +747,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
origin,
metal::MTLBlitOption::empty(),
);
encoder.end_encoding();
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let encoder = self.blit_command_encoder();
let src_size = metal::MTLSize {
width: src.width.min(dst.width) as NSUInteger,
height: src.width.min(dst.height) as NSUInteger,
@ -577,15 +768,79 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
0,
origin,
);
encoder.end_encoding();
}
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
let mut calibration = pool.calibration.lock().unwrap();
*calibration = Some(self.time_calibration.clone());
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
// TODO
// This is really a PITA because it's pretty different from Vulkan.
// See https://developer.apple.com/documentation/metal/counter_sampling
if let Some(buf) = &pool.counter_sample_buf {
if matches!(self.cur_encoder, Encoder::None) {
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
let sample_index = query as NSUInteger;
if self.counter_style == CounterStyle::Command {
match &self.cur_encoder {
Encoder::Compute(e, _) => {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
}
Encoder::None => unreachable!(),
_ => todo!(),
}
} else if self.counter_style == CounterStyle::Stage {
match &self.cur_encoder {
Encoder::Compute(_e, _) => {
println!("write_timestamp is not supported for stage-style encoders");
}
_ => (),
}
}
}
}
}
impl CmdBuf {
fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Compute(..)) {
self.flush_encoder();
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
if let Encoder::Compute(e, _) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Blit(_)) {
self.flush_encoder();
self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
}
if let Encoder::Blit(e) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn flush_encoder(&mut self) {
match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
Encoder::Compute(e, Some((sample_buf, end_query))) => {
let sample_index = end_query as NSUInteger;
unsafe {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
}
e.end_encoding();
}
Encoder::Compute(e, None) => e.end_encoding(),
Encoder::Blit(e) => e.end_encoding(),
Encoder::None => (),
}
}
}

View file

@ -0,0 +1,172 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Support for timer queries.
//!
//! Likely some of this should be upstreamed into metal-rs.
use std::{ffi::CStr, ptr::null_mut};
use cocoa_foundation::{
base::id,
foundation::{NSRange, NSUInteger},
};
use metal::{DeviceRef, MTLStorageMode};
use objc::{class, msg_send, sel, sel_impl};
pub struct CounterSampleBuffer {
id: id,
count: u64,
}
pub struct CounterSet {
id: id,
}
#[derive(Default)]
pub struct TimeCalibration {
pub cpu_start_ts: u64,
pub gpu_start_ts: u64,
pub cpu_end_ts: u64,
pub gpu_end_ts: u64,
}
impl Drop for CounterSampleBuffer {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl Clone for CounterSampleBuffer {
fn clone(&self) -> CounterSampleBuffer {
unsafe {
CounterSampleBuffer {
id: msg_send![self.id, retain],
count: self.count,
}
}
}
}
impl CounterSampleBuffer {
pub fn id(&self) -> id {
self.id
}
}
impl Drop for CounterSet {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl CounterSet {
pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
unsafe {
// TODO: version check
let sets: id = msg_send!(device, counterSets);
let count: NSUInteger = msg_send![sets, count];
for i in 0..count {
let set: id = msg_send![sets, objectAtIndex: i];
let name: id = msg_send![set, name];
let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
if name_cstr.to_bytes() == b"timestamp" {
return Some(CounterSet { id: set });
}
}
None
}
}
}
// copied from metal-rs; should be in common utilities maybe?
fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
let bytes = unsafe {
let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
bytes as *const u8
};
let len: NSUInteger = unsafe { msg_send![nsstr, length] };
unsafe {
let bytes = std::slice::from_raw_parts(bytes, len as usize);
std::str::from_utf8(bytes).unwrap()
}
}
impl CounterSampleBuffer {
pub fn new(
device: &DeviceRef,
count: u64,
counter_set: &CounterSet,
) -> Option<CounterSampleBuffer> {
unsafe {
let desc_cls = class!(MTLCounterSampleBufferDescriptor);
let descriptor: id = msg_send![desc_cls, alloc];
let _: id = msg_send![descriptor, init];
let count = count as NSUInteger;
let () = msg_send![descriptor, setSampleCount: count];
let () = msg_send![descriptor, setCounterSet: counter_set.id];
let () = msg_send![
descriptor,
setStorageMode: MTLStorageMode::Shared as NSUInteger
];
let mut error: id = null_mut();
let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
let () = msg_send![descriptor, release];
if !error.is_null() {
let description = msg_send![error, localizedDescription];
println!(
"error allocating sample buffer, code = {}",
nsstring_as_str(description)
);
let () = msg_send![error, release];
return None;
}
Some(CounterSampleBuffer { id: buf, count })
}
}
// Read the timestamps.
//
// Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
pub unsafe fn resolve(&self) -> &[u64] {
let range = NSRange::new(0, self.count);
let data: id = msg_send![self.id, resolveCounterRange: range];
if data.is_null() {
&[]
} else {
let bytes: *const u64 = msg_send![data, bytes];
std::slice::from_raw_parts(bytes, self.count as usize)
}
}
}
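
Given the safety note on resolve(), a cautious caller might copy the data out while still inside an autorelease pool. A minimal sketch, assuming a sample_buf: &CounterSampleBuffer is in hand and objc::rc::autoreleasepool is imported:

    let timestamps: Vec<u64> =
        autoreleasepool(|| unsafe { sample_buf.resolve().to_vec() });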
impl TimeCalibration {
/// Convert GPU timestamp into CPU time base.
///
/// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
pub fn correlate(&self, raw_ts: u64) -> f64 {
let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
let adj_ts = if delta_gpu > 0 {
let scale = delta_cpu as f64 / delta_gpu as f64;
self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
} else {
// Default is ns on Apple Silicon; on other hardware this will be wrong
raw_ts as f64
};
adj_ts * 1e-9
}
}
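
A worked example of the correlation with made-up calibration values (a CPU span of 2,000,000 ns against a GPU span of 1,000,000 ticks, so each GPU tick maps to 2 ns):

    let cal = TimeCalibration {
        cpu_start_ts: 1_000_000_000,
        gpu_start_ts: 500_000,
        cpu_end_ts: 1_002_000_000,
        gpu_end_ts: 1_500_000,
    };
    // Halfway through the GPU span maps to the CPU midpoint, returned in seconds:
    // (1_000_000_000 + 500_000 * 2) * 1e-9 = 1.001
    let t = cal.correlate(1_000_000);
    assert!((t - 1.001).abs() < 1e-9);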

View file

@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::backend::Device as DeviceTrait;
use crate::BackendType;
use crate::BindType;
use crate::ComputePassDescriptor;
use crate::ImageFormat;
use crate::MapMode;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@ -658,6 +659,14 @@ impl CmdBuf {
}
}
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
mux_match! { self;
CmdBuf::Vk(c) => c.begin_compute_pass(desc),
CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
}
}
/// Dispatch a compute shader.
///
/// Note that both the number of workgroups (`workgroup_count`) and the number of
@ -680,6 +689,14 @@ impl CmdBuf {
}
}
pub unsafe fn end_compute_pass(&mut self) {
mux_match! { self;
CmdBuf::Vk(c) => c.end_compute_pass(),
CmdBuf::Dx12(c) => c.end_compute_pass(),
CmdBuf::Mtl(c) => c.end_compute_pass(),
}
}
pub unsafe fn memory_barrier(&mut self) {
mux_match! { self;
CmdBuf::Vk(c) => c.memory_barrier(),

View file

@ -15,7 +15,7 @@ use smallvec::SmallVec;
use crate::backend::Device as DeviceTrait;
use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
WorkgroupLimits,
WorkgroupLimits, ComputePassDescriptor,
};
pub struct VkInstance {
@ -92,6 +92,7 @@ pub struct CmdBuf {
cmd_buf: vk::CommandBuffer,
cmd_pool: vk::CommandPool,
device: Arc<RawDevice>,
end_query: Option<(vk::QueryPool, u32)>,
}
pub struct QueryPool {
@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
cmd_buf,
cmd_pool,
device: self.device.clone(),
end_query: None,
})
}
}
@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
// results (Windows 10, AMD 5700 XT).
let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
let ts0 = buf[0];
let tsp = self.timestamp_period as f64 * 1e-9;
let result = buf[1..]
let result = buf
.iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
.map(|ts| *ts as f64 * tsp)
.collect();
Ok(result)
}
@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
true
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Vk(pool) = pool {
self.write_timestamp_raw(pool.pool, *start);
self.end_query = Some((pool.pool, *end));
}
}
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -931,6 +942,12 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn end_compute_pass(&mut self) {
if let Some((pool, end)) = self.end_query.take() {
self.write_timestamp_raw(pool, end);
}
}
/// Insert a pipeline barrier for all memory accesses.
unsafe fn memory_barrier(&mut self) {
let device = &self.device.device;
@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device;
let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let device = &self.device.device;
let size = src.size.min(dst.size);
device.cmd_copy_buffer(
@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let device = &self.device.device;
device.cmd_copy_image_to_buffer(
self.cmd_buf,
@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let device = &self.device.device;
device.cmd_copy_buffer_to_image(
self.cmd_buf,
@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let device = &self.device.device;
device.cmd_blit_image(
self.cmd_buf,
@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool.pool,
query,
);
self.write_timestamp_raw(pool.pool, query);
}
unsafe fn begin_debug_label(&mut self, label: &str) {
@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
}
}
impl CmdBuf {
unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool,
query,
);
}
}
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.buffers.extend(buffers.iter().map(|b| b.buffer));

View file

@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
.map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(8))
.map(|_| session.create_query_pool(12))
.collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@ -112,22 +112,23 @@ fn main() -> Result<(), Error> {
if !ts.is_empty() {
info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[6] * 1e3,
ts[10] * 1e3,
ts[0] * 1e3,
(ts[1] - ts[0]) * 1e3,
(ts[2] - ts[1]) * 1e3,
(ts[3] - ts[2]) * 1e3,
(ts[4] - ts[3]) * 1e3,
(ts[5] - ts[4]) * 1e3,
(ts[6] - ts[5]) * 1e3,
(ts[8] - ts[7]) * 1e3,
(ts[10] - ts[9]) * 1e3,
);
}
}
let mut ctx = PietGpuRenderContext::new();
let test_blend = false;
if let Some(svg) = &svg {
test_scenes::render_svg(&mut ctx, svg);
} else {
} else if test_blend {
use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
let blends = [
Blend::new(Normal, SrcOver),
@ -163,6 +164,8 @@ fn main() -> Result<(), Error> {
let blend = blends[mode % blends.len()];
test_scenes::render_blend_test(&mut ctx, current_frame, blend);
info_string = format!("{:?}", blend);
} else {
test_scenes::render_anim_frame(&mut ctx, current_frame);
}
render_info_string(&mut ctx, &info_string);
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {

View file

@ -18,8 +18,8 @@ use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext};
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image,
ImageLayout, Pipeline, QueryPool, Session,
include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
Error, Image, ImageLayout, Pipeline, QueryPool, Session,
};
pub use pico_svg::PicoSvg;
@ -424,10 +424,10 @@ impl Renderer {
cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.begin_debug_label("Element bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
self.element_stage.record(
cmd_buf,
&mut pass,
&self.element_code,
&self.element_bindings[buf_ix],
self.n_transform as u64,
@ -435,56 +435,59 @@ impl Renderer {
self.n_pathtag as u32,
self.n_drawobj as u64,
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Clip bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
pass.begin_debug_label("Clip bounding box calculation");
self.clip_binding
.record(cmd_buf, &self.clip_code, self.n_clip as u32);
cmd_buf.end_debug_label();
cmd_buf.begin_debug_label("Element binning");
cmd_buf.dispatch(
.record(&mut pass, &self.clip_code, self.n_clip as u32);
pass.end_debug_label();
pass.begin_debug_label("Element binning");
pass.dispatch(
&self.bin_pipeline,
&self.bin_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1),
);
cmd_buf.end_debug_label();
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Tile allocation");
cmd_buf.dispatch(
pass.end_debug_label();
pass.memory_barrier();
pass.begin_debug_label("Tile allocation");
pass.dispatch(
&self.tile_pipeline,
&self.tile_ds[buf_ix],
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1),
);
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
pass.end_debug_label();
pass.end();
cmd_buf.begin_debug_label("Path flattening");
cmd_buf.dispatch(
cmd_buf.memory_barrier();
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
pass.dispatch(
&self.path_pipeline,
&self.path_ds,
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
(32, 1, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Backdrop propagation");
cmd_buf.dispatch(
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
pass.dispatch(
&self.backdrop_pipeline,
&self.backdrop_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, self.backdrop_y, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 4);
// TODO: redo query accounting
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Coarse raster");
cmd_buf.dispatch(
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
pass.dispatch(
&self.coarse_pipeline,
&self.coarse_ds[buf_ix],
(
@ -494,11 +497,13 @@ impl Renderer {
),
(256, 1, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Fine raster");
cmd_buf.dispatch(
let mut pass =
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
pass.dispatch(
&self.k4_pipeline,
&self.k4_ds,
(
@ -508,8 +513,8 @@ impl Renderer {
),
(8, 4, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 7);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}

View file

@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable};
pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
use piet_gpu_hal::{Buffer, CmdBuf, Session};
use piet_gpu_hal::{Buffer, ComputePass, Session};
pub use transform::{
Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
};
@ -140,7 +140,7 @@ impl ElementStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &ElementCode,
binding: &ElementBinding,
n_transform: u64,
@ -149,14 +149,14 @@ impl ElementStage {
n_drawobj: u64,
) {
self.transform_stage.record(
cmd_buf,
pass,
&code.transform_code,
&binding.transform_binding,
n_transform,
);
// No memory barrier needed here; path has at least one before pathseg
self.path_stage.record(
cmd_buf,
pass,
&code.path_code,
&binding.path_binding,
n_paths,
@ -164,6 +164,6 @@ impl ElementStage {
);
// No memory barrier needed here; draw has at least one before draw_leaf
self.draw_stage
.record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
.record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
}
}

View file

@ -16,7 +16,7 @@
//! The clip processing stage (includes substages).
use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
// Note that this isn't the code/stage/binding pattern of most of the other stages
// in the new element processing pipeline. We want to move those temporary buffers
@ -69,26 +69,26 @@ impl ClipBinding {
/// Record the clip dispatches.
///
/// Assumes memory barrier on entry. Provides memory barrier on exit.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
if n_wg_reduce > 0 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&self.reduce_ds,
(n_wg_reduce, 1, 1),
(CLIP_PART_SIZE, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
if n_wg > 0 {
cmd_buf.dispatch(
pass.dispatch(
&code.leaf_pipeline,
&self.leaf_ds,
(n_wg, 1, 1),
(CLIP_PART_SIZE, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
}
}

View file

@ -19,7 +19,7 @@
use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
/// The output element of the draw object stage.
@ -130,7 +130,7 @@ impl DrawStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &DrawCode,
binding: &DrawBinding,
size: u64,
@ -140,22 +140,22 @@ impl DrawStage {
}
let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
if n_workgroups > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_workgroups as u32, 1, 1),
(DRAW_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.root_pipeline,
&self.root_ds,
(1, 1, 1),
(DRAW_WG as u32, 1, 1),
);
}
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.leaf_pipeline,
&binding.leaf_ds,
(n_workgroups as u32, 1, 1),

View file

@ -17,7 +17,7 @@
//! The path stage (includes substages).
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
pub struct PathCode {
@ -148,7 +148,7 @@ impl PathStage {
/// those are consumed. Result is written without barrier.
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &PathCode,
binding: &PathBinding,
n_paths: u32,
@ -166,15 +166,15 @@ impl PathStage {
let reduce_part_tags = REDUCE_PART_SIZE * 4;
let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
if n_wg_tag_reduce > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_wg_tag_reduce, 1, 1),
(REDUCE_WG, 1, 1),
);
// I think we can skip root if n_wg_tag_reduce == 2
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.tag_root_pipeline,
&self.tag_root_ds,
(1, 1, 1),
@ -183,15 +183,15 @@ impl PathStage {
// No barrier needed here; clear doesn't depend on path tags
}
let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
cmd_buf.dispatch(
pass.dispatch(
&code.clear_pipeline,
&binding.clear_ds,
(n_wg_clear, 1, 1),
(CLEAR_WG, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
cmd_buf.dispatch(
pass.dispatch(
&code.pathseg_pipeline,
&binding.path_ds,
(n_wg_pathseg, 1, 1),

View file

@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable};
use piet::kurbo::Affine;
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
/// An affine transform.
@ -132,7 +132,7 @@ impl TransformStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &TransformCode,
binding: &TransformBinding,
size: u64,
@ -142,22 +142,22 @@ impl TransformStage {
}
let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
if n_workgroups > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_workgroups as u32, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.root_pipeline,
&self.root_ds,
(1, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
cmd_buf.dispatch(
pass.dispatch(
&code.leaf_pipeline,
&binding.leaf_ds,
(n_workgroups as u32, 1, 1),

View file

@ -16,11 +16,11 @@
//! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::runner::Runner;
use crate::test_result::TestResult;
const WG_SIZE: u64 = 256;
@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@ -108,17 +108,12 @@ impl ClearStage {
ClearBinding { descriptor_set }
}
pub unsafe fn record(
&self,
commands: &mut Commands,
code: &ClearCode,
bindings: &ClearBinding,
) {
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
// An issue: for clearing large buffers (>16M), we need to check the
// number of workgroups against the (dynamically detected) limit, and
// potentially issue multiple dispatches.
commands.cmd_buf.dispatch(
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),

View file

@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
let mut commands = runner.commands();
commands.write_timestamp(0);
commands.upload(&memory);
binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
let mut pass = commands.compute_pass(0, 1);
binding.record(&mut pass, &code, n_clip as u32);
pass.end();
commands.download(&memory);
commands.write_timestamp(1);
runner.submit(commands);
let dst = memory.map_read(..);
if let Some(failure) = data.verify(&dst) {

View file

@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding, n_tag);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);

View file

@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
for i in 0..n_iter {
let mut commands = runner.commands();
// Might clear only buckets to save time.
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&mem_buf);
@ -107,12 +105,14 @@ impl LinkedListStage {
commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier();
let n_workgroups = N_BUCKETS / WG_SIZE;
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
pass.end();
}
}

View file

@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test(
let mut failures = 0;
for _ in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
commands.write_timestamp(1);
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
total_elapsed += runner.submit(commands);
@ -128,11 +126,13 @@ impl MessagePassingStage {
commands.cmd_buf.clear_buffer(&self.data_buf, None);
commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(256, 1, 1),
(256, 1, 1),
);
pass.end();
}
}

View file

@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
let mut pass = commands.compute_pass(0, 1);
stage.record(
&mut commands.cmd_buf,
&mut pass,
&code,
&binding,
path_data.n_path,
path_data.tags.len() as u32,
);
commands.write_timestamp(1);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);

View file

@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test(
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@ -159,12 +157,14 @@ impl PrefixStage {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
commands.cmd_buf.clear_buffer(&self.state_buf, None);
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
pass.end();
// One thing that's missing here is registering the buffers so
// they can be safely dropped by Rust code before the execution
// of the command buffer completes.

View file

@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@ -175,33 +173,35 @@ impl PrefixTreeStage {
code: &PrefixTreeCode,
bindings: &PrefixTreeBinding,
) {
let mut pass = commands.compute_pass(0, 1);
let n = self.tmp_bufs.len();
for i in 0..n {
let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&bindings.descriptor_sets[i],
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
commands.cmd_buf.memory_barrier();
pass.memory_barrier();
}
commands.cmd_buf.dispatch(
pass.dispatch(
&code.root_pipeline,
&bindings.descriptor_sets[n],
(1, 1, 1),
(WG_SIZE as u32, 1, 1),
);
for i in (0..n).rev() {
commands.cmd_buf.memory_barrier();
pass.memory_barrier();
let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch(
pass.dispatch(
&code.scan_pipeline,
&bindings.descriptor_sets[2 * n - i],
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
}
pass.end();
}
}

View file

@ -20,8 +20,8 @@ use std::ops::RangeBounds;
use bytemuck::Pod;
use piet_gpu_hal::{
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
QueryPool, Session,
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
};
pub struct Runner {
@ -118,8 +118,14 @@ impl Runner {
}
impl Commands {
pub unsafe fn write_timestamp(&mut self, query: u32) {
self.cmd_buf.write_timestamp(&self.query_pool, query);
/// Start a compute pass with timer queries.
pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
self.cmd_buf
.begin_compute_pass(&ComputePassDescriptor::timer(
&self.query_pool,
start_query,
end_query,
))
}
pub unsafe fn upload(&mut self, buf: &BufStage) {

View file

@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding, n_elements);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);