mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Merge pull request #164 from linebender/metal_timer
Timer queries on more platforms, particularly Metal
This commit is contained in:
commit
05624de19d
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -921,6 +921,7 @@ dependencies = [
|
|||
"block",
|
||||
"bytemuck",
|
||||
"cocoa-foundation",
|
||||
"foreign-types",
|
||||
"metal",
|
||||
"objc",
|
||||
"raw-window-handle",
|
||||
|
|
|
@ -28,3 +28,4 @@ metal = "0.22"
|
|||
objc = "0.2.5"
|
||||
block = "0.1.6"
|
||||
cocoa-foundation = "0.1"
|
||||
foreign-types = "0.3.2"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use piet_gpu_hal::{include_shader, BindType};
|
||||
use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
|
||||
use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
|
||||
|
||||
fn main() {
|
||||
|
@ -20,9 +20,9 @@ fn main() {
|
|||
let mut cmd_buf = session.cmd_buf().unwrap();
|
||||
cmd_buf.begin();
|
||||
cmd_buf.reset_query_pool(&query_pool);
|
||||
cmd_buf.write_timestamp(&query_pool, 0);
|
||||
cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
|
||||
cmd_buf.write_timestamp(&query_pool, 1);
|
||||
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
|
||||
pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
|
||||
pass.end();
|
||||
cmd_buf.finish_timestamps(&query_pool);
|
||||
cmd_buf.host_barrier();
|
||||
cmd_buf.finish();
|
||||
|
|
|
@ -17,7 +17,8 @@
|
|||
//! The generic trait for backends to implement.
|
||||
|
||||
use crate::{
|
||||
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
|
||||
BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
|
||||
MapMode, SamplerParams,
|
||||
};
|
||||
|
||||
pub trait Device: Sized {
|
||||
|
@ -159,14 +160,32 @@ pub trait Device: Sized {
|
|||
unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
|
||||
}
|
||||
|
||||
/// The trait implemented by backend command buffer implementations.
|
||||
///
|
||||
/// Valid encoding is represented by a state machine (currently not validated
|
||||
/// but it is easy to imagine there might be at least debug validation). Most
|
||||
/// methods are only valid in a particular state, and some move it to another
|
||||
/// state.
|
||||
pub trait CmdBuf<D: Device> {
|
||||
/// Begin encoding.
|
||||
///
|
||||
/// State: init -> ready
|
||||
unsafe fn begin(&mut self);
|
||||
|
||||
/// State: ready -> finished
|
||||
unsafe fn finish(&mut self);
|
||||
|
||||
/// Return true if the command buffer is suitable for reuse.
|
||||
unsafe fn reset(&mut self) -> bool;
|
||||
|
||||
/// Begin a compute pass.
|
||||
///
|
||||
/// State: ready -> in_compute_pass
|
||||
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
|
||||
|
||||
/// Dispatch
|
||||
///
|
||||
/// State: in_compute_pass
|
||||
unsafe fn dispatch(
|
||||
&mut self,
|
||||
pipeline: &D::Pipeline,
|
||||
|
@ -175,6 +194,9 @@ pub trait CmdBuf<D: Device> {
|
|||
workgroup_size: (u32, u32, u32),
|
||||
);
|
||||
|
||||
/// State: in_compute_pass -> ready
|
||||
unsafe fn end_compute_pass(&mut self);
|
||||
|
||||
/// Insert an execution and memory barrier.
|
||||
///
|
||||
/// Compute kernels (and other actions) after this barrier may read from buffers
|
||||
|
@ -202,16 +224,16 @@ pub trait CmdBuf<D: Device> {
|
|||
/// This is readily supported in Vulkan, but for portability it is remarkably
|
||||
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
|
||||
/// kernel, or organize the code not to need it.
|
||||
unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
|
||||
unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
|
||||
|
||||
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
|
||||
unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
|
||||
|
||||
unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
|
||||
unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
|
||||
|
||||
unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
|
||||
unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
|
||||
|
||||
// low portability, dx12 doesn't support it natively
|
||||
unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
|
||||
unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
|
||||
|
||||
/// Reset the query pool.
|
||||
///
|
||||
|
@ -227,7 +249,7 @@ pub trait CmdBuf<D: Device> {
|
|||
unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
|
||||
|
||||
/// Begin a labeled section for debugging and profiling purposes.
|
||||
unsafe fn begin_debug_label(&mut self, label: &str) {}
|
||||
unsafe fn begin_debug_label(&mut self, _label: &str) {}
|
||||
|
||||
/// End a section opened by `begin_debug_label`.
|
||||
unsafe fn end_debug_label(&mut self) {}
|
||||
|
|
|
@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
|
|||
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
|
||||
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
|
||||
|
||||
use self::{
|
||||
descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
|
||||
|
@ -76,6 +76,7 @@ pub struct CmdBuf {
|
|||
c: wrappers::GraphicsCommandList,
|
||||
allocator: CommandAllocator,
|
||||
needs_reset: bool,
|
||||
end_query: Option<(wrappers::QueryHeap, u32)>,
|
||||
}
|
||||
|
||||
pub struct Pipeline {
|
||||
|
@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
|
|||
c,
|
||||
allocator,
|
||||
needs_reset: false,
|
||||
end_query: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
|
|||
let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
|
||||
std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
|
||||
self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
|
||||
let ts0 = buf[0];
|
||||
let tsp = (self.ts_freq as f64).recip();
|
||||
let result = buf[1..]
|
||||
let result = buf
|
||||
.iter()
|
||||
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
|
||||
.map(|ts| *ts as f64 * tsp)
|
||||
.collect();
|
||||
Ok(result)
|
||||
}
|
||||
|
@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
|
|||
self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
|
||||
}
|
||||
|
||||
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
|
||||
if let Some((pool, start, end)) = &desc.timer_queries {
|
||||
#[allow(irrefutable_let_patterns)]
|
||||
if let crate::hub::QueryPool::Dx12(pool) = pool {
|
||||
self.write_timestamp(pool, *start);
|
||||
self.end_query = Some((pool.heap.clone(), *end));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn dispatch(
|
||||
&mut self,
|
||||
pipeline: &Pipeline,
|
||||
|
@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
|
|||
.dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
|
||||
}
|
||||
|
||||
unsafe fn end_compute_pass(&mut self) {
|
||||
if let Some((heap, end)) = self.end_query.take() {
|
||||
self.c.end_timing_query(&heap, end);
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn memory_barrier(&mut self) {
|
||||
// See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
|
||||
// The "proper" way to do this would be to name the actual buffers participating
|
||||
|
@ -666,7 +683,7 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
|
|||
self.memory_barrier();
|
||||
}
|
||||
|
||||
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
|
||||
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
|
||||
let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
|
||||
let (gpu_ref, heap) = buffer
|
||||
.gpu_ref
|
||||
|
@ -684,23 +701,23 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
|
|||
);
|
||||
}
|
||||
|
||||
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
|
||||
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
|
||||
// TODO: consider using copy_resource here (if sizes match)
|
||||
let size = src.size.min(dst.size);
|
||||
self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
|
||||
}
|
||||
|
||||
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
|
||||
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
|
||||
self.c
|
||||
.copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
|
||||
}
|
||||
|
||||
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
|
||||
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
|
||||
self.c
|
||||
.copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
|
||||
}
|
||||
|
||||
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
|
||||
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
|
||||
self.c.copy_resource(&src.resource, &dst.resource);
|
||||
}
|
||||
|
||||
|
|
|
@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
|
|||
#[derive(Clone)]
|
||||
pub struct ShaderByteCode {
|
||||
pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
|
||||
blob: Option<Blob>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
|
@ -741,7 +740,6 @@ impl ShaderByteCode {
|
|||
BytecodeLength: blob.0.GetBufferSize(),
|
||||
pShaderBytecode: blob.0.GetBufferPointer(),
|
||||
},
|
||||
blob: Some(blob),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -810,7 +808,6 @@ impl ShaderByteCode {
|
|||
BytecodeLength: bytecode.len(),
|
||||
pShaderBytecode: bytecode.as_ptr() as *const _,
|
||||
},
|
||||
blob: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak};
|
|||
use bytemuck::Pod;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
|
||||
use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
|
||||
|
||||
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
|
||||
|
||||
|
@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> {
|
|||
size: u64,
|
||||
}
|
||||
|
||||
/// A sub-object of a command buffer for a sequence of compute dispatches.
|
||||
pub struct ComputePass<'a> {
|
||||
cmd_buf: &'a mut CmdBuf,
|
||||
}
|
||||
|
||||
impl Session {
|
||||
/// Create a new session, choosing the best backend.
|
||||
pub fn new(device: mux::Device) -> Session {
|
||||
|
@ -370,8 +375,17 @@ impl Session {
|
|||
///
|
||||
/// This should be called after waiting on the command buffer that wrote the
|
||||
/// timer queries.
|
||||
///
|
||||
/// The returned vector is one shorter than the number of timer queries in the
|
||||
/// pool; the first value is subtracted off. It would likely be better to return
|
||||
/// the raw timestamps, but that change should be made consistently.
|
||||
pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
|
||||
self.0.device.fetch_query_pool(pool)
|
||||
let result = self.0.device.fetch_query_pool(pool)?;
|
||||
// Subtract off first timestamp.
|
||||
Ok(result[1..]
|
||||
.iter()
|
||||
.map(|ts| *ts as f64 - result[0])
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
|
@ -471,23 +485,10 @@ impl CmdBuf {
|
|||
self.cmd_buf().finish();
|
||||
}
|
||||
|
||||
/// Dispatch a compute shader.
|
||||
///
|
||||
/// Request a compute shader to be run, using the pipeline to specify the
|
||||
/// code, and the descriptor set to address the resources read and written.
|
||||
///
|
||||
/// Both the workgroup count (number of workgroups) and the workgroup size
|
||||
/// (number of threads in a workgroup) must be specified here, though not
|
||||
/// all back-ends require the latter info.
|
||||
pub unsafe fn dispatch(
|
||||
&mut self,
|
||||
pipeline: &Pipeline,
|
||||
descriptor_set: &DescriptorSet,
|
||||
workgroup_count: (u32, u32, u32),
|
||||
workgroup_size: (u32, u32, u32),
|
||||
) {
|
||||
self.cmd_buf()
|
||||
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
|
||||
/// Begin a compute pass.
|
||||
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
|
||||
self.cmd_buf().begin_compute_pass(desc);
|
||||
ComputePass { cmd_buf: self }
|
||||
}
|
||||
|
||||
/// Insert an execution and memory barrier.
|
||||
|
@ -582,13 +583,6 @@ impl CmdBuf {
|
|||
self.cmd_buf().reset_query_pool(pool);
|
||||
}
|
||||
|
||||
/// Write a timestamp.
|
||||
///
|
||||
/// The query index must be less than the size of the query pool on creation.
|
||||
pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
|
||||
self.cmd_buf().write_timestamp(pool, query);
|
||||
}
|
||||
|
||||
/// Prepare the timestamps for reading. This isn't required on Vulkan but
|
||||
/// is required on (at least) DX12.
|
||||
///
|
||||
|
@ -692,6 +686,51 @@ impl Drop for SubmittedCmdBuf {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> ComputePass<'a> {
|
||||
/// Dispatch a compute shader.
|
||||
///
|
||||
/// Request a compute shader to be run, using the pipeline to specify the
|
||||
/// code, and the descriptor set to address the resources read and written.
|
||||
///
|
||||
/// Both the workgroup count (number of workgroups) and the workgroup size
|
||||
/// (number of threads in a workgroup) must be specified here, though not
|
||||
/// all back-ends require the latter info.
|
||||
pub unsafe fn dispatch(
|
||||
&mut self,
|
||||
pipeline: &Pipeline,
|
||||
descriptor_set: &DescriptorSet,
|
||||
workgroup_count: (u32, u32, u32),
|
||||
workgroup_size: (u32, u32, u32),
|
||||
) {
|
||||
self.cmd_buf
|
||||
.cmd_buf()
|
||||
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
|
||||
}
|
||||
|
||||
/// Add a memory barrier.
|
||||
///
|
||||
/// Inserts a memory barrier in the compute encoder. This is a convenience
|
||||
/// function for calling the same function on the underlying command buffer,
|
||||
/// avoiding borrow check issues.
|
||||
pub unsafe fn memory_barrier(&mut self) {
|
||||
self.cmd_buf.memory_barrier();
|
||||
}
|
||||
|
||||
/// Begin a labeled section for debugging and profiling purposes.
|
||||
pub unsafe fn begin_debug_label(&mut self, label: &str) {
|
||||
self.cmd_buf.begin_debug_label(label);
|
||||
}
|
||||
|
||||
/// End a section opened by `begin_debug_label`.
|
||||
pub unsafe fn end_debug_label(&mut self) {
|
||||
self.cmd_buf.end_debug_label();
|
||||
}
|
||||
|
||||
pub unsafe fn end(self) {
|
||||
self.cmd_buf.cmd_buf().end_compute_pass();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for BufferInner {
|
||||
fn drop(&mut self) {
|
||||
if let Some(session) = Weak::upgrade(&self.session) {
|
||||
|
|
|
@ -21,8 +21,8 @@ pub use crate::mux::{
|
|||
};
|
||||
pub use bufwrite::BufWrite;
|
||||
pub use hub::{
|
||||
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
|
||||
Session, SubmittedCmdBuf,
|
||||
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
|
||||
RetainResource, Session, SubmittedCmdBuf,
|
||||
};
|
||||
|
||||
// TODO: because these are conditionally included, "cargo fmt" does not
|
||||
|
@ -189,3 +189,23 @@ pub struct WorkgroupLimits {
|
|||
/// dimension.
|
||||
pub max_invocations: u32,
|
||||
}
|
||||
|
||||
/// Options for creating a compute pass.
|
||||
#[derive(Default)]
|
||||
pub struct ComputePassDescriptor<'a> {
|
||||
// Maybe label should go here? It does in wgpu and wgpu_hal.
|
||||
/// Timer query parameters.
|
||||
///
|
||||
/// To record timer queries for a compute pass, set the query pool, start
|
||||
/// query index, and end query index here. The indices must be less than
|
||||
/// the size of the query pool.
|
||||
timer_queries: Option<(&'a QueryPool, u32, u32)>,
|
||||
}
|
||||
|
||||
impl<'a> ComputePassDescriptor<'a> {
|
||||
pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
|
||||
ComputePassDescriptor {
|
||||
timer_queries: Some((pool, start_query, end_query)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,25 +15,32 @@
|
|||
// Also licensed under MIT license, at your choice.
|
||||
|
||||
mod clear;
|
||||
mod timer;
|
||||
mod util;
|
||||
|
||||
use std::mem;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use block::Block;
|
||||
use cocoa_foundation::base::id;
|
||||
use cocoa_foundation::foundation::{NSInteger, NSUInteger};
|
||||
use foreign_types::ForeignType;
|
||||
use objc::rc::autoreleasepool;
|
||||
use objc::runtime::{Object, BOOL, YES};
|
||||
use objc::{class, msg_send, sel, sel_impl};
|
||||
|
||||
use metal::{CGFloat, MTLFeatureSet};
|
||||
use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
|
||||
|
||||
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
|
||||
|
||||
use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
|
||||
use crate::{
|
||||
BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
|
||||
};
|
||||
|
||||
use util::*;
|
||||
|
||||
use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
|
||||
|
||||
pub struct MtlInstance;
|
||||
|
||||
pub struct MtlDevice {
|
||||
|
@ -41,6 +48,18 @@ pub struct MtlDevice {
|
|||
cmd_queue: Arc<Mutex<metal::CommandQueue>>,
|
||||
gpu_info: GpuInfo,
|
||||
helpers: Arc<Helpers>,
|
||||
timer_set: Option<CounterSet>,
|
||||
counter_style: CounterStyle,
|
||||
}
|
||||
|
||||
/// Type of counter sampling.
|
||||
///
|
||||
/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
enum CounterStyle {
|
||||
None,
|
||||
Stage,
|
||||
Command,
|
||||
}
|
||||
|
||||
pub struct MtlSurface {
|
||||
|
@ -81,9 +100,22 @@ pub struct Semaphore;
|
|||
pub struct CmdBuf {
|
||||
cmd_buf: metal::CommandBuffer,
|
||||
helpers: Arc<Helpers>,
|
||||
cur_encoder: Encoder,
|
||||
time_calibration: Arc<Mutex<TimeCalibration>>,
|
||||
counter_style: CounterStyle,
|
||||
}
|
||||
|
||||
pub struct QueryPool;
|
||||
enum Encoder {
|
||||
None,
|
||||
Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
|
||||
Blit(metal::BlitCommandEncoder),
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct QueryPool {
|
||||
counter_sample_buf: Option<CounterSampleBuffer>,
|
||||
calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
|
||||
}
|
||||
|
||||
pub struct Pipeline(metal::ComputePipelineState);
|
||||
|
||||
|
@ -209,18 +241,43 @@ impl MtlDevice {
|
|||
let helpers = Arc::new(Helpers {
|
||||
clear_pipeline: clear::make_clear_pipeline(&device),
|
||||
});
|
||||
// Timer stuff
|
||||
let timer_set = CounterSet::get_timer_counter_set(&device);
|
||||
let counter_style = if timer_set.is_some() {
|
||||
if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
|
||||
CounterStyle::Stage
|
||||
} else if device
|
||||
.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
|
||||
{
|
||||
CounterStyle::Command
|
||||
} else {
|
||||
CounterStyle::None
|
||||
}
|
||||
} else {
|
||||
CounterStyle::None
|
||||
};
|
||||
MtlDevice {
|
||||
device,
|
||||
cmd_queue: Arc::new(Mutex::new(cmd_queue)),
|
||||
gpu_info,
|
||||
helpers,
|
||||
timer_set,
|
||||
counter_style,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
|
||||
let cmd_buf = raw_cmd_buf;
|
||||
let helpers = self.helpers.clone();
|
||||
CmdBuf { cmd_buf, helpers }
|
||||
let cur_encoder = Encoder::None;
|
||||
let time_calibration = Default::default();
|
||||
CmdBuf {
|
||||
cmd_buf,
|
||||
helpers,
|
||||
cur_encoder,
|
||||
time_calibration,
|
||||
counter_style: self.counter_style,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
|
||||
|
@ -330,11 +387,35 @@ impl crate::backend::Device for MtlDevice {
|
|||
|
||||
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
|
||||
let cmd_queue = self.cmd_queue.lock().unwrap();
|
||||
// A discussion about autorelease pools.
|
||||
//
|
||||
// Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
|
||||
// you can have any two of correctness, ergonomics, and performance. Here we've
|
||||
// chosen the first two, using the pattern of a fine grained autorelease pool
|
||||
// to give the Obj-C object Rust-like lifetime semantics whenever objects are
|
||||
// created as autorelease (by convention, this is any object creation with an
|
||||
// Obj-C method name that doesn't begin with "new" or "alloc").
|
||||
//
|
||||
// To gain back some of the performance, we'd need a way to wrap an autorelease
|
||||
// pool over a chunk of work - that could be one frame of rendering, but for
|
||||
// tests that iterate a number of command buffer submissions, it would need to
|
||||
// be around that. On non-mac platforms, it would be a no-op.
|
||||
//
|
||||
// In any case, this way, the caller doesn't need to worry, and the performance
|
||||
// hit might not be so bad (perhaps we should measure).
|
||||
|
||||
// consider new_command_buffer_with_unretained_references for performance
|
||||
let cmd_buf = cmd_queue.new_command_buffer();
|
||||
let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
|
||||
let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
|
||||
let helpers = self.helpers.clone();
|
||||
Ok(CmdBuf { cmd_buf, helpers })
|
||||
let cur_encoder = Encoder::None;
|
||||
let time_calibration = Default::default();
|
||||
Ok(CmdBuf {
|
||||
cmd_buf,
|
||||
helpers,
|
||||
cur_encoder,
|
||||
time_calibration,
|
||||
counter_style: self.counter_style,
|
||||
})
|
||||
}
|
||||
|
||||
unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
|
||||
|
@ -342,12 +423,31 @@ impl crate::backend::Device for MtlDevice {
|
|||
}
|
||||
|
||||
fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
|
||||
// TODO
|
||||
Ok(QueryPool)
|
||||
if let Some(timer_set) = &self.timer_set {
|
||||
let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
|
||||
.ok_or("error creating timer query pool")?;
|
||||
return Ok(QueryPool {
|
||||
counter_sample_buf: Some(pool),
|
||||
calibration: Default::default(),
|
||||
});
|
||||
}
|
||||
Ok(QueryPool::default())
|
||||
}
|
||||
|
||||
unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
|
||||
// TODO
|
||||
if let Some(raw) = &pool.counter_sample_buf {
|
||||
let resolved = raw.resolve();
|
||||
let calibration = pool.calibration.lock().unwrap();
|
||||
if let Some(calibration) = &*calibration {
|
||||
let calibration = calibration.lock().unwrap();
|
||||
let result = resolved
|
||||
.iter()
|
||||
.map(|time_ns| calibration.correlate(*time_ns))
|
||||
.collect();
|
||||
return Ok(result);
|
||||
}
|
||||
}
|
||||
// Maybe should return None indicating it wasn't successful? But that might break.
|
||||
Ok(Vec::new())
|
||||
}
|
||||
|
||||
|
@ -358,7 +458,37 @@ impl crate::backend::Device for MtlDevice {
|
|||
_signal_semaphores: &[&Self::Semaphore],
|
||||
fence: Option<&mut Self::Fence>,
|
||||
) -> Result<(), Error> {
|
||||
unsafe fn add_scheduled_handler(
|
||||
cmd_buf: &metal::CommandBufferRef,
|
||||
block: &Block<(&CommandBufferRef,), ()>,
|
||||
) {
|
||||
msg_send![cmd_buf, addScheduledHandler: block]
|
||||
}
|
||||
for cmd_buf in cmd_bufs {
|
||||
let time_calibration = cmd_buf.time_calibration.clone();
|
||||
let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
|
||||
let device: id = msg_send![buffer, device];
|
||||
let mut time_calibration = time_calibration.lock().unwrap();
|
||||
let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
|
||||
let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
|
||||
// TODO: only do this if supported.
|
||||
let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
|
||||
})
|
||||
.copy();
|
||||
add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
|
||||
let time_calibration = cmd_buf.time_calibration.clone();
|
||||
let completed_block =
|
||||
block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
|
||||
let device: id = msg_send![buffer, device];
|
||||
let mut time_calibration = time_calibration.lock().unwrap();
|
||||
let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
|
||||
let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
|
||||
// TODO: only do this if supported.
|
||||
let () =
|
||||
msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
|
||||
})
|
||||
.copy();
|
||||
cmd_buf.cmd_buf.add_completed_handler(&completed_block);
|
||||
cmd_buf.cmd_buf.commit();
|
||||
}
|
||||
if let Some(last_cmd_buf) = cmd_bufs.last() {
|
||||
|
@ -439,12 +569,70 @@ impl crate::backend::Device for MtlDevice {
|
|||
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
|
||||
unsafe fn begin(&mut self) {}
|
||||
|
||||
unsafe fn finish(&mut self) {}
|
||||
unsafe fn finish(&mut self) {
|
||||
self.flush_encoder();
|
||||
}
|
||||
|
||||
unsafe fn reset(&mut self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
|
||||
// TODO: we might want to get better about validation but the following
|
||||
// assert is likely to trigger, and also a case can be made that
|
||||
// validation should be done at the hub level, for consistency.
|
||||
//debug_assert!(matches!(self.cur_encoder, Encoder::None));
|
||||
self.flush_encoder();
|
||||
autoreleasepool(|| {
|
||||
let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
|
||||
(Some(queries), CounterStyle::Stage) => {
|
||||
let descriptor: id =
|
||||
msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
|
||||
let attachments: id = msg_send![descriptor, sampleBufferAttachments];
|
||||
let index: NSUInteger = 0;
|
||||
let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
|
||||
// Here we break the hub/mux separation a bit, for expedience
|
||||
#[allow(irrefutable_let_patterns)]
|
||||
if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
|
||||
if let Some(sample_buf) = &query_pool.counter_sample_buf {
|
||||
let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
|
||||
}
|
||||
}
|
||||
let start_index = queries.1 as NSUInteger;
|
||||
let end_index = queries.2 as NSInteger;
|
||||
let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
|
||||
let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
|
||||
(
|
||||
msg_send![
|
||||
self.cmd_buf,
|
||||
computeCommandEncoderWithDescriptor: descriptor
|
||||
],
|
||||
None,
|
||||
)
|
||||
}
|
||||
(Some(queries), CounterStyle::Command) => {
|
||||
let encoder = self.cmd_buf.new_compute_command_encoder();
|
||||
#[allow(irrefutable_let_patterns)]
|
||||
let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
|
||||
if let Some(sample_buf) = &query_pool.counter_sample_buf {
|
||||
let sample_index = queries.1 as NSUInteger;
|
||||
let sample_buf = sample_buf.id();
|
||||
let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
|
||||
Some((sample_buf, queries.2))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
(encoder, end_query)
|
||||
}
|
||||
_ => (self.cmd_buf.new_compute_command_encoder(), None),
|
||||
};
|
||||
self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
|
||||
});
|
||||
}
|
||||
|
||||
unsafe fn dispatch(
|
||||
&mut self,
|
||||
pipeline: &Pipeline,
|
||||
|
@ -452,7 +640,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
|
|||
workgroup_count: (u32, u32, u32),
|
||||
workgroup_size: (u32, u32, u32),
|
||||
) {
|
||||
let encoder = self.cmd_buf.new_compute_command_encoder();
|
||||
let encoder = self.compute_command_encoder();
|
||||
encoder.set_compute_pipeline_state(&pipeline.0);
|
||||
let mut buf_ix = 0;
|
||||
for buffer in &descriptor_set.buffers {
|
||||
|
@ -475,7 +663,11 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
|
|||
depth: workgroup_size.2 as u64,
|
||||
};
|
||||
encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
|
||||
encoder.end_encoding();
|
||||
}
|
||||
|
||||
unsafe fn end_compute_pass(&mut self) {
|
||||
// TODO: might validate that we are in a compute encoder state
|
||||
self.flush_encoder();
|
||||
}
|
||||
|
||||
unsafe fn memory_barrier(&mut self) {
|
||||
|
@ -494,22 +686,23 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
|
|||
// I think these are being tracked.
|
||||
}
|
||||
|
||||
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
|
||||
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
|
||||
let size = size.unwrap_or(buffer.size);
|
||||
let encoder = self.cmd_buf.new_compute_command_encoder();
|
||||
clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size);
|
||||
encoder.end_encoding()
|
||||
let _ = self.compute_command_encoder();
|
||||
// Getting this directly is a workaround for a borrow checker issue.
|
||||
if let Encoder::Compute(e, _) = &self.cur_encoder {
|
||||
clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
|
||||
let encoder = self.cmd_buf.new_blit_command_encoder();
|
||||
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
|
||||
let encoder = self.blit_command_encoder();
|
||||
let size = src.size.min(dst.size);
|
||||
encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
|
||||
encoder.end_encoding();
|
||||
}
|
||||
|
||||
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
|
||||
let encoder = self.cmd_buf.new_blit_command_encoder();
|
||||
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
|
||||
let encoder = self.blit_command_encoder();
|
||||
assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
|
||||
let bytes_per_row = (src.width * 4) as NSUInteger;
|
||||
let src_size = metal::MTLSize {
|
||||
|
@ -530,11 +723,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
|
|||
bytes_per_row * src.height as NSUInteger,
|
||||
metal::MTLBlitOption::empty(),
|
||||
);
|
||||
encoder.end_encoding();
|
||||
}
|
||||
|
||||
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
|
||||
let encoder = self.cmd_buf.new_blit_command_encoder();
|
||||
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
|
||||
let encoder = self.blit_command_encoder();
|
||||
assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
|
||||
let bytes_per_row = (dst.width * 4) as NSUInteger;
|
||||
let src_size = metal::MTLSize {
|
||||
|
@ -555,11 +747,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
|
|||
origin,
|
||||
metal::MTLBlitOption::empty(),
|
||||
);
|
||||
encoder.end_encoding();
|
||||
}
|
||||
|
||||
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
|
||||
let encoder = self.cmd_buf.new_blit_command_encoder();
|
||||
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
|
||||
let encoder = self.blit_command_encoder();
|
||||
let src_size = metal::MTLSize {
|
||||
width: src.width.min(dst.width) as NSUInteger,
|
||||
height: src.width.min(dst.height) as NSUInteger,
|
||||
|
@ -577,15 +768,79 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
|
|||
0,
|
||||
origin,
|
||||
);
|
||||
encoder.end_encoding();
|
||||
}
|
||||
|
||||
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
|
||||
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
|
||||
let mut calibration = pool.calibration.lock().unwrap();
|
||||
*calibration = Some(self.time_calibration.clone());
|
||||
}
|
||||
|
||||
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
|
||||
// TODO
|
||||
// This really a PITA because it's pretty different than Vulkan.
|
||||
// See https://developer.apple.com/documentation/metal/counter_sampling
|
||||
if let Some(buf) = &pool.counter_sample_buf {
|
||||
if matches!(self.cur_encoder, Encoder::None) {
|
||||
self.cur_encoder =
|
||||
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
|
||||
}
|
||||
let sample_index = query as NSUInteger;
|
||||
if self.counter_style == CounterStyle::Command {
|
||||
match &self.cur_encoder {
|
||||
Encoder::Compute(e, _) => {
|
||||
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
|
||||
}
|
||||
Encoder::None => unreachable!(),
|
||||
_ => todo!(),
|
||||
}
|
||||
} else if self.counter_style == CounterStyle::Stage {
|
||||
match &self.cur_encoder {
|
||||
Encoder::Compute(_e, _) => {
|
||||
println!("write_timestamp is not supported for stage-style encoders");
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CmdBuf {
|
||||
fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
|
||||
if !matches!(self.cur_encoder, Encoder::Compute(..)) {
|
||||
self.flush_encoder();
|
||||
self.cur_encoder =
|
||||
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
|
||||
}
|
||||
if let Encoder::Compute(e, _) = &self.cur_encoder {
|
||||
e
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
|
||||
if !matches!(self.cur_encoder, Encoder::Blit(_)) {
|
||||
self.flush_encoder();
|
||||
self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
|
||||
}
|
||||
if let Encoder::Blit(e) = &self.cur_encoder {
|
||||
e
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
fn flush_encoder(&mut self) {
|
||||
match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
|
||||
Encoder::Compute(e, Some((sample_buf, end_query))) => {
|
||||
let sample_index = end_query as NSUInteger;
|
||||
unsafe {
|
||||
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
|
||||
}
|
||||
e.end_encoding();
|
||||
}
|
||||
Encoder::Compute(e, None) => e.end_encoding(),
|
||||
Encoder::Blit(e) => e.end_encoding(),
|
||||
Encoder::None => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
172
piet-gpu-hal/src/metal/timer.rs
Normal file
172
piet-gpu-hal/src/metal/timer.rs
Normal file
|
@ -0,0 +1,172 @@
|
|||
// Copyright 2021 The piet-gpu authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Also licensed under MIT license, at your choice.
|
||||
|
||||
//! Support for timer queries.
|
||||
//!
|
||||
//! Likely some of this should be upstreamed into metal-rs.
|
||||
|
||||
use std::{ffi::CStr, ptr::null_mut};
|
||||
|
||||
use cocoa_foundation::{
|
||||
base::id,
|
||||
foundation::{NSRange, NSUInteger},
|
||||
};
|
||||
use metal::{DeviceRef, MTLStorageMode};
|
||||
use objc::{class, msg_send, sel, sel_impl};
|
||||
|
||||
pub struct CounterSampleBuffer {
|
||||
id: id,
|
||||
count: u64,
|
||||
}
|
||||
|
||||
pub struct CounterSet {
|
||||
id: id,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct TimeCalibration {
|
||||
pub cpu_start_ts: u64,
|
||||
pub gpu_start_ts: u64,
|
||||
pub cpu_end_ts: u64,
|
||||
pub gpu_end_ts: u64,
|
||||
}
|
||||
|
||||
impl Drop for CounterSampleBuffer {
|
||||
fn drop(&mut self) {
|
||||
unsafe { msg_send![self.id, release] }
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for CounterSampleBuffer {
|
||||
fn clone(&self) -> CounterSampleBuffer {
|
||||
unsafe {
|
||||
CounterSampleBuffer {
|
||||
id: msg_send![self.id, retain],
|
||||
count: self.count,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CounterSampleBuffer {
|
||||
pub fn id(&self) -> id {
|
||||
self.id
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for CounterSet {
|
||||
fn drop(&mut self) {
|
||||
unsafe { msg_send![self.id, release] }
|
||||
}
|
||||
}
|
||||
|
||||
impl CounterSet {
|
||||
pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
|
||||
unsafe {
|
||||
// TODO: version check
|
||||
let sets: id = msg_send!(device, counterSets);
|
||||
let count: NSUInteger = msg_send![sets, count];
|
||||
for i in 0..count {
|
||||
let set: id = msg_send![sets, objectAtIndex: i];
|
||||
let name: id = msg_send![set, name];
|
||||
let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
|
||||
if name_cstr.to_bytes() == b"timestamp" {
|
||||
return Some(CounterSet { id: set });
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copied from metal-rs; should be in common utilities maybe?
|
||||
fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
|
||||
let bytes = unsafe {
|
||||
let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
|
||||
bytes as *const u8
|
||||
};
|
||||
let len: NSUInteger = unsafe { msg_send![nsstr, length] };
|
||||
unsafe {
|
||||
let bytes = std::slice::from_raw_parts(bytes, len as usize);
|
||||
std::str::from_utf8(bytes).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl CounterSampleBuffer {
|
||||
pub fn new(
|
||||
device: &DeviceRef,
|
||||
count: u64,
|
||||
counter_set: &CounterSet,
|
||||
) -> Option<CounterSampleBuffer> {
|
||||
unsafe {
|
||||
let desc_cls = class!(MTLCounterSampleBufferDescriptor);
|
||||
let descriptor: id = msg_send![desc_cls, alloc];
|
||||
let _: id = msg_send![descriptor, init];
|
||||
let count = count as NSUInteger;
|
||||
let () = msg_send![descriptor, setSampleCount: count];
|
||||
let () = msg_send![descriptor, setCounterSet: counter_set.id];
|
||||
let () = msg_send![
|
||||
descriptor,
|
||||
setStorageMode: MTLStorageMode::Shared as NSUInteger
|
||||
];
|
||||
let mut error: id = null_mut();
|
||||
let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
|
||||
let () = msg_send![descriptor, release];
|
||||
if !error.is_null() {
|
||||
let description = msg_send![error, localizedDescription];
|
||||
println!(
|
||||
"error allocating sample buffer, code = {}",
|
||||
nsstring_as_str(description)
|
||||
);
|
||||
let () = msg_send![error, release];
|
||||
return None;
|
||||
}
|
||||
Some(CounterSampleBuffer { id: buf, count })
|
||||
}
|
||||
}
|
||||
|
||||
// Read the timestamps.
|
||||
//
|
||||
// Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
|
||||
pub unsafe fn resolve(&self) -> &[u64] {
|
||||
let range = NSRange::new(0, self.count);
|
||||
let data: id = msg_send![self.id, resolveCounterRange: range];
|
||||
if data.is_null() {
|
||||
&[]
|
||||
} else {
|
||||
let bytes: *const u64 = msg_send![data, bytes];
|
||||
std::slice::from_raw_parts(bytes, self.count as usize)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TimeCalibration {
|
||||
/// Convert GPU timestamp into CPU time base.
|
||||
///
|
||||
/// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
|
||||
pub fn correlate(&self, raw_ts: u64) -> f64 {
|
||||
let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
|
||||
let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
|
||||
let adj_ts = if delta_gpu > 0 {
|
||||
let scale = delta_cpu as f64 / delta_gpu as f64;
|
||||
self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
|
||||
} else {
|
||||
// Default is ns on Apple Silicon; on other hardware this will be wrong
|
||||
raw_ts as f64
|
||||
};
|
||||
adj_ts * 1e-9
|
||||
}
|
||||
}
|
|
@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
|
|||
use crate::backend::Device as DeviceTrait;
|
||||
use crate::BackendType;
|
||||
use crate::BindType;
|
||||
use crate::ComputePassDescriptor;
|
||||
use crate::ImageFormat;
|
||||
use crate::MapMode;
|
||||
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
|
||||
|
@ -658,6 +659,14 @@ impl CmdBuf {
|
|||
}
|
||||
}
|
||||
|
||||
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
|
||||
mux_match! { self;
|
||||
CmdBuf::Vk(c) => c.begin_compute_pass(desc),
|
||||
CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
|
||||
CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
|
||||
}
|
||||
}
|
||||
|
||||
/// Dispatch a compute shader.
|
||||
///
|
||||
/// Note that both the number of workgroups (`workgroup_count`) and the number of
|
||||
|
@ -680,6 +689,14 @@ impl CmdBuf {
|
|||
}
|
||||
}
|
||||
|
||||
pub unsafe fn end_compute_pass(&mut self) {
|
||||
mux_match! { self;
|
||||
CmdBuf::Vk(c) => c.end_compute_pass(),
|
||||
CmdBuf::Dx12(c) => c.end_compute_pass(),
|
||||
CmdBuf::Mtl(c) => c.end_compute_pass(),
|
||||
}
|
||||
}
|
||||
|
||||
pub unsafe fn memory_barrier(&mut self) {
|
||||
mux_match! { self;
|
||||
CmdBuf::Vk(c) => c.memory_barrier(),
|
||||
|
|
|
@ -15,7 +15,7 @@ use smallvec::SmallVec;
|
|||
use crate::backend::Device as DeviceTrait;
|
||||
use crate::{
|
||||
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
|
||||
WorkgroupLimits,
|
||||
WorkgroupLimits, ComputePassDescriptor,
|
||||
};
|
||||
|
||||
pub struct VkInstance {
|
||||
|
@ -92,6 +92,7 @@ pub struct CmdBuf {
|
|||
cmd_buf: vk::CommandBuffer,
|
||||
cmd_pool: vk::CommandPool,
|
||||
device: Arc<RawDevice>,
|
||||
end_query: Option<(vk::QueryPool, u32)>,
|
||||
}
|
||||
|
||||
pub struct QueryPool {
|
||||
|
@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
|
|||
cmd_buf,
|
||||
cmd_pool,
|
||||
device: self.device.clone(),
|
||||
end_query: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
|
|||
// results (Windows 10, AMD 5700 XT).
|
||||
let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
|
||||
device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
|
||||
let ts0 = buf[0];
|
||||
let tsp = self.timestamp_period as f64 * 1e-9;
|
||||
let result = buf[1..]
|
||||
let result = buf
|
||||
.iter()
|
||||
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
|
||||
.map(|ts| *ts as f64 * tsp)
|
||||
.collect();
|
||||
Ok(result)
|
||||
}
|
||||
|
@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
true
|
||||
}
|
||||
|
||||
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
|
||||
if let Some((pool, start, end)) = &desc.timer_queries {
|
||||
#[allow(irrefutable_let_patterns)]
|
||||
if let crate::hub::QueryPool::Vk(pool) = pool {
|
||||
self.write_timestamp_raw(pool.pool, *start);
|
||||
self.end_query = Some((pool.pool, *end));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn dispatch(
|
||||
&mut self,
|
||||
pipeline: &Pipeline,
|
||||
|
@ -931,6 +942,12 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
);
|
||||
}
|
||||
|
||||
unsafe fn end_compute_pass(&mut self) {
|
||||
if let Some((pool, end)) = self.end_query.take() {
|
||||
self.write_timestamp_raw(pool, end);
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert a pipeline barrier for all memory accesses.
|
||||
unsafe fn memory_barrier(&mut self) {
|
||||
let device = &self.device.device;
|
||||
|
@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
);
|
||||
}
|
||||
|
||||
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
|
||||
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
|
||||
let device = &self.device.device;
|
||||
let size = size.unwrap_or(vk::WHOLE_SIZE);
|
||||
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
|
||||
}
|
||||
|
||||
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
|
||||
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
|
||||
let device = &self.device.device;
|
||||
let size = src.size.min(dst.size);
|
||||
device.cmd_copy_buffer(
|
||||
|
@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
);
|
||||
}
|
||||
|
||||
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
|
||||
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
|
||||
let device = &self.device.device;
|
||||
device.cmd_copy_image_to_buffer(
|
||||
self.cmd_buf,
|
||||
|
@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
);
|
||||
}
|
||||
|
||||
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
|
||||
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
|
||||
let device = &self.device.device;
|
||||
device.cmd_copy_buffer_to_image(
|
||||
self.cmd_buf,
|
||||
|
@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
);
|
||||
}
|
||||
|
||||
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
|
||||
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
|
||||
let device = &self.device.device;
|
||||
device.cmd_blit_image(
|
||||
self.cmd_buf,
|
||||
|
@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
}
|
||||
|
||||
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
|
||||
let device = &self.device.device;
|
||||
device.cmd_write_timestamp(
|
||||
self.cmd_buf,
|
||||
vk::PipelineStageFlags::COMPUTE_SHADER,
|
||||
pool.pool,
|
||||
query,
|
||||
);
|
||||
self.write_timestamp_raw(pool.pool, query);
|
||||
}
|
||||
|
||||
unsafe fn begin_debug_label(&mut self, label: &str) {
|
||||
|
@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
|
|||
}
|
||||
}
|
||||
|
||||
impl CmdBuf {
|
||||
unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
|
||||
let device = &self.device.device;
|
||||
device.cmd_write_timestamp(
|
||||
self.cmd_buf,
|
||||
vk::PipelineStageFlags::COMPUTE_SHADER,
|
||||
pool,
|
||||
query,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
|
||||
fn add_buffers(&mut self, buffers: &[&Buffer]) {
|
||||
self.buffers.extend(buffers.iter().map(|b| b.buffer));
|
||||
|
|
|
@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
|
|||
.map(|_| session.create_semaphore())
|
||||
.collect::<Result<Vec<_>, Error>>()?;
|
||||
let query_pools = (0..NUM_FRAMES)
|
||||
.map(|_| session.create_query_pool(8))
|
||||
.map(|_| session.create_query_pool(12))
|
||||
.collect::<Result<Vec<_>, Error>>()?;
|
||||
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
|
||||
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
|
||||
|
@ -112,22 +112,23 @@ fn main() -> Result<(), Error> {
|
|||
if !ts.is_empty() {
|
||||
info_string = format!(
|
||||
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
|
||||
ts[6] * 1e3,
|
||||
ts[10] * 1e3,
|
||||
ts[0] * 1e3,
|
||||
(ts[1] - ts[0]) * 1e3,
|
||||
(ts[2] - ts[1]) * 1e3,
|
||||
(ts[3] - ts[2]) * 1e3,
|
||||
(ts[4] - ts[3]) * 1e3,
|
||||
(ts[5] - ts[4]) * 1e3,
|
||||
(ts[6] - ts[5]) * 1e3,
|
||||
(ts[8] - ts[7]) * 1e3,
|
||||
(ts[10] - ts[9]) * 1e3,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let mut ctx = PietGpuRenderContext::new();
|
||||
let test_blend = false;
|
||||
if let Some(svg) = &svg {
|
||||
test_scenes::render_svg(&mut ctx, svg);
|
||||
} else {
|
||||
} else if test_blend {
|
||||
use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
|
||||
let blends = [
|
||||
Blend::new(Normal, SrcOver),
|
||||
|
@ -163,6 +164,8 @@ fn main() -> Result<(), Error> {
|
|||
let blend = blends[mode % blends.len()];
|
||||
test_scenes::render_blend_test(&mut ctx, current_frame, blend);
|
||||
info_string = format!("{:?}", blend);
|
||||
} else {
|
||||
test_scenes::render_anim_frame(&mut ctx, current_frame);
|
||||
}
|
||||
render_info_string(&mut ctx, &info_string);
|
||||
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
|
||||
|
|
|
@ -18,8 +18,8 @@ use piet::kurbo::Vec2;
|
|||
use piet::{ImageFormat, RenderContext};
|
||||
|
||||
use piet_gpu_hal::{
|
||||
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image,
|
||||
ImageLayout, Pipeline, QueryPool, Session,
|
||||
include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
|
||||
Error, Image, ImageLayout, Pipeline, QueryPool, Session,
|
||||
};
|
||||
|
||||
pub use pico_svg::PicoSvg;
|
||||
|
@ -424,10 +424,10 @@ impl Renderer {
|
|||
cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
|
||||
cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
|
||||
cmd_buf.reset_query_pool(&query_pool);
|
||||
cmd_buf.write_timestamp(&query_pool, 0);
|
||||
cmd_buf.begin_debug_label("Element bounding box calculation");
|
||||
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
|
||||
self.element_stage.record(
|
||||
cmd_buf,
|
||||
&mut pass,
|
||||
&self.element_code,
|
||||
&self.element_bindings[buf_ix],
|
||||
self.n_transform as u64,
|
||||
|
@ -435,56 +435,59 @@ impl Renderer {
|
|||
self.n_pathtag as u32,
|
||||
self.n_drawobj as u64,
|
||||
);
|
||||
pass.end();
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.write_timestamp(&query_pool, 1);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.begin_debug_label("Clip bounding box calculation");
|
||||
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
|
||||
pass.begin_debug_label("Clip bounding box calculation");
|
||||
self.clip_binding
|
||||
.record(cmd_buf, &self.clip_code, self.n_clip as u32);
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.begin_debug_label("Element binning");
|
||||
cmd_buf.dispatch(
|
||||
.record(&mut pass, &self.clip_code, self.n_clip as u32);
|
||||
pass.end_debug_label();
|
||||
pass.begin_debug_label("Element binning");
|
||||
pass.dispatch(
|
||||
&self.bin_pipeline,
|
||||
&self.bin_ds,
|
||||
(((self.n_paths + 255) / 256) as u32, 1, 1),
|
||||
(256, 1, 1),
|
||||
);
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.begin_debug_label("Tile allocation");
|
||||
cmd_buf.dispatch(
|
||||
pass.end_debug_label();
|
||||
pass.memory_barrier();
|
||||
pass.begin_debug_label("Tile allocation");
|
||||
pass.dispatch(
|
||||
&self.tile_pipeline,
|
||||
&self.tile_ds[buf_ix],
|
||||
(((self.n_paths + 255) / 256) as u32, 1, 1),
|
||||
(256, 1, 1),
|
||||
);
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.write_timestamp(&query_pool, 2);
|
||||
cmd_buf.memory_barrier();
|
||||
pass.end_debug_label();
|
||||
pass.end();
|
||||
cmd_buf.begin_debug_label("Path flattening");
|
||||
cmd_buf.dispatch(
|
||||
cmd_buf.memory_barrier();
|
||||
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
|
||||
pass.dispatch(
|
||||
&self.path_pipeline,
|
||||
&self.path_ds,
|
||||
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
|
||||
(32, 1, 1),
|
||||
);
|
||||
pass.end();
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.write_timestamp(&query_pool, 3);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.begin_debug_label("Backdrop propagation");
|
||||
cmd_buf.dispatch(
|
||||
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
|
||||
pass.dispatch(
|
||||
&self.backdrop_pipeline,
|
||||
&self.backdrop_ds,
|
||||
(((self.n_paths + 255) / 256) as u32, 1, 1),
|
||||
(256, self.backdrop_y, 1),
|
||||
);
|
||||
pass.end();
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.write_timestamp(&query_pool, 4);
|
||||
// TODO: redo query accounting
|
||||
cmd_buf.write_timestamp(&query_pool, 5);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.begin_debug_label("Coarse raster");
|
||||
cmd_buf.dispatch(
|
||||
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
|
||||
pass.dispatch(
|
||||
&self.coarse_pipeline,
|
||||
&self.coarse_ds[buf_ix],
|
||||
(
|
||||
|
@ -494,11 +497,13 @@ impl Renderer {
|
|||
),
|
||||
(256, 1, 1),
|
||||
);
|
||||
pass.end();
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.write_timestamp(&query_pool, 6);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.begin_debug_label("Fine raster");
|
||||
cmd_buf.dispatch(
|
||||
let mut pass =
|
||||
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
|
||||
pass.dispatch(
|
||||
&self.k4_pipeline,
|
||||
&self.k4_ds,
|
||||
(
|
||||
|
@ -508,8 +513,8 @@ impl Renderer {
|
|||
),
|
||||
(8, 4, 1),
|
||||
);
|
||||
pass.end();
|
||||
cmd_buf.end_debug_label();
|
||||
cmd_buf.write_timestamp(&query_pool, 7);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable};
|
|||
pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
|
||||
pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
|
||||
pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
|
||||
use piet_gpu_hal::{Buffer, CmdBuf, Session};
|
||||
use piet_gpu_hal::{Buffer, ComputePass, Session};
|
||||
pub use transform::{
|
||||
Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
|
||||
};
|
||||
|
@ -140,7 +140,7 @@ impl ElementStage {
|
|||
|
||||
pub unsafe fn record(
|
||||
&self,
|
||||
cmd_buf: &mut CmdBuf,
|
||||
pass: &mut ComputePass,
|
||||
code: &ElementCode,
|
||||
binding: &ElementBinding,
|
||||
n_transform: u64,
|
||||
|
@ -149,14 +149,14 @@ impl ElementStage {
|
|||
n_drawobj: u64,
|
||||
) {
|
||||
self.transform_stage.record(
|
||||
cmd_buf,
|
||||
pass,
|
||||
&code.transform_code,
|
||||
&binding.transform_binding,
|
||||
n_transform,
|
||||
);
|
||||
// No memory barrier needed here; path has at least one before pathseg
|
||||
self.path_stage.record(
|
||||
cmd_buf,
|
||||
pass,
|
||||
&code.path_code,
|
||||
&binding.path_binding,
|
||||
n_paths,
|
||||
|
@ -164,6 +164,6 @@ impl ElementStage {
|
|||
);
|
||||
// No memory barrier needed here; draw has at least one before draw_leaf
|
||||
self.draw_stage
|
||||
.record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
|
||||
.record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
//! The clip processing stage (includes substages).
|
||||
|
||||
use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
|
||||
use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
|
||||
|
||||
// Note that this isn't the code/stage/binding pattern of most of the other stages
|
||||
// in the new element processing pipeline. We want to move those temporary buffers
|
||||
|
@ -69,26 +69,26 @@ impl ClipBinding {
|
|||
/// Record the clip dispatches.
|
||||
///
|
||||
/// Assumes memory barrier on entry. Provides memory barrier on exit.
|
||||
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
|
||||
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
|
||||
let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
|
||||
if n_wg_reduce > 0 {
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.reduce_pipeline,
|
||||
&self.reduce_ds,
|
||||
(n_wg_reduce, 1, 1),
|
||||
(CLIP_PART_SIZE, 1, 1),
|
||||
);
|
||||
cmd_buf.memory_barrier();
|
||||
pass.memory_barrier();
|
||||
}
|
||||
let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
|
||||
if n_wg > 0 {
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.leaf_pipeline,
|
||||
&self.leaf_ds,
|
||||
(n_wg, 1, 1),
|
||||
(CLIP_PART_SIZE, 1, 1),
|
||||
);
|
||||
cmd_buf.memory_barrier();
|
||||
pass.memory_barrier();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
use bytemuck::{Pod, Zeroable};
|
||||
|
||||
use piet_gpu_hal::{
|
||||
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
|
||||
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
|
||||
};
|
||||
|
||||
/// The output element of the draw object stage.
|
||||
|
@ -130,7 +130,7 @@ impl DrawStage {
|
|||
|
||||
pub unsafe fn record(
|
||||
&self,
|
||||
cmd_buf: &mut CmdBuf,
|
||||
pass: &mut ComputePass,
|
||||
code: &DrawCode,
|
||||
binding: &DrawBinding,
|
||||
size: u64,
|
||||
|
@ -140,22 +140,22 @@ impl DrawStage {
|
|||
}
|
||||
let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
|
||||
if n_workgroups > 1 {
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.reduce_pipeline,
|
||||
&binding.reduce_ds,
|
||||
(n_workgroups as u32, 1, 1),
|
||||
(DRAW_WG as u32, 1, 1),
|
||||
);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.dispatch(
|
||||
pass.memory_barrier();
|
||||
pass.dispatch(
|
||||
&code.root_pipeline,
|
||||
&self.root_ds,
|
||||
(1, 1, 1),
|
||||
(DRAW_WG as u32, 1, 1),
|
||||
);
|
||||
}
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.dispatch(
|
||||
pass.memory_barrier();
|
||||
pass.dispatch(
|
||||
&code.leaf_pipeline,
|
||||
&binding.leaf_ds,
|
||||
(n_workgroups as u32, 1, 1),
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
//! The path stage (includes substages).
|
||||
|
||||
use piet_gpu_hal::{
|
||||
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
|
||||
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
|
||||
};
|
||||
|
||||
pub struct PathCode {
|
||||
|
@ -148,7 +148,7 @@ impl PathStage {
|
|||
/// those are consumed. Result is written without barrier.
|
||||
pub unsafe fn record(
|
||||
&self,
|
||||
cmd_buf: &mut CmdBuf,
|
||||
pass: &mut ComputePass,
|
||||
code: &PathCode,
|
||||
binding: &PathBinding,
|
||||
n_paths: u32,
|
||||
|
@ -166,15 +166,15 @@ impl PathStage {
|
|||
let reduce_part_tags = REDUCE_PART_SIZE * 4;
|
||||
let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
|
||||
if n_wg_tag_reduce > 1 {
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.reduce_pipeline,
|
||||
&binding.reduce_ds,
|
||||
(n_wg_tag_reduce, 1, 1),
|
||||
(REDUCE_WG, 1, 1),
|
||||
);
|
||||
// I think we can skip root if n_wg_tag_reduce == 2
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.dispatch(
|
||||
pass.memory_barrier();
|
||||
pass.dispatch(
|
||||
&code.tag_root_pipeline,
|
||||
&self.tag_root_ds,
|
||||
(1, 1, 1),
|
||||
|
@ -183,15 +183,15 @@ impl PathStage {
|
|||
// No barrier needed here; clear doesn't depend on path tags
|
||||
}
|
||||
let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.clear_pipeline,
|
||||
&binding.clear_ds,
|
||||
(n_wg_clear, 1, 1),
|
||||
(CLEAR_WG, 1, 1),
|
||||
);
|
||||
cmd_buf.memory_barrier();
|
||||
pass.memory_barrier();
|
||||
let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.pathseg_pipeline,
|
||||
&binding.path_ds,
|
||||
(n_wg_pathseg, 1, 1),
|
||||
|
|
|
@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable};
|
|||
|
||||
use piet::kurbo::Affine;
|
||||
use piet_gpu_hal::{
|
||||
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
|
||||
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
|
||||
};
|
||||
|
||||
/// An affine transform.
|
||||
|
@ -132,7 +132,7 @@ impl TransformStage {
|
|||
|
||||
pub unsafe fn record(
|
||||
&self,
|
||||
cmd_buf: &mut CmdBuf,
|
||||
pass: &mut ComputePass,
|
||||
code: &TransformCode,
|
||||
binding: &TransformBinding,
|
||||
size: u64,
|
||||
|
@ -142,22 +142,22 @@ impl TransformStage {
|
|||
}
|
||||
let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
|
||||
if n_workgroups > 1 {
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.reduce_pipeline,
|
||||
&binding.reduce_ds,
|
||||
(n_workgroups as u32, 1, 1),
|
||||
(TRANSFORM_WG as u32, 1, 1),
|
||||
);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.dispatch(
|
||||
pass.memory_barrier();
|
||||
pass.dispatch(
|
||||
&code.root_pipeline,
|
||||
&self.root_ds,
|
||||
(1, 1, 1),
|
||||
(TRANSFORM_WG as u32, 1, 1),
|
||||
);
|
||||
cmd_buf.memory_barrier();
|
||||
pass.memory_barrier();
|
||||
}
|
||||
cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.leaf_pipeline,
|
||||
&binding.leaf_ds,
|
||||
(n_workgroups as u32, 1, 1),
|
||||
|
|
|
@ -16,11 +16,11 @@
|
|||
|
||||
//! Utilities (and a benchmark) for clearing buffers with compute shaders.
|
||||
|
||||
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
|
||||
use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
|
||||
use piet_gpu_hal::{Buffer, Pipeline};
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::runner::{Commands, Runner};
|
||||
use crate::runner::Runner;
|
||||
use crate::test_result::TestResult;
|
||||
|
||||
const WG_SIZE: u64 = 256;
|
||||
|
@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
|
|||
let mut total_elapsed = 0.0;
|
||||
for i in 0..n_iter {
|
||||
let mut commands = runner.commands();
|
||||
commands.write_timestamp(0);
|
||||
stage.record(&mut commands, &code, &binding);
|
||||
commands.write_timestamp(1);
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
stage.record(&mut pass, &code, &binding);
|
||||
pass.end();
|
||||
if i == 0 || config.verify_all {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&out_buf);
|
||||
|
@ -108,17 +108,12 @@ impl ClearStage {
|
|||
ClearBinding { descriptor_set }
|
||||
}
|
||||
|
||||
pub unsafe fn record(
|
||||
&self,
|
||||
commands: &mut Commands,
|
||||
code: &ClearCode,
|
||||
bindings: &ClearBinding,
|
||||
) {
|
||||
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
|
||||
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
|
||||
// An issue: for clearing large buffers (>16M), we need to check the
|
||||
// number of workgroups against the (dynamically detected) limit, and
|
||||
// potentially issue multiple dispatches.
|
||||
commands.cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.pipeline,
|
||||
&bindings.descriptor_set,
|
||||
(n_workgroups as u32, 1, 1),
|
||||
|
|
|
@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
|
|||
let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
|
||||
|
||||
let mut commands = runner.commands();
|
||||
commands.write_timestamp(0);
|
||||
commands.upload(&memory);
|
||||
binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
binding.record(&mut pass, &code, n_clip as u32);
|
||||
pass.end();
|
||||
commands.download(&memory);
|
||||
commands.write_timestamp(1);
|
||||
runner.submit(commands);
|
||||
let dst = memory.map_read(..);
|
||||
if let Some(failure) = data.verify(&dst) {
|
||||
|
|
|
@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
|
|||
let n_iter = config.n_iter;
|
||||
for i in 0..n_iter {
|
||||
let mut commands = runner.commands();
|
||||
commands.write_timestamp(0);
|
||||
stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
|
||||
commands.write_timestamp(1);
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
stage.record(&mut pass, &code, &binding, n_tag);
|
||||
pass.end();
|
||||
if i == 0 || config.verify_all {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&memory);
|
||||
|
|
|
@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
|
|||
for i in 0..n_iter {
|
||||
let mut commands = runner.commands();
|
||||
// Might clear only buckets to save time.
|
||||
commands.write_timestamp(0);
|
||||
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
|
||||
commands.write_timestamp(1);
|
||||
if i == 0 || config.verify_all {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&mem_buf);
|
||||
|
@ -107,12 +105,14 @@ impl LinkedListStage {
|
|||
commands.cmd_buf.clear_buffer(out_buf, None);
|
||||
commands.cmd_buf.memory_barrier();
|
||||
let n_workgroups = N_BUCKETS / WG_SIZE;
|
||||
commands.cmd_buf.dispatch(
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
pass.dispatch(
|
||||
&code.pipeline,
|
||||
&bindings.descriptor_set,
|
||||
(n_workgroups as u32, 1, 1),
|
||||
(WG_SIZE as u32, 1, 1),
|
||||
);
|
||||
pass.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test(
|
|||
let mut failures = 0;
|
||||
for _ in 0..n_iter {
|
||||
let mut commands = runner.commands();
|
||||
commands.write_timestamp(0);
|
||||
stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
|
||||
commands.write_timestamp(1);
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&out_buf);
|
||||
total_elapsed += runner.submit(commands);
|
||||
|
@ -128,11 +126,13 @@ impl MessagePassingStage {
|
|||
commands.cmd_buf.clear_buffer(&self.data_buf, None);
|
||||
commands.cmd_buf.clear_buffer(out_buf, None);
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.cmd_buf.dispatch(
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
pass.dispatch(
|
||||
&code.pipeline,
|
||||
&bindings.descriptor_set,
|
||||
(256, 1, 1),
|
||||
(256, 1, 1),
|
||||
);
|
||||
pass.end();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
|
|||
let mut commands = runner.commands();
|
||||
commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.write_timestamp(0);
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
stage.record(
|
||||
&mut commands.cmd_buf,
|
||||
&mut pass,
|
||||
&code,
|
||||
&binding,
|
||||
path_data.n_path,
|
||||
path_data.tags.len() as u32,
|
||||
);
|
||||
commands.write_timestamp(1);
|
||||
pass.end();
|
||||
if i == 0 || config.verify_all {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&memory);
|
||||
|
|
|
@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test(
|
|||
let mut total_elapsed = 0.0;
|
||||
for i in 0..n_iter {
|
||||
let mut commands = runner.commands();
|
||||
commands.write_timestamp(0);
|
||||
stage.record(&mut commands, &code, &binding);
|
||||
commands.write_timestamp(1);
|
||||
if i == 0 || config.verify_all {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&out_buf);
|
||||
|
@ -159,12 +157,14 @@ impl PrefixStage {
|
|||
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
|
||||
commands.cmd_buf.clear_buffer(&self.state_buf, None);
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.cmd_buf.dispatch(
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
pass.dispatch(
|
||||
&code.pipeline,
|
||||
&bindings.descriptor_set,
|
||||
(n_workgroups as u32, 1, 1),
|
||||
(WG_SIZE as u32, 1, 1),
|
||||
);
|
||||
pass.end();
|
||||
// One thing that's missing here is registering the buffers so
|
||||
// they can be safely dropped by Rust code before the execution
|
||||
// of the command buffer completes.
|
||||
|
|
|
@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
|
|||
let mut commands = runner.commands();
|
||||
commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.write_timestamp(0);
|
||||
stage.record(&mut commands, &code, &binding);
|
||||
commands.write_timestamp(1);
|
||||
if i == 0 || config.verify_all {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&out_buf);
|
||||
|
@ -175,33 +173,35 @@ impl PrefixTreeStage {
|
|||
code: &PrefixTreeCode,
|
||||
bindings: &PrefixTreeBinding,
|
||||
) {
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
let n = self.tmp_bufs.len();
|
||||
for i in 0..n {
|
||||
let n_workgroups = self.sizes[i + 1];
|
||||
commands.cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.reduce_pipeline,
|
||||
&bindings.descriptor_sets[i],
|
||||
(n_workgroups as u32, 1, 1),
|
||||
(WG_SIZE as u32, 1, 1),
|
||||
);
|
||||
commands.cmd_buf.memory_barrier();
|
||||
pass.memory_barrier();
|
||||
}
|
||||
commands.cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.root_pipeline,
|
||||
&bindings.descriptor_sets[n],
|
||||
(1, 1, 1),
|
||||
(WG_SIZE as u32, 1, 1),
|
||||
);
|
||||
for i in (0..n).rev() {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
pass.memory_barrier();
|
||||
let n_workgroups = self.sizes[i + 1];
|
||||
commands.cmd_buf.dispatch(
|
||||
pass.dispatch(
|
||||
&code.scan_pipeline,
|
||||
&bindings.descriptor_sets[2 * n - i],
|
||||
(n_workgroups as u32, 1, 1),
|
||||
(WG_SIZE as u32, 1, 1),
|
||||
);
|
||||
}
|
||||
pass.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,8 +20,8 @@ use std::ops::RangeBounds;
|
|||
|
||||
use bytemuck::Pod;
|
||||
use piet_gpu_hal::{
|
||||
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
|
||||
QueryPool, Session,
|
||||
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
|
||||
ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
|
||||
};
|
||||
|
||||
pub struct Runner {
|
||||
|
@ -118,8 +118,14 @@ impl Runner {
|
|||
}
|
||||
|
||||
impl Commands {
|
||||
pub unsafe fn write_timestamp(&mut self, query: u32) {
|
||||
self.cmd_buf.write_timestamp(&self.query_pool, query);
|
||||
/// Start a compute pass with timer queries.
|
||||
pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
|
||||
self.cmd_buf
|
||||
.begin_compute_pass(&ComputePassDescriptor::timer(
|
||||
&self.query_pool,
|
||||
start_query,
|
||||
end_query,
|
||||
))
|
||||
}
|
||||
|
||||
pub unsafe fn upload(&mut self, buf: &BufStage) {
|
||||
|
|
|
@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult
|
|||
let n_iter = config.n_iter;
|
||||
for i in 0..n_iter {
|
||||
let mut commands = runner.commands();
|
||||
commands.write_timestamp(0);
|
||||
stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
|
||||
commands.write_timestamp(1);
|
||||
let mut pass = commands.compute_pass(0, 1);
|
||||
stage.record(&mut pass, &code, &binding, n_elements);
|
||||
pass.end();
|
||||
if i == 0 || config.verify_all {
|
||||
commands.cmd_buf.memory_barrier();
|
||||
commands.download(&memory);
|
||||
|
|
Loading…
Reference in a new issue