From 290d5d2e132bbe657a45b04bc8b4eecd43024cab Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 6 Jan 2022 10:02:07 -0800 Subject: [PATCH] More progress This puts most of the infrastructure in place but I'm hitting an error that "sampleCountersInBuffer is not supported on this device". The issue is that M1 supports stage boundaries and not command boundaries. We'll have to rework the logic a bit. (And, in the most general case, support both) Start implementing stage boundaries, but it will probably require an API change. --- Cargo.lock | 1 + piet-gpu-hal/Cargo.toml | 1 + piet-gpu-hal/src/backend.rs | 17 ++++ piet-gpu-hal/src/metal.rs | 133 ++++++++++++++++++++++++++++---- piet-gpu-hal/src/metal/timer.rs | 35 +++++++-- piet-gpu-hal/src/mux.rs | 8 ++ 6 files changed, 177 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 737c033..66c793b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -921,6 +921,7 @@ dependencies = [ "block", "bytemuck", "cocoa-foundation", + "foreign-types", "metal", "objc", "raw-window-handle", diff --git a/piet-gpu-hal/Cargo.toml b/piet-gpu-hal/Cargo.toml index 29b51bd..f9b844a 100644 --- a/piet-gpu-hal/Cargo.toml +++ b/piet-gpu-hal/Cargo.toml @@ -28,3 +28,4 @@ metal = "0.22" objc = "0.2.5" block = "0.1.6" cocoa-foundation = "0.1" +foreign-types = "0.3.2" diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs index 5715d62..c1b2132 100644 --- a/piet-gpu-hal/src/backend.rs +++ b/piet-gpu-hal/src/backend.rs @@ -160,6 +160,8 @@ pub trait Device: Sized { } pub trait CmdBuf { + type ComputeEncoder; + unsafe fn begin(&mut self); unsafe fn finish(&mut self); @@ -231,6 +233,8 @@ pub trait CmdBuf { /// End a section opened by `begin_debug_label`. unsafe fn end_debug_label(&mut self) {} + + unsafe fn new_compute_encoder(&mut self) -> Self::ComputeEncoder; } /// A builder for descriptor sets with more complex layouts. @@ -252,3 +256,16 @@ pub trait DescriptorSetBuilder { fn add_textures(&mut self, images: &[&D::Image]); unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result; } + +pub trait ComputeEncoder { + unsafe fn dispatch( + &mut self, + pipeline: &D::Pipeline, + descriptor_set: &D::DescriptorSet, + workgroup_count: (u32, u32, u32), + workgroup_size: (u32, u32, u32), + ); + + // Question: should be self? + unsafe fn finish(&mut self); +} diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs index c96f971..23cc256 100644 --- a/piet-gpu-hal/src/metal.rs +++ b/piet-gpu-hal/src/metal.rs @@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex}; use block::Block; use cocoa_foundation::base::id; use cocoa_foundation::foundation::{NSInteger, NSUInteger}; +use foreign_types::ForeignType; use objc::rc::autoreleasepool; use objc::runtime::{Object, BOOL, YES}; use objc::{class, msg_send, sel, sel_impl}; @@ -36,6 +37,8 @@ use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits}; use util::*; +use self::timer::{CounterSampleBuffer, CounterSet}; + pub struct MtlInstance; pub struct MtlDevice { @@ -43,6 +46,18 @@ pub struct MtlDevice { cmd_queue: Arc>, gpu_info: GpuInfo, helpers: Arc, + timer_set: Option, + counter_style: CounterStyle, +} + +/// Type of counter sampling. +/// +/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers +#[derive(Clone, Copy, PartialEq, Eq)] +enum CounterStyle { + None, + Stage, + Command, } pub struct MtlSurface { @@ -85,6 +100,7 @@ pub struct CmdBuf { helpers: Arc, cur_encoder: Encoder, time_calibration: Arc>, + counter_style: CounterStyle, } enum Encoder { @@ -101,7 +117,7 @@ struct TimeCalibration { gpu_end_ts: u64, } -pub struct QueryPool; +pub struct QueryPool(Option); pub struct Pipeline(metal::ComputePipelineState); @@ -118,6 +134,10 @@ struct Helpers { clear_pipeline: metal::ComputePipelineState, } +pub struct ComputeEncoder { + raw: metal::ComputeCommandEncoder, +} + impl MtlInstance { pub fn new( window_handle: Option<&dyn HasRawWindowHandle>, @@ -228,15 +248,22 @@ impl MtlDevice { clear_pipeline: clear::make_clear_pipeline(&device), }); // Timer stuff - if let Some(timer_set) = timer::CounterSet::get_timer_counter_set(&device) { - let timer = timer::CounterSampleBuffer::new(&device, 4, &timer_set); - } + let timer_set = CounterSet::get_timer_counter_set(&device); + let counter_style = if timer_set.is_some() { + // TODO: M1 is stage style, but should do proper runtime detection. + CounterStyle::Stage + } else { + CounterStyle::None + }; + MtlDevice { device, cmd_queue: Arc::new(Mutex::new(cmd_queue)), gpu_info, helpers, - } + timer_set, + counter_style, + } } pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf { @@ -244,7 +271,13 @@ impl MtlDevice { let helpers = self.helpers.clone(); let cur_encoder = Encoder::None; let time_calibration = Default::default(); - CmdBuf { cmd_buf, helpers, cur_encoder, time_calibration } + CmdBuf { + cmd_buf, + helpers, + cur_encoder, + time_calibration, + counter_style: self.counter_style, + } } pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image { @@ -364,6 +397,7 @@ impl crate::backend::Device for MtlDevice { helpers, cur_encoder, time_calibration, + counter_style: self.counter_style, }) } @@ -372,12 +406,19 @@ impl crate::backend::Device for MtlDevice { } fn create_query_pool(&self, n_queries: u32) -> Result { - // TODO - Ok(QueryPool) + if let Some(timer_set) = &self.timer_set { + let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set) + .ok_or("error creating timer query pool")?; + return Ok(QueryPool(Some(pool))); + } + Ok(QueryPool(None)) } unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error> { - // TODO + if let Some(raw) = &pool.0 { + let resolved = raw.resolve(); + println!("resolved = {:?}", resolved); + } Ok(Vec::new()) } @@ -505,6 +546,8 @@ impl crate::backend::Device for MtlDevice { } impl crate::backend::CmdBuf for CmdBuf { + type ComputeEncoder = ComputeEncoder; + unsafe fn begin(&mut self) {} unsafe fn finish(&mut self) { @@ -647,12 +690,39 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {} + unsafe fn reset_query_pool(&mut self, _pool: &QueryPool) {} unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { - // TODO - // This really a PITA because it's pretty different than Vulkan. - // See https://developer.apple.com/documentation/metal/counter_sampling + if let Some(buf) = &pool.0 { + if matches!(self.cur_encoder, Encoder::None) { + self.cur_encoder = + Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned()); + } + let sample_index = query as NSUInteger; + if self.counter_style == CounterStyle::Command { + match &self.cur_encoder { + Encoder::Compute(e) => { + let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true]; + } + Encoder::None => unreachable!(), + _ => todo!(), + } + } else if self.counter_style == CounterStyle::Stage { + match &self.cur_encoder { + Encoder::Compute(e) => { + println!("here we are"); + } + _ => (), + } + } + } + } + + unsafe fn new_compute_encoder(&mut self) -> Self::ComputeEncoder { + let raw = self.cmd_buf.new_compute_command_encoder().to_owned(); + ComputeEncoder { + raw + } } } @@ -691,6 +761,43 @@ impl CmdBuf { } } +impl crate::backend::ComputeEncoder for ComputeEncoder { + unsafe fn dispatch( + &mut self, + pipeline: &Pipeline, + descriptor_set: &DescriptorSet, + workgroup_count: (u32, u32, u32), + workgroup_size: (u32, u32, u32), + ) { + self.raw.set_compute_pipeline_state(&pipeline.0); + let mut buf_ix = 0; + for buffer in &descriptor_set.buffers { + self.raw.set_buffer(buf_ix, Some(&buffer.buffer), 0); + buf_ix += 1; + } + let mut img_ix = buf_ix; + for image in &descriptor_set.images { + self.raw.set_texture(img_ix, Some(&image.texture)); + img_ix += 1; + } + let workgroup_count = metal::MTLSize { + width: workgroup_count.0 as u64, + height: workgroup_count.1 as u64, + depth: workgroup_count.2 as u64, + }; + let workgroup_size = metal::MTLSize { + width: workgroup_size.0 as u64, + height: workgroup_size.1 as u64, + depth: workgroup_size.2 as u64, + }; + self.raw.dispatch_thread_groups(workgroup_count, workgroup_size); + } + + unsafe fn finish(&mut self) { + self.raw.end_encoding(); + } +} + impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { fn add_buffers(&mut self, buffers: &[&Buffer]) { self.0.buffers.extend(buffers.iter().copied().cloned()); diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs index 5830fee..a51bc6d 100644 --- a/piet-gpu-hal/src/metal/timer.rs +++ b/piet-gpu-hal/src/metal/timer.rs @@ -20,12 +20,16 @@ use std::{ffi::CStr, ptr::null_mut}; -use cocoa_foundation::{base::id, foundation::NSUInteger}; -use metal::DeviceRef; +use cocoa_foundation::{ + base::id, + foundation::{NSRange, NSUInteger}, +}; +use metal::{DeviceRef, MTLStorageMode}; use objc::{class, msg_send, sel, sel_impl}; pub struct CounterSampleBuffer { id: id, + count: u64, } pub struct CounterSet { @@ -43,11 +47,18 @@ impl Clone for CounterSampleBuffer { unsafe { CounterSampleBuffer { id: msg_send![self.id, retain], + count: self.count, } } } } +impl CounterSampleBuffer { + pub fn id(&self) -> id { + self.id + } +} + impl CounterSet { pub fn get_timer_counter_set(device: &DeviceRef) -> Option { unsafe { @@ -81,6 +92,10 @@ impl CounterSampleBuffer { let count = count as NSUInteger; let () = msg_send![descriptor, setSampleCount: count]; let () = msg_send![descriptor, setCounterSet: counter_set.id]; + let () = msg_send![ + descriptor, + setStorageMode: MTLStorageMode::Shared as NSUInteger + ]; let mut error: id = null_mut(); let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error]; let () = msg_send![descriptor, release]; @@ -88,11 +103,21 @@ impl CounterSampleBuffer { let () = msg_send![error, release]; return None; } - Some(CounterSampleBuffer { id: buf }) + Some(CounterSampleBuffer { id: buf, count }) } } - pub fn id(&self) -> id { - self.id + // Read the timestamps. + // + // Safety: the lifetime of the returned slice is wrong, it's actually autoreleased. + pub unsafe fn resolve(&self) -> &[u64] { + let range = NSRange::new(0, self.count); + let data: id = msg_send![self.id, resolveCounterRange: range]; + if data.is_null() { + &[] + } else { + let bytes: *const u64 = msg_send![data, bytes]; + std::slice::from_raw_parts(bytes, self.count as usize) + } } } diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs index af1702d..7853c2b 100644 --- a/piet-gpu-hal/src/mux.rs +++ b/piet-gpu-hal/src/mux.rs @@ -100,6 +100,14 @@ mux_device_enum! { QueryPool } mux_device_enum! { Sampler } +mux_enum! { + pub enum ComputeEncoder { + Vk(>::ComputeEncoder), + Dx12(>::ComputeEncoder), + Mtl(>::ComputeEncoder), + } +} + /// The code for a shader, either as source or intermediate representation. pub enum ShaderCode<'a> { /// SPIR-V (binary intermediate representation)