diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 18f6390..a1073f4 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -21,8 +21,8 @@ pub use crate::mux::{
 };
 pub use bufwrite::BufWrite;
 pub use hub::{
-    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
-    Session, SubmittedCmdBuf,
+    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
+    RetainResource, Session, SubmittedCmdBuf,
 };
 
 // TODO: because these are conditionally included, "cargo fmt" does not
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index c907d77..9b4468c 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -381,8 +381,25 @@ impl crate::backend::Device for MtlDevice {
 
     fn create_cmd_buf(&self) -> Result<CmdBuf, Error> {
         let cmd_queue = self.cmd_queue.lock().unwrap();
+        // A discussion about autorelease pools.
+        //
+        // Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
+        // you can have any two of correctness, ergonomics, and performance. Here we've
+        // chosen the first two, using the pattern of a fine grained autorelease pool
+        // to give the Obj-C object Rust-like lifetime semantics whenever objects are
+        // created as autorelease (by convention, this is any object creation with an
+        // Obj-C method name that doesn't begin with "new" or "alloc").
+        //
+        // To gain back some of the performance, we'd need a way to wrap an autorelease
+        // pool over a chunk of work - that could be one frame of rendering, but for
+        // tests that iterate a number of command buffer submissions, it would need to
+        // be around that. On non-mac platforms, it would be a no-op.
+        //
+        // In any case, this way, the caller doesn't need to worry, and the performance
+        // hit might not be so bad (perhaps we should measure).
+
         // consider new_command_buffer_with_unretained_references for performance
-        let cmd_buf = cmd_queue.new_command_buffer().to_owned();
+        let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
         let helpers = self.helpers.clone();
         let cur_encoder = Encoder::None;
         let time_calibration = Default::default();
@@ -555,32 +572,38 @@ impl crate::backend::CmdBuf for CmdBuf {
     }
 
     unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
-        debug_assert!(matches!(self.cur_encoder, Encoder::None));
-        let encoder = if let Some(queries) = &desc.timer_queries {
-            let descriptor: id = msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
-            let attachments: id = msg_send![descriptor, sampleBufferAttachments];
-            let index: NSUInteger = 0;
-            let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
-            // Here we break the hub/mux separation a bit, for expedience
-            #[allow(irrefutable_let_patterns)]
-            if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
-                if let Some(sample_buf) = &query_pool.counter_sample_buf {
-                    let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+        // TODO: we might want to get better about validation but the following
+        // assert is likely to trigger, and also a case can be made that
+        // validation should be done at the hub level, for consistency.
+        //debug_assert!(matches!(self.cur_encoder, Encoder::None));
+        self.flush_encoder();
+        autoreleasepool(|| {
+            let encoder = if let Some(queries) = &desc.timer_queries {
+                let descriptor: id =
+                    msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+                let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+                let index: NSUInteger = 0;
+                let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+                // Here we break the hub/mux separation a bit, for expedience
+                #[allow(irrefutable_let_patterns)]
+                if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                    if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                        let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                    }
                 }
-            }
-            let start_index = queries.1 as NSUInteger;
-            let end_index = queries.2 as NSInteger;
-            let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
-            let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
-            let encoder = msg_send![
-                self.cmd_buf,
-                computeCommandEncoderWithDescriptor: descriptor
-            ];
-            encoder
-        } else {
-            self.cmd_buf.new_compute_command_encoder()
-        };
-        self.cur_encoder = Encoder::Compute(encoder.to_owned());
+                let start_index = queries.1 as NSUInteger;
+                let end_index = queries.2 as NSInteger;
+                let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+                let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+                msg_send![
+                    self.cmd_buf,
+                    computeCommandEncoderWithDescriptor: descriptor
+                ]
+            } else {
+                self.cmd_buf.new_compute_command_encoder()
+            };
+            self.cur_encoder = Encoder::Compute(encoder.to_owned());
+        });
     }
 
     unsafe fn dispatch(
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
index a8b80d6..65c8026 100644
--- a/piet-gpu-hal/src/metal/timer.rs
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -67,6 +67,12 @@ impl CounterSampleBuffer {
     }
 }
 
+impl Drop for CounterSet {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
 impl CounterSet {
     pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
         unsafe {
@@ -86,6 +92,19 @@
     }
 }
 
+// copied from metal-rs; should be in common utilities maybe?
+fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str { + let bytes = unsafe { + let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String]; + bytes as *const u8 + }; + let len: NSUInteger = unsafe { msg_send![nsstr, length] }; + unsafe { + let bytes = std::slice::from_raw_parts(bytes, len as usize); + std::str::from_utf8(bytes).unwrap() + } +} + impl CounterSampleBuffer { pub fn new( device: &DeviceRef, @@ -107,6 +126,11 @@ impl CounterSampleBuffer { let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error]; let () = msg_send![descriptor, release]; if !error.is_null() { + let description = msg_send![error, localizedDescription]; + println!( + "error allocating sample buffer, code = {}", + nsstring_as_str(description) + ); let () = msg_send![error, release]; return None; } @@ -138,7 +162,7 @@ impl TimeCalibration { let delta_gpu = self.gpu_end_ts - self.gpu_start_ts; let adj_ts = if delta_gpu > 0 { let scale = delta_cpu as f64 / delta_gpu as f64; - self.cpu_start_ts as f64 + (raw_ts - self.gpu_start_ts) as f64 * scale + self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale } else { // Default is ns on Apple Silicon; on other hardware this will be wrong raw_ts as f64 diff --git a/tests/src/clear.rs b/tests/src/clear.rs index fc6f063..af4b8ea 100644 --- a/tests/src/clear.rs +++ b/tests/src/clear.rs @@ -16,11 +16,11 @@ //! Utilities (and a benchmark) for clearing buffers with compute shaders. -use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet}; +use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet}; use piet_gpu_hal::{Buffer, Pipeline}; use crate::config::Config; -use crate::runner::{Commands, Runner}; +use crate::runner::Runner; use crate::test_result::TestResult; const WG_SIZE: u64 = 256; @@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult let mut total_elapsed = 0.0; for i in 0..n_iter { let mut commands = runner.commands(); - commands.write_timestamp(0); - stage.record(&mut commands, &code, &binding); - commands.write_timestamp(1); + let mut pass = commands.compute_pass(0, 1); + stage.record(&mut pass, &code, &binding); + pass.end(); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); @@ -108,17 +108,12 @@ impl ClearStage { ClearBinding { descriptor_set } } - pub unsafe fn record( - &self, - commands: &mut Commands, - code: &ClearCode, - bindings: &ClearBinding, - ) { + pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) { let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE; // An issue: for clearing large buffers (>16M), we need to check the // number of workgroups against the (dynamically detected) limit, and // potentially issue multiple dispatches. 
- commands.cmd_buf.dispatch( + pass.dispatch( &code.pipeline, &bindings.descriptor_set, (n_workgroups as u32, 1, 1), diff --git a/tests/src/runner.rs b/tests/src/runner.rs index 1fd6774..f97e15a 100644 --- a/tests/src/runner.rs +++ b/tests/src/runner.rs @@ -20,8 +20,8 @@ use std::ops::RangeBounds; use bytemuck::Pod; use piet_gpu_hal::{ - BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, - QueryPool, Session, + BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass, + ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session, }; pub struct Runner { @@ -118,10 +118,21 @@ impl Runner { } impl Commands { + #[deprecated(note = "use compute_pass instead")] pub unsafe fn write_timestamp(&mut self, query: u32) { self.cmd_buf.write_timestamp(&self.query_pool, query); } + /// Start a compute pass with timer queries. + pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass { + self.cmd_buf + .begin_compute_pass(&ComputePassDescriptor::timer( + &self.query_pool, + start_query, + end_query, + )) + } + pub unsafe fn upload(&mut self, buf: &BufStage) { self.cmd_buf.copy_buffer(&buf.stage_buf, &buf.dev_buf); }
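
Note on the autorelease discussion in create_cmd_buf: the comment's idea of wrapping a pool
around a whole chunk of work (one frame, or a full benchmark loop) could look roughly like
the sketch below. This is not part of the patch; with_autorelease_scope is a hypothetical
name, and the sketch assumes the objc crate's autoreleasepool helper that the patch itself
already calls.

    // Hypothetical coarse-grained wrapper; not an API introduced by this patch.
    #[cfg(target_os = "macos")]
    pub fn with_autorelease_scope<T>(f: impl FnOnce() -> T) -> T {
        // objc::rc::autoreleasepool drains the pool when `f` returns, giving
        // autoreleased Obj-C objects created inside a Rust-like scope.
        objc::rc::autoreleasepool(f)
    }

    #[cfg(not(target_os = "macos"))]
    pub fn with_autorelease_scope<T>(f: impl FnOnce() -> T) -> T {
        // No-op on platforms without Obj-C, as the comment suggests.
        f()
    }

A caller could then wrap one frame of rendering, or an entire test loop of command buffer
submissions, recovering most of the per-object overhead while keeping correctness.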
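
The one-line TimeCalibration change is a bug fix, not a cleanup: in
(raw_ts - self.gpu_start_ts) as f64, the subtraction happens in u64 first, so a sample whose
raw GPU timestamp precedes gpu_start_ts wraps around (or panics in a debug build). Casting
each operand to f64 before subtracting keeps the difference signed. A small demonstration
with invented values:

    fn main() {
        let gpu_start_ts: u64 = 1_000;
        let raw_ts: u64 = 900; // sample taken just before calibration start

        // Old expression: the u64 subtraction wraps to a huge value
        // (and would panic in a debug build).
        let wrapped = raw_ts.wrapping_sub(gpu_start_ts) as f64;

        // New expression: subtract in f64, so the result is a sensible -100.
        let signed = raw_ts as f64 - gpu_start_ts as f64;

        assert!(wrapped > 1.8e19);
        assert_eq!(signed, -100.0);
    }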
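
Taken together, the runner.rs and clear.rs hunks define the migration pattern for the other
tests: a write_timestamp pair bracketing a raw dispatch becomes a timer-query compute pass.
Schematically, reusing the names from run_clear_test above (an excerpt, not a self-contained
program):

    // Before: explicit timestamps around a dispatch recorded on the raw cmd_buf.
    // commands.write_timestamp(0);
    // stage.record(&mut commands, &code, &binding);
    // commands.write_timestamp(1);

    // After: the pass carries the timer queries (start query 0, end query 1),
    // and end() closes the encoder before the barrier/download steps.
    let mut pass = commands.compute_pass(0, 1);
    stage.record(&mut pass, &code, &binding);
    pass.end();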