diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 18f6390..a1073f4 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -21,8 +21,8 @@ pub use crate::mux::{
 };
 pub use bufwrite::BufWrite;
 pub use hub::{
-    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
-    Session, SubmittedCmdBuf,
+    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
+    RetainResource, Session, SubmittedCmdBuf,
 };
 
 // TODO: because these are conditionally included, "cargo fmt" does not
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index c907d77..9b4468c 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -381,8 +381,25 @@ impl crate::backend::Device for MtlDevice {
 
     fn create_cmd_buf(&self) -> Result<CmdBuf, Error> {
         let cmd_queue = self.cmd_queue.lock().unwrap();
+        // A discussion about autorelease pools.
+        //
+        // Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
+        // you can have any two of correctness, ergonomics, and performance. Here we've
+        // chosen the first two, using the pattern of a fine grained autorelease pool
+        // to give the Obj-C object Rust-like lifetime semantics whenever objects are
+        // created as autorelease (by convention, this is any object creation with an
+        // Obj-C method name that doesn't begin with "new" or "alloc").
+        //
+        // To gain back some of the performance, we'd need a way to wrap an autorelease
+        // pool over a chunk of work - that could be one frame of rendering, but for
+        // tests that iterate a number of command buffer submissions, it would need to
+        // be around that. On non-mac platforms, it would be a no-op.
+        //
+        // In any case, this way, the caller doesn't need to worry, and the performance
+        // hit might not be so bad (perhaps we should measure).
+
         // consider new_command_buffer_with_unretained_references for performance
-        let cmd_buf = cmd_queue.new_command_buffer().to_owned();
+        let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
         let helpers = self.helpers.clone();
         let cur_encoder = Encoder::None;
         let time_calibration = Default::default();
@@ -555,32 +572,38 @@ impl crate::backend::CmdBuf for CmdBuf {
     }
 
     unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
-        debug_assert!(matches!(self.cur_encoder, Encoder::None));
-        let encoder = if let Some(queries) = &desc.timer_queries {
-            let descriptor: id = msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
-            let attachments: id = msg_send![descriptor, sampleBufferAttachments];
-            let index: NSUInteger = 0;
-            let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
-            // Here we break the hub/mux separation a bit, for expedience
-            #[allow(irrefutable_let_patterns)]
-            if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
-                if let Some(sample_buf) = &query_pool.counter_sample_buf {
-                    let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+        // TODO: we might want to get better about validation but the following
+        // assert is likely to trigger, and also a case can be made that
+        // validation should be done at the hub level, for consistency.
+        //debug_assert!(matches!(self.cur_encoder, Encoder::None));
+        self.flush_encoder();
+        autoreleasepool(|| {
+            let encoder = if let Some(queries) = &desc.timer_queries {
+                let descriptor: id =
+                    msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+                let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+                let index: NSUInteger = 0;
+                let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+                // Here we break the hub/mux separation a bit, for expedience
+                #[allow(irrefutable_let_patterns)]
+                if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                    if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                        let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                    }
                 }
-            }
-            let start_index = queries.1 as NSUInteger;
-            let end_index = queries.2 as NSInteger;
-            let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
-            let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
-            let encoder = msg_send![
-                self.cmd_buf,
-                computeCommandEncoderWithDescriptor: descriptor
-            ];
-            encoder
-        } else {
-            self.cmd_buf.new_compute_command_encoder()
-        };
-        self.cur_encoder = Encoder::Compute(encoder.to_owned());
+                let start_index = queries.1 as NSUInteger;
+                let end_index = queries.2 as NSInteger;
+                let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+                let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+                msg_send![
+                    self.cmd_buf,
+                    computeCommandEncoderWithDescriptor: descriptor
+                ]
+            } else {
+                self.cmd_buf.new_compute_command_encoder()
+            };
+            self.cur_encoder = Encoder::Compute(encoder.to_owned());
+        });
     }
 
     unsafe fn dispatch(
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
index a8b80d6..65c8026 100644
--- a/piet-gpu-hal/src/metal/timer.rs
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -67,6 +67,12 @@ impl CounterSampleBuffer {
     }
 }
 
+impl Drop for CounterSet {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
 impl CounterSet {
     pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
         unsafe {
@@ -86,6 +92,19 @@
     }
 }
 
+// copied from metal-rs; should be in common utilities maybe?
+fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str { + let bytes = unsafe { + let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String]; + bytes as *const u8 + }; + let len: NSUInteger = unsafe { msg_send![nsstr, length] }; + unsafe { + let bytes = std::slice::from_raw_parts(bytes, len as usize); + std::str::from_utf8(bytes).unwrap() + } +} + impl CounterSampleBuffer { pub fn new( device: &DeviceRef, @@ -107,6 +126,11 @@ impl CounterSampleBuffer { let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error]; let () = msg_send![descriptor, release]; if !error.is_null() { + let description = msg_send![error, localizedDescription]; + println!( + "error allocating sample buffer, code = {}", + nsstring_as_str(description) + ); let () = msg_send![error, release]; return None; } @@ -138,7 +162,7 @@ impl TimeCalibration { let delta_gpu = self.gpu_end_ts - self.gpu_start_ts; let adj_ts = if delta_gpu > 0 { let scale = delta_cpu as f64 / delta_gpu as f64; - self.cpu_start_ts as f64 + (raw_ts - self.gpu_start_ts) as f64 * scale + self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale } else { // Default is ns on Apple Silicon; on other hardware this will be wrong raw_ts as f64 diff --git a/tests/src/clear.rs b/tests/src/clear.rs index fc6f063..af4b8ea 100644 --- a/tests/src/clear.rs +++ b/tests/src/clear.rs @@ -16,11 +16,11 @@ //! Utilities (and a benchmark) for clearing buffers with compute shaders. -use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet}; +use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet}; use piet_gpu_hal::{Buffer, Pipeline}; use crate::config::Config; -use crate::runner::{Commands, Runner}; +use crate::runner::Runner; use crate::test_result::TestResult; const WG_SIZE: u64 = 256; @@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult let mut total_elapsed = 0.0; for i in 0..n_iter { let mut commands = runner.commands(); - commands.write_timestamp(0); - stage.record(&mut commands, &code, &binding); - commands.write_timestamp(1); + let mut pass = commands.compute_pass(0, 1); + stage.record(&mut pass, &code, &binding); + pass.end(); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); @@ -108,17 +108,12 @@ impl ClearStage { ClearBinding { descriptor_set } } - pub unsafe fn record( - &self, - commands: &mut Commands, - code: &ClearCode, - bindings: &ClearBinding, - ) { + pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) { let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE; // An issue: for clearing large buffers (>16M), we need to check the // number of workgroups against the (dynamically detected) limit, and // potentially issue multiple dispatches. 
- commands.cmd_buf.dispatch( + pass.dispatch( &code.pipeline, &bindings.descriptor_set, (n_workgroups as u32, 1, 1), diff --git a/tests/src/runner.rs b/tests/src/runner.rs index 1fd6774..f97e15a 100644 --- a/tests/src/runner.rs +++ b/tests/src/runner.rs @@ -20,8 +20,8 @@ use std::ops::RangeBounds; use bytemuck::Pod; use piet_gpu_hal::{ - BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, - QueryPool, Session, + BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass, + ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session, }; pub struct Runner { @@ -118,10 +118,21 @@ impl Runner { } impl Commands { + #[deprecated(note = "use compute_pass instead")] pub unsafe fn write_timestamp(&mut self, query: u32) { self.cmd_buf.write_timestamp(&self.query_pool, query); } + /// Start a compute pass with timer queries. + pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass { + self.cmd_buf + .begin_compute_pass(&ComputePassDescriptor::timer( + &self.query_pool, + start_query, + end_query, + )) + } + pub unsafe fn upload(&mut self, buf: &BufStage) { self.cmd_buf.copy_buffer(&buf.stage_buf, &buf.dev_buf); }
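
Note on the autorelease discussion in create_cmd_buf: the comment's idea of wrapping a pool
around a whole chunk of work (one frame, or a full benchmark loop) could look roughly like
the sketch below. This is not part of the patch; with_autorelease_scope is a hypothetical
name, and the sketch assumes the objc crate's autoreleasepool helper that the patch itself
already calls.

    // Hypothetical coarse-grained wrapper; not an API introduced by this patch.
    #[cfg(target_os = "macos")]
    pub fn with_autorelease_scope<T>(f: impl FnOnce() -> T) -> T {
        // objc::rc::autoreleasepool drains the pool when `f` returns, giving
        // autoreleased Obj-C objects created inside a Rust-like scope.
        objc::rc::autoreleasepool(f)
    }

    #[cfg(not(target_os = "macos"))]
    pub fn with_autorelease_scope<T>(f: impl FnOnce() -> T) -> T {
        // No-op on platforms without Obj-C, as the comment suggests.
        f()
    }

A caller could then wrap one frame of rendering, or an entire test loop of command buffer
submissions, recovering most of the per-object overhead while keeping correctness.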
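
The one-line TimeCalibration change is a bug fix, not a cleanup: in
(raw_ts - self.gpu_start_ts) as f64, the subtraction happens in u64 first, so a sample whose
raw GPU timestamp precedes gpu_start_ts wraps around (or panics in a debug build). Casting
each operand to f64 before subtracting keeps the difference signed. A small demonstration
with invented values:

    fn main() {
        let gpu_start_ts: u64 = 1_000;
        let raw_ts: u64 = 900; // sample taken just before calibration start

        // Old expression: the u64 subtraction wraps to a huge value
        // (and would panic in a debug build).
        let wrapped = raw_ts.wrapping_sub(gpu_start_ts) as f64;

        // New expression: subtract in f64, so the result is a sensible -100.
        let signed = raw_ts as f64 - gpu_start_ts as f64;

        assert!(wrapped > 1.8e19);
        assert_eq!(signed, -100.0);
    }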
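
Taken together, the runner.rs and clear.rs hunks define the migration pattern for the other
tests: a write_timestamp pair bracketing a raw dispatch becomes a timer-query compute pass.
Schematically, reusing the names from run_clear_test above (an excerpt, not a self-contained
program):

    // Before: explicit timestamps around a dispatch recorded on the raw cmd_buf.
    // commands.write_timestamp(0);
    // stage.record(&mut commands, &code, &binding);
    // commands.write_timestamp(1);

    // After: the pass carries the timer queries (start query 0, end query 1),
    // and end() closes the encoder before the barrier/download steps.
    let mut pass = commands.compute_pass(0, 1);
    stage.record(&mut pass, &code, &binding);
    pass.end();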