From 4458e36efa9fe3520480ad017172b347c0b472aa Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Thu, 18 Nov 2021 19:38:31 -0800
Subject: [PATCH 01/12] Start implementing timer queries in Metal

---
 piet-gpu-hal/src/backend.rs     |  10 +--
 piet-gpu-hal/src/metal.rs       | 150 +++++++++++++++++++++++++++-----
 piet-gpu-hal/src/metal/timer.rs |  98 +++++++++++++++++++++
 3 files changed, 229 insertions(+), 29 deletions(-)
 create mode 100644 piet-gpu-hal/src/metal/timer.rs
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index 02ac7cb..5715d62 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -202,16 +202,16 @@ pub trait CmdBuf<D: Device> {
     /// This is readily supported in Vulkan, but for portability it is remarkably
     /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
     /// kernel, or organize the code not to need it.
-    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
+    unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
 
-    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
+    unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
 
-    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
+    unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
 
-    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
+    unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
 
     // low portability, dx12 doesn't support it natively
-    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
+    unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
 
     /// Reset the query pool.
     ///
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index e3157d4..c96f971 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -15,18 +15,20 @@
 // Also licensed under MIT license, at your choice.
 
 mod clear;
+mod timer;
 mod util;
 
 use std::mem;
 use std::sync::{Arc, Mutex};
 
+use block::Block;
 use cocoa_foundation::base::id;
 use cocoa_foundation::foundation::{NSInteger, NSUInteger};
 use objc::rc::autoreleasepool;
 use objc::runtime::{Object, BOOL, YES};
 use objc::{class, msg_send, sel, sel_impl};
 
-use metal::{CGFloat, MTLFeatureSet};
+use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
 
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
@@ -81,6 +83,22 @@ pub struct Semaphore;
 pub struct CmdBuf {
     cmd_buf: metal::CommandBuffer,
     helpers: Arc<Helpers>,
+    cur_encoder: Encoder,
+    time_calibration: Arc<Mutex<TimeCalibration>>,
+}
+
+enum Encoder {
+    None,
+    Compute(metal::ComputeCommandEncoder),
+    Blit(metal::BlitCommandEncoder),
+}
+
+#[derive(Default)]
+struct TimeCalibration {
+    cpu_start_ts: u64,
+    gpu_start_ts: u64,
+    cpu_end_ts: u64,
+    gpu_end_ts: u64,
 }
 
 pub struct QueryPool;
@@ -209,6 +227,10 @@ impl MtlDevice {
         let helpers = Arc::new(Helpers {
             clear_pipeline: clear::make_clear_pipeline(&device),
         });
+        // Timer stuff
+        if let Some(timer_set) = timer::CounterSet::get_timer_counter_set(&device) {
+            let timer = timer::CounterSampleBuffer::new(&device, 4, &timer_set);
+        }
         MtlDevice {
             device,
             cmd_queue: Arc::new(Mutex::new(cmd_queue)),
@@ -220,7 +242,9 @@ impl MtlDevice {
     pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
         let cmd_buf = raw_cmd_buf;
         let helpers = self.helpers.clone();
-        CmdBuf { cmd_buf, helpers }
+        let cur_encoder = Encoder::None;
+        let time_calibration = Default::default();
+        CmdBuf { cmd_buf, helpers, cur_encoder, time_calibration }
     }
 
     pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@@ -331,10 +355,16 @@ impl crate::backend::Device for MtlDevice {
     fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
         let cmd_queue = self.cmd_queue.lock().unwrap();
         // consider new_command_buffer_with_unretained_references for performance
-        let cmd_buf = cmd_queue.new_command_buffer();
-        let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
+        let cmd_buf = cmd_queue.new_command_buffer().to_owned();
         let helpers = self.helpers.clone();
-        Ok(CmdBuf { cmd_buf, helpers })
+        let cur_encoder = Encoder::None;
+        let time_calibration = Default::default();
+        Ok(CmdBuf {
+            cmd_buf,
+            helpers,
+            cur_encoder,
+            time_calibration,
+        })
     }
 
     unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
@@ -358,7 +388,45 @@ impl crate::backend::Device for MtlDevice {
         _signal_semaphores: &[&Self::Semaphore],
         fence: Option<&mut Self::Fence>,
     ) -> Result<(), Error> {
+        unsafe fn add_scheduled_handler(
+            cmd_buf: &metal::CommandBufferRef,
+            block: &Block<(&CommandBufferRef,), ()>,
+        ) {
+            msg_send![cmd_buf, addScheduledHandler: block]
+        }
         for cmd_buf in cmd_bufs {
+            let time_calibration = cmd_buf.time_calibration.clone();
+            let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
+                let device: id = msg_send![buffer, device];
+                let mut time_calibration = time_calibration.lock().unwrap();
+                let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
+                let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
+                // TODO: only do this if supported.
+                let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
+                println!(
+                    "scheduled, {}, {}",
+                    time_calibration.cpu_start_ts, time_calibration.gpu_start_ts
+                );
+            })
+            .copy();
+            add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
+            let time_calibration = cmd_buf.time_calibration.clone();
+            let completed_block =
+                block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
+                    let device: id = msg_send![buffer, device];
+                    let mut time_calibration = time_calibration.lock().unwrap();
+                    let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
+                    let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
+                    // TODO: only do this if supported.
+                    let () =
+                        msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
+                    println!(
+                        "completed, {}, {}",
+                        time_calibration.cpu_end_ts, time_calibration.gpu_end_ts
+                    );
+                })
+                .copy();
+            cmd_buf.cmd_buf.add_completed_handler(&completed_block);
             cmd_buf.cmd_buf.commit();
         }
         if let Some(last_cmd_buf) = cmd_bufs.last() {
@@ -439,7 +507,9 @@ impl crate::backend::Device for MtlDevice {
 impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
     unsafe fn begin(&mut self) {}
 
-    unsafe fn finish(&mut self) {}
+    unsafe fn finish(&mut self) {
+        self.flush_encoder();
+    }
 
     unsafe fn reset(&mut self) -> bool {
         false
@@ -452,7 +522,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         workgroup_count: (u32, u32, u32),
         workgroup_size: (u32, u32, u32),
     ) {
-        let encoder = self.cmd_buf.new_compute_command_encoder();
+        let encoder = self.compute_command_encoder();
         encoder.set_compute_pipeline_state(&pipeline.0);
         let mut buf_ix = 0;
         for buffer in &descriptor_set.buffers {
@@ -475,7 +545,6 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
             depth: workgroup_size.2 as u64,
         };
         encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
-        encoder.end_encoding();
     }
 
     unsafe fn memory_barrier(&mut self) {
@@ -494,22 +563,23 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         // I think these are being tracked.
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let size = size.unwrap_or(buffer.size);
-        let encoder = self.cmd_buf.new_compute_command_encoder();
-        clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size);
-        encoder.end_encoding()
+        let _ = self.compute_command_encoder();
+        // Getting this directly is a workaround for a borrow checker issue.
+        if let Encoder::Compute(e) = &self.cur_encoder {
+            clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
+        }
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
+        let encoder = self.blit_command_encoder();
         let size = src.size.min(dst.size);
         encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
-        encoder.end_encoding();
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
+        let encoder = self.blit_command_encoder();
         assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
         let bytes_per_row = (src.width * 4) as NSUInteger;
         let src_size = metal::MTLSize {
@@ -530,11 +600,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
             bytes_per_row * src.height as NSUInteger,
             metal::MTLBlitOption::empty(),
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
+        let encoder = self.blit_command_encoder();
         assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
         let bytes_per_row = (dst.width * 4) as NSUInteger;
         let src_size = metal::MTLSize {
@@ -555,11 +624,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
             origin,
             metal::MTLBlitOption::empty(),
         );
-        encoder.end_encoding();
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
-        let encoder = self.cmd_buf.new_blit_command_encoder();
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
+        let encoder = self.blit_command_encoder();
         let src_size = metal::MTLSize {
             width: src.width.min(dst.width) as NSUInteger,
             height: src.width.min(dst.height) as NSUInteger,
@@ -577,7 +645,6 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
             0,
             origin,
         );
-        encoder.end_encoding();
     }
 
     unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
@@ -589,6 +656,41 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
     }
 }
 
+impl CmdBuf {
+    fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
+        if !matches!(self.cur_encoder, Encoder::Compute(_)) {
+            self.flush_encoder();
+            self.cur_encoder =
+                Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned());
+        }
+        if let Encoder::Compute(e) = &self.cur_encoder {
+            e
+        } else {
+            unreachable!()
+        }
+    }
+
+    fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
+        if !matches!(self.cur_encoder, Encoder::Blit(_)) {
+            self.flush_encoder();
+            self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
+        }
+        if let Encoder::Blit(e) = &self.cur_encoder {
+            e
+        } else {
+            unreachable!()
+        }
+    }
+
+    fn flush_encoder(&mut self) {
+        match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
+            Encoder::Compute(e) => e.end_encoding(),
+            Encoder::Blit(e) => e.end_encoding(),
+            Encoder::None => (),
+        }
+    }
+}
+
 impl crate::backend::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.0.buffers.extend(buffers.iter().copied().cloned());
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
new file mode 100644
index 0000000..5830fee
--- /dev/null
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -0,0 +1,98 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Support for timer queries.
+//!
+//! Likely some of this should be upstreamed into metal-rs.
+
+use std::{ffi::CStr, ptr::null_mut};
+
+use cocoa_foundation::{base::id, foundation::NSUInteger};
+use metal::DeviceRef;
+use objc::{class, msg_send, sel, sel_impl};
+
+pub struct CounterSampleBuffer {
+    id: id,
+}
+
+pub struct CounterSet {
+    id: id,
+}
+
+impl Drop for CounterSampleBuffer {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
+impl Clone for CounterSampleBuffer {
+    fn clone(&self) -> CounterSampleBuffer {
+        unsafe {
+            CounterSampleBuffer {
+                id: msg_send![self.id, retain],
+            }
+        }
+    }
+}
+
+impl CounterSet {
+    pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
+        unsafe {
+            // TODO: version check
+            let sets: id = msg_send!(device, counterSets);
+            let count: NSUInteger = msg_send![sets, count];
+            for i in 0..count {
+                let set: id = msg_send![sets, objectAtIndex: i];
+                let name: id = msg_send![set, name];
+                let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
+                if name_cstr.to_bytes() == b"timestamp" {
+                    return Some(CounterSet { id: set });
+                }
+            }
+            None
+        }
+    }
+}
+
+impl CounterSampleBuffer {
+    pub fn new(
+        device: &DeviceRef,
+        count: u64,
+        counter_set: &CounterSet,
+    ) -> Option<CounterSampleBuffer> {
+        unsafe {
+            let desc_cls = class!(MTLCounterSampleBufferDescriptor);
+            let descriptor: id = msg_send![desc_cls, alloc];
+            println!("descriptor = {:?}", descriptor);
+            let _: id = msg_send![descriptor, init];
+            let count = count as NSUInteger;
+            let () = msg_send![descriptor, setSampleCount: count];
+            let () = msg_send![descriptor, setCounterSet: counter_set.id];
+            let mut error: id = null_mut();
+            let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
+            let () = msg_send![descriptor, release];
+            if !error.is_null() {
+                let () = msg_send![error, release];
+                return None;
+            }
+            Some(CounterSampleBuffer { id: buf })
+        }
+    }
+
+    pub fn id(&self) -> id {
+        self.id
+    }
+}

From 290d5d2e132bbe657a45b04bc8b4eecd43024cab Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Thu, 6 Jan 2022 10:02:07 -0800
Subject: [PATCH 02/12] More progress

This puts most of the infrastructure in place but I'm hitting an error
that "sampleCountersInBuffer is not supported on this device".

The issue is that M1 supports stage boundaries and not command boundaries.
We'll have to rework the logic a bit. (And, in the most general case, support
both)

Start implementing stage boundaries, but it will probably require an API
change.
---
 Cargo.lock                      |   1 +
 piet-gpu-hal/Cargo.toml         |   1 +
 piet-gpu-hal/src/backend.rs     |  17 ++++
 piet-gpu-hal/src/metal.rs       | 133 ++++++++++++++++++++++++++++----
 piet-gpu-hal/src/metal/timer.rs |  35 +++++++--
 piet-gpu-hal/src/mux.rs         |   8 ++
 6 files changed, 177 insertions(+), 18 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 737c033..66c793b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -921,6 +921,7 @@ dependencies = [
  "block",
  "bytemuck",
  "cocoa-foundation",
+ "foreign-types",
  "metal",
  "objc",
  "raw-window-handle",
diff --git a/piet-gpu-hal/Cargo.toml b/piet-gpu-hal/Cargo.toml
index 29b51bd..f9b844a 100644
--- a/piet-gpu-hal/Cargo.toml
+++ b/piet-gpu-hal/Cargo.toml
@@ -28,3 +28,4 @@ metal = "0.22"
 objc = "0.2.5"
 block = "0.1.6"
 cocoa-foundation = "0.1"
+foreign-types = "0.3.2"
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index 5715d62..c1b2132 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -160,6 +160,8 @@ pub trait Device: Sized {
 }
 
 pub trait CmdBuf<D: Device> {
+    type ComputeEncoder;
+
     unsafe fn begin(&mut self);
 
     unsafe fn finish(&mut self);
@@ -231,6 +233,8 @@ pub trait CmdBuf<D: Device> {
 
     /// End a section opened by `begin_debug_label`.
     unsafe fn end_debug_label(&mut self) {}
+
+    unsafe fn new_compute_encoder(&mut self) -> Self::ComputeEncoder;
 }
 
 /// A builder for descriptor sets with more complex layouts.
@@ -252,3 +256,16 @@ pub trait DescriptorSetBuilder<D: Device> {
     fn add_textures(&mut self, images: &[&D::Image]);
     unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
 }
+
+pub trait ComputeEncoder<D: Device> {
+    unsafe fn dispatch(
+        &mut self,
+        pipeline: &D::Pipeline,
+        descriptor_set: &D::DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    );
+
+    // Question: should be self?
+    unsafe fn finish(&mut self);
+}
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index c96f971..23cc256 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex};
 use block::Block;
 use cocoa_foundation::base::id;
 use cocoa_foundation::foundation::{NSInteger, NSUInteger};
+use foreign_types::ForeignType;
 use objc::rc::autoreleasepool;
 use objc::runtime::{Object, BOOL, YES};
 use objc::{class, msg_send, sel, sel_impl};
@@ -36,6 +37,8 @@ use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
 
 use util::*;
 
+use self::timer::{CounterSampleBuffer, CounterSet};
+
 pub struct MtlInstance;
 
 pub struct MtlDevice {
@@ -43,6 +46,18 @@ pub struct MtlDevice {
     cmd_queue: Arc<Mutex<metal::CommandQueue>>,
     gpu_info: GpuInfo,
     helpers: Arc<Helpers>,
+    timer_set: Option<CounterSet>,
+    counter_style: CounterStyle,
+}
+
+/// Type of counter sampling.
+///
+/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
+#[derive(Clone, Copy, PartialEq, Eq)]
+enum CounterStyle {
+    None,
+    Stage,
+    Command,
 }
 
 pub struct MtlSurface {
@@ -85,6 +100,7 @@ pub struct CmdBuf {
     helpers: Arc<Helpers>,
     cur_encoder: Encoder,
     time_calibration: Arc<Mutex<TimeCalibration>>,
+    counter_style: CounterStyle,
 }
 
 enum Encoder {
@@ -101,7 +117,7 @@ struct TimeCalibration {
     gpu_end_ts: u64,
 }
 
-pub struct QueryPool;
+pub struct QueryPool(Option<CounterSampleBuffer>);
 
 pub struct Pipeline(metal::ComputePipelineState);
 
@@ -118,6 +134,10 @@ struct Helpers {
     clear_pipeline: metal::ComputePipelineState,
 }
 
+pub struct ComputeEncoder {
+    raw: metal::ComputeCommandEncoder,
+}
+
 impl MtlInstance {
     pub fn new(
         window_handle: Option<&dyn HasRawWindowHandle>,
@@ -228,15 +248,22 @@ impl MtlDevice {
             clear_pipeline: clear::make_clear_pipeline(&device),
         });
         // Timer stuff
-        if let Some(timer_set) = timer::CounterSet::get_timer_counter_set(&device) {
-            let timer = timer::CounterSampleBuffer::new(&device, 4, &timer_set);
-        }
+        let timer_set = CounterSet::get_timer_counter_set(&device);
+        let counter_style = if timer_set.is_some() {
+            // TODO: M1 is stage style, but should do proper runtime detection.
+            CounterStyle::Stage
+        } else {
+            CounterStyle::None
+        };
+
         MtlDevice {
             device,
             cmd_queue: Arc::new(Mutex::new(cmd_queue)),
             gpu_info,
             helpers,
-        }
+            timer_set,
+            counter_style,
+    }
     }
 
     pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
@@ -244,7 +271,13 @@ impl MtlDevice {
         let helpers = self.helpers.clone();
         let cur_encoder = Encoder::None;
         let time_calibration = Default::default();
-        CmdBuf { cmd_buf, helpers, cur_encoder, time_calibration }
+        CmdBuf {
+            cmd_buf,
+            helpers,
+            cur_encoder,
+            time_calibration,
+            counter_style: self.counter_style,
+        }
     }
 
     pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@@ -364,6 +397,7 @@ impl crate::backend::Device for MtlDevice {
             helpers,
             cur_encoder,
             time_calibration,
+            counter_style: self.counter_style,
         })
     }
 
@@ -372,12 +406,19 @@ impl crate::backend::Device for MtlDevice {
     }
 
     fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
-        // TODO
-        Ok(QueryPool)
+        if let Some(timer_set) = &self.timer_set {
+            let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
+                .ok_or("error creating timer query pool")?;
+            return Ok(QueryPool(Some(pool)));
+        }
+        Ok(QueryPool(None))
     }
 
     unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
-        // TODO
+        if let Some(raw) = &pool.0 {
+            let resolved = raw.resolve();
+            println!("resolved = {:?}", resolved);
+        }
         Ok(Vec::new())
     }
 
@@ -505,6 +546,8 @@ impl crate::backend::Device for MtlDevice {
 }
 
 impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
+    type ComputeEncoder = ComputeEncoder;
+
     unsafe fn begin(&mut self) {}
 
     unsafe fn finish(&mut self) {
@@ -647,12 +690,39 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         );
     }
 
-    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
+    unsafe fn reset_query_pool(&mut self, _pool: &QueryPool) {}
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        // TODO
-        // This really a PITA because it's pretty different than Vulkan.
-        // See https://developer.apple.com/documentation/metal/counter_sampling
+        if let Some(buf) = &pool.0 {
+            if matches!(self.cur_encoder, Encoder::None) {
+                self.cur_encoder =
+                    Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned());
+            }
+            let sample_index = query as NSUInteger;
+            if self.counter_style == CounterStyle::Command {
+                match &self.cur_encoder {
+                    Encoder::Compute(e) => {
+                        let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
+                    }
+                    Encoder::None => unreachable!(),
+                    _ => todo!(),
+                }
+            } else if self.counter_style == CounterStyle::Stage {
+                match &self.cur_encoder {
+                    Encoder::Compute(e) => {
+                        println!("here we are");
+                    }
+                    _ => (),
+                }
+            }
+        }
+    }
+
+    unsafe fn new_compute_encoder(&mut self) -> Self::ComputeEncoder {
+        let raw = self.cmd_buf.new_compute_command_encoder().to_owned();
+        ComputeEncoder {
+            raw
+        }
     }
 }
 
@@ -691,6 +761,43 @@ impl CmdBuf {
     }
 }
 
+impl crate::backend::ComputeEncoder<MtlDevice> for ComputeEncoder {
+    unsafe fn dispatch(
+        &mut self,
+        pipeline: &Pipeline,
+        descriptor_set: &DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    ) {
+        self.raw.set_compute_pipeline_state(&pipeline.0);
+        let mut buf_ix = 0;
+        for buffer in &descriptor_set.buffers {
+            self.raw.set_buffer(buf_ix, Some(&buffer.buffer), 0);
+            buf_ix += 1;
+        }
+        let mut img_ix = buf_ix;
+        for image in &descriptor_set.images {
+            self.raw.set_texture(img_ix, Some(&image.texture));
+            img_ix += 1;
+        }
+        let workgroup_count = metal::MTLSize {
+            width: workgroup_count.0 as u64,
+            height: workgroup_count.1 as u64,
+            depth: workgroup_count.2 as u64,
+        };
+        let workgroup_size = metal::MTLSize {
+            width: workgroup_size.0 as u64,
+            height: workgroup_size.1 as u64,
+            depth: workgroup_size.2 as u64,
+        };
+        self.raw.dispatch_thread_groups(workgroup_count, workgroup_size);
+    }
+
+    unsafe fn finish(&mut self) {
+        self.raw.end_encoding();
+    }
+}
+
 impl crate::backend::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.0.buffers.extend(buffers.iter().copied().cloned());
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
index 5830fee..a51bc6d 100644
--- a/piet-gpu-hal/src/metal/timer.rs
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -20,12 +20,16 @@
 
 use std::{ffi::CStr, ptr::null_mut};
 
-use cocoa_foundation::{base::id, foundation::NSUInteger};
-use metal::DeviceRef;
+use cocoa_foundation::{
+    base::id,
+    foundation::{NSRange, NSUInteger},
+};
+use metal::{DeviceRef, MTLStorageMode};
 use objc::{class, msg_send, sel, sel_impl};
 
 pub struct CounterSampleBuffer {
     id: id,
+    count: u64,
 }
 
 pub struct CounterSet {
@@ -43,11 +47,18 @@ impl Clone for CounterSampleBuffer {
         unsafe {
             CounterSampleBuffer {
                 id: msg_send![self.id, retain],
+                count: self.count,
             }
         }
     }
 }
 
+impl CounterSampleBuffer {
+    pub fn id(&self) -> id {
+        self.id
+    }
+}
+
 impl CounterSet {
     pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
         unsafe {
@@ -81,6 +92,10 @@ impl CounterSampleBuffer {
             let count = count as NSUInteger;
             let () = msg_send![descriptor, setSampleCount: count];
             let () = msg_send![descriptor, setCounterSet: counter_set.id];
+            let () = msg_send![
+                descriptor,
+                setStorageMode: MTLStorageMode::Shared as NSUInteger
+            ];
             let mut error: id = null_mut();
             let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
             let () = msg_send![descriptor, release];
@@ -88,11 +103,21 @@ impl CounterSampleBuffer {
                 let () = msg_send![error, release];
                 return None;
             }
-            Some(CounterSampleBuffer { id: buf })
+            Some(CounterSampleBuffer { id: buf, count })
         }
     }
 
-    pub fn id(&self) -> id {
-        self.id
+    // Read the timestamps.
+    //
+    // Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
+    pub unsafe fn resolve(&self) -> &[u64] {
+        let range = NSRange::new(0, self.count);
+        let data: id = msg_send![self.id, resolveCounterRange: range];
+        if data.is_null() {
+            &[]
+        } else {
+            let bytes: *const u64 = msg_send![data, bytes];
+            std::slice::from_raw_parts(bytes, self.count as usize)
+        }
     }
 }
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index af1702d..7853c2b 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -100,6 +100,14 @@ mux_device_enum! {
 QueryPool }
 mux_device_enum! { Sampler }
 
+mux_enum! {
+    pub enum ComputeEncoder {
+        Vk(<crate::vulkan::CmdBuf as crate::backend::CmdBuf<vulkan::VkDevice>>::ComputeEncoder),
+        Dx12(<crate::dx12::Dx12Device as crate::backend::CmdBuf<dx12::Dx12Device>>::ComputeEncoder),
+        Mtl(<crate::metal::CmdBuf as crate::backend::CmdBuf<metal::MtlDevice>>::ComputeEncoder),
+    }
+}
+
 /// The code for a shader, either as source or intermediate representation.
 pub enum ShaderCode<'a> {
     /// SPIR-V (binary intermediate representation)

From ba2b27cc3c987917715eea5b2c625832951bc92a Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Wed, 13 Apr 2022 10:31:38 -0700
Subject: [PATCH 03/12] Rework of compute encoder abstraction

The current plan is to more or less follow the wgpu/wgpu-hal approach. In the mux/backend layer (which corresponds fairly strongly to wgpu-hal), there isn't explicit construction of a compute encoder, but there are new methods for beginning and ending a compute pass. At the hub layer (which corresponds to wgpu) there will be a ComputeEncoder object.

That said, there will be some differences. The WebGPU "end" method on a compute encoder is implemented in wgpu as Drop, and that is not ideal. Also, the wgpu-hal approach to timer queries (still based on write_timestamp) is not up to the task of Metal timer queries, where the query offsets have to be specified at compute encoder creation. That's why there are different projects :)

WIP: current state is that stage-style queries work on Apple Silicon, but non-Metal backends are broken, and piet-gpu is not yet updated to use new API.
---
 piet-gpu-hal/examples/collatz.rs |   8 +-
 piet-gpu-hal/src/backend.rs      |  43 +++++-----
 piet-gpu-hal/src/hub.rs          |  49 ++++++++++-
 piet-gpu-hal/src/lib.rs          |  14 +++
 piet-gpu-hal/src/metal.rs        | 141 ++++++++++++++-----------------
 piet-gpu-hal/src/metal/timer.rs  |  27 +++++-
 piet-gpu-hal/src/mux.rs          |  25 ++++--
 7 files changed, 196 insertions(+), 111 deletions(-)

diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index dae5b31..7aff938 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -1,4 +1,4 @@
-use piet_gpu_hal::{include_shader, BindType};
+use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
 use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
 
 fn main() {
@@ -20,9 +20,9 @@ fn main() {
         let mut cmd_buf = session.cmd_buf().unwrap();
         cmd_buf.begin();
         cmd_buf.reset_query_pool(&query_pool);
-        cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
-        cmd_buf.write_timestamp(&query_pool, 1);
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
+        pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
+        pass.end();
         cmd_buf.finish_timestamps(&query_pool);
         cmd_buf.host_barrier();
         cmd_buf.finish();
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index c1b2132..f2c67a1 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -17,7 +17,8 @@
 //! The generic trait for backends to implement.
 
 use crate::{
-    BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
+    BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
+    MapMode, SamplerParams,
 };
 
 pub trait Device: Sized {
@@ -159,16 +160,32 @@ pub trait Device: Sized {
     unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
 }
 
+/// The trait implemented by backend command buffer implementations.
+///
+/// Valid encoding is represented by a state machine (currently not validated
+/// but it is easy to imagine there might be at least debug validation). Most
+/// methods are only valid in a particular state, and some move it to another
+/// state.
 pub trait CmdBuf<D: Device> {
-    type ComputeEncoder;
-
+    /// Begin encoding.
+    ///
+    /// State: init -> ready
     unsafe fn begin(&mut self);
 
+    /// State: ready -> finished
     unsafe fn finish(&mut self);
 
     /// Return true if the command buffer is suitable for reuse.
     unsafe fn reset(&mut self) -> bool;
 
+    /// Begin a compute pass.
+    ///
+    /// State: ready -> in_compute_pass
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
+
+    /// Dispatch
+    ///
+    /// State: in_compute_pass
     unsafe fn dispatch(
         &mut self,
         pipeline: &D::Pipeline,
@@ -177,6 +194,9 @@ pub trait CmdBuf<D: Device> {
         workgroup_size: (u32, u32, u32),
     );
 
+    /// State: in_compute_pass -> ready
+    unsafe fn end_compute_pass(&mut self);
+
     /// Insert an execution and memory barrier.
     ///
     /// Compute kernels (and other actions) after this barrier may read from buffers
@@ -229,12 +249,10 @@ pub trait CmdBuf<D: Device> {
     unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
 
     /// Begin a labeled section for debugging and profiling purposes.
-    unsafe fn begin_debug_label(&mut self, label: &str) {}
+    unsafe fn begin_debug_label(&mut self, _label: &str) {}
 
     /// End a section opened by `begin_debug_label`.
     unsafe fn end_debug_label(&mut self) {}
-
-    unsafe fn new_compute_encoder(&mut self) -> Self::ComputeEncoder;
 }
 
 /// A builder for descriptor sets with more complex layouts.
@@ -256,16 +274,3 @@ pub trait DescriptorSetBuilder<D: Device> {
     fn add_textures(&mut self, images: &[&D::Image]);
     unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
 }
-
-pub trait ComputeEncoder<D: Device> {
-    unsafe fn dispatch(
-        &mut self,
-        pipeline: &D::Pipeline,
-        descriptor_set: &D::DescriptorSet,
-        workgroup_count: (u32, u32, u32),
-        workgroup_size: (u32, u32, u32),
-    );
-
-    // Question: should be self?
-    unsafe fn finish(&mut self);
-}
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index cc09832..37c59df 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak};
 use bytemuck::Pod;
 use smallvec::SmallVec;
 
-use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
+use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
 
 use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
@@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> {
     size: u64,
 }
 
+/// A sub-object of a command buffer for a sequence of compute dispatches.
+pub struct ComputePass<'a> {
+    cmd_buf: &'a mut CmdBuf,
+}
+
 impl Session {
     /// Create a new session, choosing the best backend.
     pub fn new(device: mux::Device) -> Session {
@@ -471,6 +476,12 @@ impl CmdBuf {
         self.cmd_buf().finish();
     }
 
+    /// Begin a compute pass.
+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
+        self.cmd_buf().begin_compute_pass(desc);
+        ComputePass { cmd_buf: self }
+    }
+
     /// Dispatch a compute shader.
     ///
     /// Request a compute shader to be run, using the pipeline to specify the
@@ -479,6 +490,11 @@ impl CmdBuf {
     /// Both the workgroup count (number of workgroups) and the workgroup size
     /// (number of threads in a workgroup) must be specified here, though not
     /// all back-ends require the latter info.
+    ///
+    /// This version is deprecated because (a) you do not get timer queries and
+    /// (b) it doesn't aggregate multiple dispatches into a single compute
+    /// pass, which is a performance concern.
+    #[deprecated(note = "moving to ComputePass")]
     pub unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -486,8 +502,9 @@ impl CmdBuf {
         workgroup_count: (u32, u32, u32),
         workgroup_size: (u32, u32, u32),
     ) {
-        self.cmd_buf()
-            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+        let mut pass = self.begin_compute_pass(&Default::default());
+        pass.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+        pass.end();
     }
 
     /// Insert an execution and memory barrier.
@@ -692,6 +709,32 @@ impl Drop for SubmittedCmdBuf {
     }
 }
 
+impl<'a> ComputePass<'a> {
+    /// Dispatch a compute shader.
+    ///
+    /// Request a compute shader to be run, using the pipeline to specify the
+    /// code, and the descriptor set to address the resources read and written.
+    ///
+    /// Both the workgroup count (number of workgroups) and the workgroup size
+    /// (number of threads in a workgroup) must be specified here, though not
+    /// all back-ends require the latter info.
+    pub unsafe fn dispatch(
+        &mut self,
+        pipeline: &Pipeline,
+        descriptor_set: &DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    ) {
+        self.cmd_buf
+            .cmd_buf()
+            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    }
+
+    pub unsafe fn end(&mut self) {
+        self.cmd_buf.cmd_buf().end_compute_pass();
+    }
+}
+
 impl Drop for BufferInner {
     fn drop(&mut self) {
         if let Some(session) = Weak::upgrade(&self.session) {
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index fab7d65..241cdfd 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -189,3 +189,17 @@ pub struct WorkgroupLimits {
     /// dimension.
     pub max_invocations: u32,
 }
+
+#[derive(Default)]
+pub struct ComputePassDescriptor<'a> {
+    // Maybe label should go here? It does in wgpu and wgpu_hal.
+    timer_queries: Option<(&'a QueryPool, u32, u32)>,
+}
+
+impl<'a> ComputePassDescriptor<'a> {
+    pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
+        ComputePassDescriptor {
+            timer_queries: Some((pool, start_query, end_query)),
+        }
+    }
+}
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index 23cc256..c907d77 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -33,11 +33,13 @@ use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
 
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
-use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
+use crate::{
+    BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
+};
 
 use util::*;
 
-use self::timer::{CounterSampleBuffer, CounterSet};
+use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
 
 pub struct MtlInstance;
 
@@ -110,15 +112,11 @@ enum Encoder {
 }
 
 #[derive(Default)]
-struct TimeCalibration {
-    cpu_start_ts: u64,
-    gpu_start_ts: u64,
-    cpu_end_ts: u64,
-    gpu_end_ts: u64,
+pub struct QueryPool {
+    counter_sample_buf: Option<CounterSampleBuffer>,
+    calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
 }
 
-pub struct QueryPool(Option<CounterSampleBuffer>);
-
 pub struct Pipeline(metal::ComputePipelineState);
 
 #[derive(Default)]
@@ -134,10 +132,6 @@ struct Helpers {
     clear_pipeline: metal::ComputePipelineState,
 }
 
-pub struct ComputeEncoder {
-    raw: metal::ComputeCommandEncoder,
-}
-
 impl MtlInstance {
     pub fn new(
         window_handle: Option<&dyn HasRawWindowHandle>,
@@ -263,7 +257,7 @@ impl MtlDevice {
             helpers,
             timer_set,
             counter_style,
-    }
+        }
     }
 
     pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
@@ -409,16 +403,28 @@ impl crate::backend::Device for MtlDevice {
         if let Some(timer_set) = &self.timer_set {
             let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
                 .ok_or("error creating timer query pool")?;
-            return Ok(QueryPool(Some(pool)));
+            return Ok(QueryPool {
+                counter_sample_buf: Some(pool),
+                calibration: Default::default(),
+            });
         }
-        Ok(QueryPool(None))
+        Ok(QueryPool::default())
     }
 
     unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
-        if let Some(raw) = &pool.0 {
+        if let Some(raw) = &pool.counter_sample_buf {
             let resolved = raw.resolve();
-            println!("resolved = {:?}", resolved);
+            let calibration = pool.calibration.lock().unwrap();
+            if let Some(calibration) = &*calibration {
+                let calibration = calibration.lock().unwrap();
+                let result = resolved
+                    .iter()
+                    .map(|time_ns| calibration.correlate(*time_ns))
+                    .collect();
+                return Ok(result);
+            }
         }
+        // Maybe should return None indicating it wasn't successful? But that might break.
         Ok(Vec::new())
     }
 
@@ -444,10 +450,6 @@ impl crate::backend::Device for MtlDevice {
                 let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
                 // TODO: only do this if supported.
                 let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
-                println!(
-                    "scheduled, {}, {}",
-                    time_calibration.cpu_start_ts, time_calibration.gpu_start_ts
-                );
             })
             .copy();
             add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
@@ -461,10 +463,6 @@ impl crate::backend::Device for MtlDevice {
                     // TODO: only do this if supported.
                     let () =
                         msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
-                    println!(
-                        "completed, {}, {}",
-                        time_calibration.cpu_end_ts, time_calibration.gpu_end_ts
-                    );
                 })
                 .copy();
             cmd_buf.cmd_buf.add_completed_handler(&completed_block);
@@ -546,8 +544,6 @@ impl crate::backend::Device for MtlDevice {
 }
 
 impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
-    type ComputeEncoder = ComputeEncoder;
-
     unsafe fn begin(&mut self) {}
 
     unsafe fn finish(&mut self) {
@@ -558,6 +554,35 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         false
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        debug_assert!(matches!(self.cur_encoder, Encoder::None));
+        let encoder = if let Some(queries) = &desc.timer_queries {
+            let descriptor: id = msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+            let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+            let index: NSUInteger = 0;
+            let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+            // Here we break the hub/mux separation a bit, for expedience
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                    let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                }
+            }
+            let start_index = queries.1 as NSUInteger;
+            let end_index = queries.2 as NSInteger;
+            let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+            let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+            let encoder = msg_send![
+                self.cmd_buf,
+                computeCommandEncoderWithDescriptor: descriptor
+            ];
+            encoder
+        } else {
+            self.cmd_buf.new_compute_command_encoder()
+        };
+        self.cur_encoder = Encoder::Compute(encoder.to_owned());
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -590,6 +615,11 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        // TODO: might validate that we are in a compute encoder state
+        self.flush_encoder();
+    }
+
     unsafe fn memory_barrier(&mut self) {
         // We'll probably move to explicit barriers, but for now rely on
         // Metal's own tracking.
@@ -690,10 +720,13 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         );
     }
 
-    unsafe fn reset_query_pool(&mut self, _pool: &QueryPool) {}
+    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
+        let mut calibration = pool.calibration.lock().unwrap();
+        *calibration = Some(self.time_calibration.clone());
+    }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        if let Some(buf) = &pool.0 {
+        if let Some(buf) = &pool.counter_sample_buf {
             if matches!(self.cur_encoder, Encoder::None) {
                 self.cur_encoder =
                     Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned());
@@ -709,21 +742,14 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
                 }
             } else if self.counter_style == CounterStyle::Stage {
                 match &self.cur_encoder {
-                    Encoder::Compute(e) => {
-                        println!("here we are");
+                    Encoder::Compute(_e) => {
+                        println!("write_timestamp is not supported for stage-style encoders");
                     }
                     _ => (),
                 }
             }
         }
     }
-
-    unsafe fn new_compute_encoder(&mut self) -> Self::ComputeEncoder {
-        let raw = self.cmd_buf.new_compute_command_encoder().to_owned();
-        ComputeEncoder {
-            raw
-        }
-    }
 }
 
 impl CmdBuf {
@@ -761,43 +787,6 @@ impl CmdBuf {
     }
 }
 
-impl crate::backend::ComputeEncoder<MtlDevice> for ComputeEncoder {
-    unsafe fn dispatch(
-        &mut self,
-        pipeline: &Pipeline,
-        descriptor_set: &DescriptorSet,
-        workgroup_count: (u32, u32, u32),
-        workgroup_size: (u32, u32, u32),
-    ) {
-        self.raw.set_compute_pipeline_state(&pipeline.0);
-        let mut buf_ix = 0;
-        for buffer in &descriptor_set.buffers {
-            self.raw.set_buffer(buf_ix, Some(&buffer.buffer), 0);
-            buf_ix += 1;
-        }
-        let mut img_ix = buf_ix;
-        for image in &descriptor_set.images {
-            self.raw.set_texture(img_ix, Some(&image.texture));
-            img_ix += 1;
-        }
-        let workgroup_count = metal::MTLSize {
-            width: workgroup_count.0 as u64,
-            height: workgroup_count.1 as u64,
-            depth: workgroup_count.2 as u64,
-        };
-        let workgroup_size = metal::MTLSize {
-            width: workgroup_size.0 as u64,
-            height: workgroup_size.1 as u64,
-            depth: workgroup_size.2 as u64,
-        };
-        self.raw.dispatch_thread_groups(workgroup_count, workgroup_size);
-    }
-
-    unsafe fn finish(&mut self) {
-        self.raw.end_encoding();
-    }
-}
-
 impl crate::backend::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.0.buffers.extend(buffers.iter().copied().cloned());
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
index a51bc6d..a8b80d6 100644
--- a/piet-gpu-hal/src/metal/timer.rs
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -36,6 +36,14 @@ pub struct CounterSet {
     id: id,
 }
 
+#[derive(Default)]
+pub struct TimeCalibration {
+    pub cpu_start_ts: u64,
+    pub gpu_start_ts: u64,
+    pub cpu_end_ts: u64,
+    pub gpu_end_ts: u64,
+}
+
 impl Drop for CounterSampleBuffer {
     fn drop(&mut self) {
         unsafe { msg_send![self.id, release] }
@@ -87,7 +95,6 @@ impl CounterSampleBuffer {
         unsafe {
             let desc_cls = class!(MTLCounterSampleBufferDescriptor);
             let descriptor: id = msg_send![desc_cls, alloc];
-            println!("descriptor = {:?}", descriptor);
             let _: id = msg_send![descriptor, init];
             let count = count as NSUInteger;
             let () = msg_send![descriptor, setSampleCount: count];
@@ -121,3 +128,21 @@ impl CounterSampleBuffer {
         }
     }
 }
+
+impl TimeCalibration {
+    /// Convert GPU timestamp into CPU time base.
+    ///
+    /// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
+    pub fn correlate(&self, raw_ts: u64) -> f64 {
+        let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
+        let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
+        let adj_ts = if delta_gpu > 0 {
+            let scale = delta_cpu as f64 / delta_gpu as f64;
+            self.cpu_start_ts as f64 + (raw_ts - self.gpu_start_ts) as f64 * scale
+        } else {
+            // Default is ns on Apple Silicon; on other hardware this will be wrong
+            raw_ts as f64
+        };
+        adj_ts * 1e-9
+    }
+}
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index 7853c2b..9795193 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
 use crate::backend::Device as DeviceTrait;
 use crate::BackendType;
 use crate::BindType;
+use crate::ComputePassDescriptor;
 use crate::ImageFormat;
 use crate::MapMode;
 use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@@ -100,14 +101,6 @@ mux_device_enum! {
 QueryPool }
 mux_device_enum! { Sampler }
 
-mux_enum! {
-    pub enum ComputeEncoder {
-        Vk(<crate::vulkan::CmdBuf as crate::backend::CmdBuf<vulkan::VkDevice>>::ComputeEncoder),
-        Dx12(<crate::dx12::Dx12Device as crate::backend::CmdBuf<dx12::Dx12Device>>::ComputeEncoder),
-        Mtl(<crate::metal::CmdBuf as crate::backend::CmdBuf<metal::MtlDevice>>::ComputeEncoder),
-    }
-}
-
 /// The code for a shader, either as source or intermediate representation.
 pub enum ShaderCode<'a> {
     /// SPIR-V (binary intermediate representation)
@@ -666,6 +659,14 @@ impl CmdBuf {
         }
     }
 
+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.begin_compute_pass(desc),
+            CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
+            CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
+        }
+    }
+
     /// Dispatch a compute shader.
     ///
     /// Note that both the number of workgroups (`workgroup_count`) and the number of
@@ -688,6 +689,14 @@ impl CmdBuf {
         }
     }
 
+    pub unsafe fn end_compute_pass(&mut self) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.end_compute_pass(),
+            CmdBuf::Dx12(c) => c.end_compute_pass(),
+            CmdBuf::Mtl(c) => c.end_compute_pass(),
+        }
+    }
+
     pub unsafe fn memory_barrier(&mut self) {
         mux_match! { self;
             CmdBuf::Vk(c) => c.memory_barrier(),

From 9980c858b6e57666e849976af0f51ea4a21b0256 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Thu, 14 Apr 2022 16:27:28 -0700
Subject: [PATCH 04/12] Fix timer queries in Vulkan and DX12 backends

Current status: the piet-gpu-hal module (including the collatz example)
have the new API (with queries set on compute pass) implemented. The
other uses have not yet been updated.

On Metal, only M1 is tested. The "command" counter style is partly
implemented, but not fully wired up.
---
 piet-gpu-hal/src/dx12.rs          | 35 +++++++++++++++-----
 piet-gpu-hal/src/dx12/wrappers.rs |  3 --
 piet-gpu-hal/src/hub.rs           | 15 ++++++++-
 piet-gpu-hal/src/lib.rs           |  6 ++++
 piet-gpu-hal/src/vulkan.rs        | 55 ++++++++++++++++++++++---------
 5 files changed, 85 insertions(+), 29 deletions(-)

diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index 78ad449..c5e1e04 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
 use smallvec::SmallVec;
 
-use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
+use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
 
 use self::{
     descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@@ -76,6 +76,7 @@ pub struct CmdBuf {
     c: wrappers::GraphicsCommandList,
     allocator: CommandAllocator,
     needs_reset: bool,
+    end_query: Option<(wrappers::QueryHeap, u32)>,
 }
 
 pub struct Pipeline {
@@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
                 c,
                 allocator,
                 needs_reset: false,
+                end_query: None,
             })
         }
     }
@@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
         let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
         std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
         self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
-        let ts0 = buf[0];
         let tsp = (self.ts_freq as f64).recip();
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
         self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Dx12(pool) = pool {
+                self.write_timestamp(pool, *start);
+                self.end_query = Some((pool.heap.clone(), *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
             .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((heap, end)) = self.end_query.take() {
+            self.c.end_timing_query(&heap, end);
+        }
+    }
+
     unsafe fn memory_barrier(&mut self) {
         // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
         // The "proper" way to do this would be to name the actual buffers participating
@@ -666,7 +683,7 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
         self.memory_barrier();
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
         let (gpu_ref, heap) = buffer
             .gpu_ref
@@ -684,23 +701,23 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
         );
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         // TODO: consider using copy_resource here (if sizes match)
         let size = src.size.min(dst.size);
         self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         self.c
             .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         self.c
             .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         self.c.copy_resource(&src.resource, &dst.resource);
     }
 
diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs
index 4bbb86c..9a3fb90 100644
--- a/piet-gpu-hal/src/dx12/wrappers.rs
+++ b/piet-gpu-hal/src/dx12/wrappers.rs
@@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
 #[derive(Clone)]
 pub struct ShaderByteCode {
     pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
-    blob: Option<Blob>,
 }
 
 #[derive(Clone)]
@@ -741,7 +740,6 @@ impl ShaderByteCode {
                 BytecodeLength: blob.0.GetBufferSize(),
                 pShaderBytecode: blob.0.GetBufferPointer(),
             },
-            blob: Some(blob),
         }
     }
 
@@ -810,7 +808,6 @@ impl ShaderByteCode {
                 BytecodeLength: bytecode.len(),
                 pShaderBytecode: bytecode.as_ptr() as *const _,
             },
-            blob: None,
         }
     }
 }
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 37c59df..5c7122a 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -375,8 +375,17 @@ impl Session {
     ///
     /// This should be called after waiting on the command buffer that wrote the
     /// timer queries.
+    ///
+    /// The returned vector is one shorter than the number of timer queries in the
+    /// pool; the first value is subtracted off. It would likely be better to return
+    /// the raw timestamps, but that change should be made consistently.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
-        self.0.device.fetch_query_pool(pool)
+        let result = self.0.device.fetch_query_pool(pool)?;
+        // Subtract off first timestamp.
+        Ok(result[1..]
+            .iter()
+            .map(|ts| *ts as f64 - result[0])
+            .collect())
     }
 
     #[doc(hidden)]
@@ -602,6 +611,10 @@ impl CmdBuf {
     /// Write a timestamp.
     ///
     /// The query index must be less than the size of the query pool on creation.
+    ///
+    /// Deprecation: for greater portability, set timestamp queries on compute
+    /// passes instead.
+    #[deprecated(note = "use compute pass descriptor instead")]
     pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
         self.cmd_buf().write_timestamp(pool, query);
     }
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 241cdfd..18f6390 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -190,9 +190,15 @@ pub struct WorkgroupLimits {
     pub max_invocations: u32,
 }
 
+/// Options for creating a compute pass.
 #[derive(Default)]
 pub struct ComputePassDescriptor<'a> {
     // Maybe label should go here? It does in wgpu and wgpu_hal.
+    /// Timer query parameters.
+    ///
+    /// To record timer queries for a compute pass, set the query pool, start
+    /// query index, and end query index here. The indices must be less than
+    /// the size of the query pool.
     timer_queries: Option<(&'a QueryPool, u32, u32)>,
 }
 
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 8392899..504d947 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -15,7 +15,7 @@ use smallvec::SmallVec;
 use crate::backend::Device as DeviceTrait;
 use crate::{
     BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
-    WorkgroupLimits,
+    WorkgroupLimits, ComputePassDescriptor,
 };
 
 pub struct VkInstance {
@@ -92,6 +92,7 @@ pub struct CmdBuf {
     cmd_buf: vk::CommandBuffer,
     cmd_pool: vk::CommandPool,
     device: Arc<RawDevice>,
+    end_query: Option<(vk::QueryPool, u32)>,
 }
 
 pub struct QueryPool {
@@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
                 cmd_buf,
                 cmd_pool,
                 device: self.device.clone(),
+                end_query: None,
             })
         }
     }
@@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
         // results (Windows 10, AMD 5700 XT).
         let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
         device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
-        let ts0 = buf[0];
         let tsp = self.timestamp_period as f64 * 1e-9;
-        let result = buf[1..]
+        let result = buf
             .iter()
-            .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
+            .map(|ts| *ts as f64 * tsp)
             .collect();
         Ok(result)
     }
@@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
         true
     }
 
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        if let Some((pool, start, end)) = &desc.timer_queries {
+            #[allow(irrefutable_let_patterns)]
+            if let crate::hub::QueryPool::Vk(pool) = pool {
+                self.write_timestamp_raw(pool.pool, *start);
+                self.end_query = Some((pool.pool, *end));
+            }
+        }
+    }
+
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
@@ -931,6 +942,12 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
         );
     }
 
+    unsafe fn end_compute_pass(&mut self) {
+        if let Some((pool, end)) = self.end_query.take() {
+            self.write_timestamp_raw(pool, end);
+        }
+    }
+
     /// Insert a pipeline barrier for all memory accesses.
     unsafe fn memory_barrier(&mut self) {
         let device = &self.device.device;
@@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
         );
     }
 
-    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
+    unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
         let device = &self.device.device;
         let size = size.unwrap_or(vk::WHOLE_SIZE);
         device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
     }
 
-    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
+    unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
         let device = &self.device.device;
         let size = src.size.min(dst.size);
         device.cmd_copy_buffer(
@@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
         );
     }
 
-    unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
+    unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
         let device = &self.device.device;
         device.cmd_copy_image_to_buffer(
             self.cmd_buf,
@@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
         );
     }
 
-    unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
+    unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
         let device = &self.device.device;
         device.cmd_copy_buffer_to_image(
             self.cmd_buf,
@@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
         );
     }
 
-    unsafe fn blit_image(&self, src: &Image, dst: &Image) {
+    unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
         let device = &self.device.device;
         device.cmd_blit_image(
             self.cmd_buf,
@@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
     }
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        let device = &self.device.device;
-        device.cmd_write_timestamp(
-            self.cmd_buf,
-            vk::PipelineStageFlags::COMPUTE_SHADER,
-            pool.pool,
-            query,
-        );
+        self.write_timestamp_raw(pool.pool, query);
     }
 
     unsafe fn begin_debug_label(&mut self, label: &str) {
@@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
     }
 }
 
+impl CmdBuf {
+    unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
+        let device = &self.device.device;
+        device.cmd_write_timestamp(
+            self.cmd_buf,
+            vk::PipelineStageFlags::COMPUTE_SHADER,
+            pool,
+            query,
+        );
+    }
+}
+
 impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.buffers.extend(buffers.iter().map(|b| b.buffer));

From 58836244a44a080e08c68222cc9d6b731b26162c Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Wed, 20 Apr 2022 09:19:22 -0700
Subject: [PATCH 05/12] Change signature of ComputePass::end() to self

As per review comment.
---
 piet-gpu-hal/src/hub.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 5c7122a..6940b24 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -743,7 +743,7 @@ impl<'a> ComputePass<'a> {
             .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
     }
 
-    pub unsafe fn end(&mut self) {
+    pub unsafe fn end(self) {
         self.cmd_buf.cmd_buf().end_compute_pass();
     }
 }

From 5a9b8d924317dce7a38babc410de6bcfa944c1aa Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Wed, 20 Apr 2022 10:21:49 -0700
Subject: [PATCH 06/12] Start applying compute pass to tests

Use compute pass for tests in tests subdir. This is also shaking out some issues that weren't apparent from just collatz.

In particular, we need more autorelease pools to prevent things from leaking. As of this commit, the "clear" test runs correctly but the others haven't yet been converted to the compute_pass format.
---
 piet-gpu-hal/src/lib.rs         |  4 +-
 piet-gpu-hal/src/metal.rs       | 75 +++++++++++++++++++++------------
 piet-gpu-hal/src/metal/timer.rs | 26 +++++++++++-
 tests/src/clear.rs              | 19 +++------
 tests/src/runner.rs             | 15 ++++++-
 5 files changed, 96 insertions(+), 43 deletions(-)

diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 18f6390..a1073f4 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -21,8 +21,8 @@ pub use crate::mux::{
 };
 pub use bufwrite::BufWrite;
 pub use hub::{
-    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
-    Session, SubmittedCmdBuf,
+    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
+    RetainResource, Session, SubmittedCmdBuf,
 };
 
 // TODO: because these are conditionally included, "cargo fmt" does not
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index c907d77..9b4468c 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -381,8 +381,25 @@ impl crate::backend::Device for MtlDevice {
 
     fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
         let cmd_queue = self.cmd_queue.lock().unwrap();
+        // A discussion about autorelease pools.
+        //
+        // Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
+        // you can have any two of correctness, ergonomics, and performance. Here we've
+        // chosen the first two, using the pattern of a fine grained autorelease pool
+        // to give the Obj-C object Rust-like lifetime semantics whenever objects are
+        // created as autorelease (by convention, this is any object creation with an
+        // Obj-C method name that doesn't begin with "new" or "alloc").
+        //
+        // To gain back some of the performance, we'd need a way to wrap an autorelease
+        // pool over a chunk of work - that could be one frame of rendering, but for
+        // tests that iterate a number of command buffer submissions, it would need to
+        // be around that. On non-mac platforms, it would be a no-op.
+        //
+        // In any case, this way, the caller doesn't need to worry, and the performance
+        // hit might not be so bad (perhaps we should measure).
+
         // consider new_command_buffer_with_unretained_references for performance
-        let cmd_buf = cmd_queue.new_command_buffer().to_owned();
+        let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
         let helpers = self.helpers.clone();
         let cur_encoder = Encoder::None;
         let time_calibration = Default::default();
@@ -555,32 +572,38 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
     }
 
     unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
-        debug_assert!(matches!(self.cur_encoder, Encoder::None));
-        let encoder = if let Some(queries) = &desc.timer_queries {
-            let descriptor: id = msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
-            let attachments: id = msg_send![descriptor, sampleBufferAttachments];
-            let index: NSUInteger = 0;
-            let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
-            // Here we break the hub/mux separation a bit, for expedience
-            #[allow(irrefutable_let_patterns)]
-            if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
-                if let Some(sample_buf) = &query_pool.counter_sample_buf {
-                    let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+        // TODO: we might want to get better about validation but the following
+        // assert is likely to trigger, and also a case can be made that
+        // validation should be done at the hub level, for consistency.
+        //debug_assert!(matches!(self.cur_encoder, Encoder::None));
+        self.flush_encoder();
+        autoreleasepool(|| {
+            let encoder = if let Some(queries) = &desc.timer_queries {
+                let descriptor: id =
+                    msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+                let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+                let index: NSUInteger = 0;
+                let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+                // Here we break the hub/mux separation a bit, for expedience
+                #[allow(irrefutable_let_patterns)]
+                if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                    if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                        let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                    }
                 }
-            }
-            let start_index = queries.1 as NSUInteger;
-            let end_index = queries.2 as NSInteger;
-            let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
-            let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
-            let encoder = msg_send![
-                self.cmd_buf,
-                computeCommandEncoderWithDescriptor: descriptor
-            ];
-            encoder
-        } else {
-            self.cmd_buf.new_compute_command_encoder()
-        };
-        self.cur_encoder = Encoder::Compute(encoder.to_owned());
+                let start_index = queries.1 as NSUInteger;
+                let end_index = queries.2 as NSInteger;
+                let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+                let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+                msg_send![
+                    self.cmd_buf,
+                    computeCommandEncoderWithDescriptor: descriptor
+                ]
+            } else {
+                self.cmd_buf.new_compute_command_encoder()
+            };
+            self.cur_encoder = Encoder::Compute(encoder.to_owned());
+        });
     }
 
     unsafe fn dispatch(
diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs
index a8b80d6..65c8026 100644
--- a/piet-gpu-hal/src/metal/timer.rs
+++ b/piet-gpu-hal/src/metal/timer.rs
@@ -67,6 +67,12 @@ impl CounterSampleBuffer {
     }
 }
 
+impl Drop for CounterSet {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
 impl CounterSet {
     pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
         unsafe {
@@ -86,6 +92,19 @@ impl CounterSet {
     }
 }
 
+// copied from metal-rs; should be in common utilities maybe?
+fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
+    let bytes = unsafe {
+        let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
+        bytes as *const u8
+    };
+    let len: NSUInteger = unsafe { msg_send![nsstr, length] };
+    unsafe {
+        let bytes = std::slice::from_raw_parts(bytes, len as usize);
+        std::str::from_utf8(bytes).unwrap()
+    }
+}
+
 impl CounterSampleBuffer {
     pub fn new(
         device: &DeviceRef,
@@ -107,6 +126,11 @@ impl CounterSampleBuffer {
             let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
             let () = msg_send![descriptor, release];
             if !error.is_null() {
+                let description = msg_send![error, localizedDescription];
+                println!(
+                    "error allocating sample buffer, code = {}",
+                    nsstring_as_str(description)
+                );
                 let () = msg_send![error, release];
                 return None;
             }
@@ -138,7 +162,7 @@ impl TimeCalibration {
         let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
         let adj_ts = if delta_gpu > 0 {
             let scale = delta_cpu as f64 / delta_gpu as f64;
-            self.cpu_start_ts as f64 + (raw_ts - self.gpu_start_ts) as f64 * scale
+            self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
         } else {
             // Default is ns on Apple Silicon; on other hardware this will be wrong
             raw_ts as f64
diff --git a/tests/src/clear.rs b/tests/src/clear.rs
index fc6f063..af4b8ea 100644
--- a/tests/src/clear.rs
+++ b/tests/src/clear.rs
@@ -16,11 +16,11 @@
 
 //! Utilities (and a benchmark) for clearing buffers with compute shaders.
 
-use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
+use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
 use piet_gpu_hal::{Buffer, Pipeline};
 
 use crate::config::Config;
-use crate::runner::{Commands, Runner};
+use crate::runner::Runner;
 use crate::test_result::TestResult;
 
 const WG_SIZE: u64 = 256;
@@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -108,17 +108,12 @@ impl ClearStage {
         ClearBinding { descriptor_set }
     }
 
-    pub unsafe fn record(
-        &self,
-        commands: &mut Commands,
-        code: &ClearCode,
-        bindings: &ClearBinding,
-    ) {
+    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
         let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
         // An issue: for clearing large buffers (>16M), we need to check the
         // number of workgroups against the (dynamically detected) limit, and
         // potentially issue multiple dispatches.
-        commands.cmd_buf.dispatch(
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
diff --git a/tests/src/runner.rs b/tests/src/runner.rs
index 1fd6774..f97e15a 100644
--- a/tests/src/runner.rs
+++ b/tests/src/runner.rs
@@ -20,8 +20,8 @@ use std::ops::RangeBounds;
 
 use bytemuck::Pod;
 use piet_gpu_hal::{
-    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
-    QueryPool, Session,
+    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
+    ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
 };
 
 pub struct Runner {
@@ -118,10 +118,21 @@ impl Runner {
 }
 
 impl Commands {
+    #[deprecated(note = "use compute_pass instead")]
     pub unsafe fn write_timestamp(&mut self, query: u32) {
         self.cmd_buf.write_timestamp(&self.query_pool, query);
     }
 
+    /// Start a compute pass with timer queries.
+    pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
+        self.cmd_buf
+            .begin_compute_pass(&ComputePassDescriptor::timer(
+                &self.query_pool,
+                start_query,
+                end_query,
+            ))
+    }
+
     pub unsafe fn upload(&mut self, buf: &BufStage) {
         self.cmd_buf.copy_buffer(&buf.stage_buf, &buf.dev_buf);
     }

From ea0fbab8aa7d405a7c091407ea796e41890a0152 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Wed, 20 Apr 2022 15:13:54 -0700
Subject: [PATCH 07/12] Add memory barrier to ComputePass

We need to be able to call memory_barrier() on ComputePass, to avoid the borrow checker complaining if we tried to call it on the underlying command buffer.
---
 piet-gpu-hal/src/hub.rs | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 6940b24..627b97b 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -743,6 +743,15 @@ impl<'a> ComputePass<'a> {
             .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
     }
 
+    /// Add a memory barrier.
+    ///
+    /// Inserts a memory barrier in the compute encoder. This is a convenience
+    /// function for calling the same function on the underlying command buffer,
+    /// avoiding borrow check issues.
+    pub unsafe fn memory_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
     pub unsafe fn end(self) {
         self.cmd_buf.cmd_buf().end_compute_pass();
     }

From 02cc8679506471811b707bdb6fd74ee69cd41fac Mon Sep 17 00:00:00 2001
From: chad <cbrokaw@gmail.com>
Date: Thu, 21 Apr 2022 04:20:54 -0400
Subject: [PATCH 08/12] command style metal timer queries + compute pass

This commit adds timestamps to compute pass boundaries for command style timer queries on metal.

It also updates the code in piet-gpu/stages, piet-gpu/lib.rs and tests/ to use the new ComputePass type.
---
 piet-gpu-hal/src/metal.rs        | 90 +++++++++++++++++++++-----------
 piet-gpu/bin/winit.rs            | 13 +++--
 piet-gpu/src/lib.rs              | 63 +++++++++++++---------
 piet-gpu/src/stages.rs           | 10 ++--
 piet-gpu/src/stages/clip.rs      | 12 ++---
 piet-gpu/src/stages/draw.rs      | 14 ++---
 piet-gpu/src/stages/path.rs      | 16 +++---
 piet-gpu/src/stages/transform.rs | 14 ++---
 tests/src/clip.rs                |  6 +--
 tests/src/draw.rs                |  6 +--
 tests/src/linkedlist.rs          |  6 +--
 tests/src/message_passing.rs     |  6 +--
 tests/src/path.rs                |  6 +--
 tests/src/prefix.rs              |  6 +--
 tests/src/prefix_tree.rs         | 14 ++---
 tests/src/transform.rs           |  6 +--
 16 files changed, 166 insertions(+), 122 deletions(-)

diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index 9b4468c..b2189e4 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -107,7 +107,7 @@ pub struct CmdBuf {
 
 enum Encoder {
     None,
-    Compute(metal::ComputeCommandEncoder),
+    Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
     Blit(metal::BlitCommandEncoder),
 }
 
@@ -578,31 +578,52 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         //debug_assert!(matches!(self.cur_encoder, Encoder::None));
         self.flush_encoder();
         autoreleasepool(|| {
-            let encoder = if let Some(queries) = &desc.timer_queries {
-                let descriptor: id =
-                    msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
-                let attachments: id = msg_send![descriptor, sampleBufferAttachments];
-                let index: NSUInteger = 0;
-                let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
-                // Here we break the hub/mux separation a bit, for expedience
-                #[allow(irrefutable_let_patterns)]
-                if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
-                    if let Some(sample_buf) = &query_pool.counter_sample_buf {
-                        let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+            let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
+                (Some(queries), CounterStyle::Stage) => {
+                    let descriptor: id =
+                        msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+                    let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+                    let index: NSUInteger = 0;
+                    let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+                    // Here we break the hub/mux separation a bit, for expedience
+                    #[allow(irrefutable_let_patterns)]
+                    if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                            let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                        }
                     }
+                    let start_index = queries.1 as NSUInteger;
+                    let end_index = queries.2 as NSInteger;
+                    let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+                    let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+                    (
+                        msg_send![
+                            self.cmd_buf,
+                            computeCommandEncoderWithDescriptor: descriptor
+                        ],
+                        None,
+                    )
                 }
-                let start_index = queries.1 as NSUInteger;
-                let end_index = queries.2 as NSInteger;
-                let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
-                let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
-                msg_send![
-                    self.cmd_buf,
-                    computeCommandEncoderWithDescriptor: descriptor
-                ]
-            } else {
-                self.cmd_buf.new_compute_command_encoder()
+                (Some(queries), CounterStyle::Command) => {
+                    let encoder = self.cmd_buf.new_compute_command_encoder();
+                    #[allow(irrefutable_let_patterns)]
+                    let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                        if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                            let sample_index = queries.1 as NSUInteger;
+                            let sample_buf = sample_buf.id();
+                            let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
+                            Some((sample_buf, queries.2))
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    };
+                    (encoder, end_query)
+                }
+                _ => (self.cmd_buf.new_compute_command_encoder(), None),
             };
-            self.cur_encoder = Encoder::Compute(encoder.to_owned());
+            self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
         });
     }
 
@@ -663,7 +684,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         let size = size.unwrap_or(buffer.size);
         let _ = self.compute_command_encoder();
         // Getting this directly is a workaround for a borrow checker issue.
-        if let Encoder::Compute(e) = &self.cur_encoder {
+        if let Encoder::Compute(e, _) = &self.cur_encoder {
             clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
         }
     }
@@ -752,12 +773,12 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
         if let Some(buf) = &pool.counter_sample_buf {
             if matches!(self.cur_encoder, Encoder::None) {
                 self.cur_encoder =
-                    Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned());
+                    Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
             }
             let sample_index = query as NSUInteger;
             if self.counter_style == CounterStyle::Command {
                 match &self.cur_encoder {
-                    Encoder::Compute(e) => {
+                    Encoder::Compute(e, _) => {
                         let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
                     }
                     Encoder::None => unreachable!(),
@@ -765,7 +786,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
                 }
             } else if self.counter_style == CounterStyle::Stage {
                 match &self.cur_encoder {
-                    Encoder::Compute(_e) => {
+                    Encoder::Compute(_e, _) => {
                         println!("write_timestamp is not supported for stage-style encoders");
                     }
                     _ => (),
@@ -777,12 +798,12 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
 
 impl CmdBuf {
     fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
-        if !matches!(self.cur_encoder, Encoder::Compute(_)) {
+        if !matches!(self.cur_encoder, Encoder::Compute(..)) {
             self.flush_encoder();
             self.cur_encoder =
-                Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned());
+                Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
         }
-        if let Encoder::Compute(e) = &self.cur_encoder {
+        if let Encoder::Compute(e, _) = &self.cur_encoder {
             e
         } else {
             unreachable!()
@@ -803,7 +824,14 @@ impl CmdBuf {
 
     fn flush_encoder(&mut self) {
         match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
-            Encoder::Compute(e) => e.end_encoding(),
+            Encoder::Compute(e, Some((sample_buf, end_query))) => {
+                let sample_index = end_query as NSUInteger;
+                unsafe {
+                    let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
+                }
+                e.end_encoding();
+            }
+            Encoder::Compute(e, None) => e.end_encoding(),
             Encoder::Blit(e) => e.end_encoding(),
             Encoder::None => (),
         }
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index b1db5e0..1642026 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
             .map(|_| session.create_semaphore())
             .collect::<Result<Vec<_>, Error>>()?;
         let query_pools = (0..NUM_FRAMES)
-            .map(|_| session.create_query_pool(8))
+            .map(|_| session.create_query_pool(12))
             .collect::<Result<Vec<_>, Error>>()?;
         let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
         let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@@ -112,22 +112,23 @@ fn main() -> Result<(), Error> {
                         if !ts.is_empty() {
                             info_string = format!(
                                 "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
-                                ts[6] * 1e3,
+                                ts[10] * 1e3,
                                 ts[0] * 1e3,
                                 (ts[1] - ts[0]) * 1e3,
                                 (ts[2] - ts[1]) * 1e3,
-                                (ts[3] - ts[2]) * 1e3,
                                 (ts[4] - ts[3]) * 1e3,
-                                (ts[5] - ts[4]) * 1e3,
                                 (ts[6] - ts[5]) * 1e3,
+                                (ts[8] - ts[7]) * 1e3,
+                                (ts[10] - ts[9]) * 1e3,
                             );
                         }
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
+                    let test_blend = false;
                     if let Some(svg) = &svg {
                         test_scenes::render_svg(&mut ctx, svg);
-                    } else {
+                    } else if test_blend {
                         use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
                         let blends = [
                             Blend::new(Normal, SrcOver),
@@ -163,6 +164,8 @@ fn main() -> Result<(), Error> {
                         let blend = blends[mode % blends.len()];
                         test_scenes::render_blend_test(&mut ctx, current_frame, blend);
                         info_string = format!("{:?}", blend);
+                    } else {
+                        test_scenes::render_anim_frame(&mut ctx, current_frame);
                     }
                     render_info_string(&mut ctx, &info_string);
                     if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 249735a..acbd28d 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -17,8 +17,8 @@ use piet::kurbo::Vec2;
 use piet::{ImageFormat, RenderContext};
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image,
-    ImageLayout, Pipeline, QueryPool, Session,
+    include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
+    Error, Image, ImageLayout, Pipeline, QueryPool, Session,
 };
 
 pub use pico_svg::PicoSvg;
@@ -423,10 +423,11 @@ impl Renderer {
         cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
         cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
         cmd_buf.reset_query_pool(&query_pool);
-        cmd_buf.write_timestamp(&query_pool, 0);
         cmd_buf.begin_debug_label("Element bounding box calculation");
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
+        // cmd_buf.write_timestamp(&query_pool, 0);
         self.element_stage.record(
-            cmd_buf,
+            &mut pass,
             &self.element_code,
             &self.element_bindings[buf_ix],
             self.n_transform as u64,
@@ -434,56 +435,64 @@ impl Renderer {
             self.n_pathtag as u32,
             self.n_drawobj as u64,
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 1);
+        // cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Clip bounding box calculation");
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
         self.clip_binding
-            .record(cmd_buf, &self.clip_code, self.n_clip as u32);
-        cmd_buf.end_debug_label();
-        cmd_buf.begin_debug_label("Element binning");
-        cmd_buf.dispatch(
+            .record(&mut pass, &self.clip_code, self.n_clip as u32);
+        // cmd_buf.end_debug_label();
+        // cmd_buf.begin_debug_label("Element binning");
+        pass.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        cmd_buf.end_debug_label();
-        cmd_buf.memory_barrier();
-        cmd_buf.begin_debug_label("Tile allocation");
-        cmd_buf.dispatch(
+        // cmd_buf.end_debug_label();
+        pass.memory_barrier();
+        // cmd_buf.begin_debug_label("Tile allocation");
+        pass.dispatch(
             &self.tile_pipeline,
             &self.tile_ds[buf_ix],
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 2);
-        cmd_buf.memory_barrier();
+        // cmd_buf.end_debug_label();
+        pass.end();
+        // cmd_buf.write_timestamp(&query_pool, 2);
         cmd_buf.begin_debug_label("Path flattening");
-        cmd_buf.dispatch(
+        cmd_buf.memory_barrier();
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
+        pass.dispatch(
             &self.path_pipeline,
             &self.path_ds,
             (((self.n_pathseg + 31) / 32) as u32, 1, 1),
             (32, 1, 1),
         );
+        pass.end();
+        // cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Backdrop propagation");
-        cmd_buf.dispatch(
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
+        pass.dispatch(
             &self.backdrop_pipeline,
             &self.backdrop_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, self.backdrop_y, 1),
         );
+        pass.end();
+        // cmd_buf.write_timestamp(&query_pool, 4);
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 4);
         // TODO: redo query accounting
-        cmd_buf.write_timestamp(&query_pool, 5);
+        // cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Coarse raster");
-        cmd_buf.dispatch(
+        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
+        pass.dispatch(
             &self.coarse_pipeline,
             &self.coarse_ds[buf_ix],
             (
@@ -493,11 +502,14 @@ impl Renderer {
             ),
             (256, 1, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 6);
+        // cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Fine raster");
-        cmd_buf.dispatch(
+        let mut pass =
+            cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
+        pass.dispatch(
             &self.k4_pipeline,
             &self.k4_ds,
             (
@@ -507,8 +519,9 @@ impl Renderer {
             ),
             (8, 4, 1),
         );
+        pass.end();
         cmd_buf.end_debug_label();
-        cmd_buf.write_timestamp(&query_pool, 7);
+        // cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
index 52b8bf1..5442ba3 100644
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable};
 pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
 pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
 pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
-use piet_gpu_hal::{Buffer, CmdBuf, Session};
+use piet_gpu_hal::{Buffer, ComputePass, Session};
 pub use transform::{
     Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
 };
@@ -140,7 +140,7 @@ impl ElementStage {
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &ElementCode,
         binding: &ElementBinding,
         n_transform: u64,
@@ -149,14 +149,14 @@ impl ElementStage {
         n_drawobj: u64,
     ) {
         self.transform_stage.record(
-            cmd_buf,
+            pass,
             &code.transform_code,
             &binding.transform_binding,
             n_transform,
         );
         // No memory barrier needed here; path has at least one before pathseg
         self.path_stage.record(
-            cmd_buf,
+            pass,
             &code.path_code,
             &binding.path_binding,
             n_paths,
@@ -164,6 +164,6 @@ impl ElementStage {
         );
         // No memory barrier needed here; draw has at least one before draw_leaf
         self.draw_stage
-            .record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
+            .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
     }
 }
diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs
index e4bc3db..2fd195b 100644
--- a/piet-gpu/src/stages/clip.rs
+++ b/piet-gpu/src/stages/clip.rs
@@ -16,7 +16,7 @@
 
 //! The clip processing stage (includes substages).
 
-use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
+use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
 
 // Note that this isn't the code/stage/binding pattern of most of the other stages
 // in the new element processing pipeline. We want to move those temporary buffers
@@ -69,26 +69,26 @@ impl ClipBinding {
     /// Record the clip dispatches.
     ///
     /// Assumes memory barrier on entry. Provides memory barrier on exit.
-    pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
+    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
         let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
         if n_wg_reduce > 0 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &self.reduce_ds,
                 (n_wg_reduce, 1, 1),
                 (CLIP_PART_SIZE, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
         let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
         if n_wg > 0 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.leaf_pipeline,
                 &self.leaf_ds,
                 (n_wg, 1, 1),
                 (CLIP_PART_SIZE, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
     }
 }
diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs
index 21312a4..f0ee2b6 100644
--- a/piet-gpu/src/stages/draw.rs
+++ b/piet-gpu/src/stages/draw.rs
@@ -19,7 +19,7 @@
 use bytemuck::{Pod, Zeroable};
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 /// The output element of the draw object stage.
@@ -130,7 +130,7 @@ impl DrawStage {
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &DrawCode,
         binding: &DrawBinding,
         size: u64,
@@ -140,22 +140,22 @@ impl DrawStage {
         }
         let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
         if n_workgroups > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_workgroups as u32, 1, 1),
                 (DRAW_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.root_pipeline,
                 &self.root_ds,
                 (1, 1, 1),
                 (DRAW_WG as u32, 1, 1),
             );
         }
-        cmd_buf.memory_barrier();
-        cmd_buf.dispatch(
+        pass.memory_barrier();
+        pass.dispatch(
             &code.leaf_pipeline,
             &binding.leaf_ds,
             (n_workgroups as u32, 1, 1),
diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs
index 6c524a2..be33041 100644
--- a/piet-gpu/src/stages/path.rs
+++ b/piet-gpu/src/stages/path.rs
@@ -17,7 +17,7 @@
 //! The path stage (includes substages).
 
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 pub struct PathCode {
@@ -148,7 +148,7 @@ impl PathStage {
     /// those are consumed. Result is written without barrier.
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &PathCode,
         binding: &PathBinding,
         n_paths: u32,
@@ -166,15 +166,15 @@ impl PathStage {
         let reduce_part_tags = REDUCE_PART_SIZE * 4;
         let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
         if n_wg_tag_reduce > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_wg_tag_reduce, 1, 1),
                 (REDUCE_WG, 1, 1),
             );
             // I think we can skip root if n_wg_tag_reduce == 2
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.tag_root_pipeline,
                 &self.tag_root_ds,
                 (1, 1, 1),
@@ -183,15 +183,15 @@ impl PathStage {
             // No barrier needed here; clear doesn't depend on path tags
         }
         let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.clear_pipeline,
             &binding.clear_ds,
             (n_wg_clear, 1, 1),
             (CLEAR_WG, 1, 1),
         );
-        cmd_buf.memory_barrier();
+        pass.memory_barrier();
         let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.pathseg_pipeline,
             &binding.path_ds,
             (n_wg_pathseg, 1, 1),
diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs
index b21712f..8de7cee 100644
--- a/piet-gpu/src/stages/transform.rs
+++ b/piet-gpu/src/stages/transform.rs
@@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable};
 
 use piet::kurbo::Affine;
 use piet_gpu_hal::{
-    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+    include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
 };
 
 /// An affine transform.
@@ -132,7 +132,7 @@ impl TransformStage {
 
     pub unsafe fn record(
         &self,
-        cmd_buf: &mut CmdBuf,
+        pass: &mut ComputePass,
         code: &TransformCode,
         binding: &TransformBinding,
         size: u64,
@@ -142,22 +142,22 @@ impl TransformStage {
         }
         let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
         if n_workgroups > 1 {
-            cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &binding.reduce_ds,
                 (n_workgroups as u32, 1, 1),
                 (TRANSFORM_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
-            cmd_buf.dispatch(
+            pass.memory_barrier();
+            pass.dispatch(
                 &code.root_pipeline,
                 &self.root_ds,
                 (1, 1, 1),
                 (TRANSFORM_WG as u32, 1, 1),
             );
-            cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
-        cmd_buf.dispatch(
+        pass.dispatch(
             &code.leaf_pipeline,
             &binding.leaf_ds,
             (n_workgroups as u32, 1, 1),
diff --git a/tests/src/clip.rs b/tests/src/clip.rs
index 4a38949..b1f8613 100644
--- a/tests/src/clip.rs
+++ b/tests/src/clip.rs
@@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
     let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
 
     let mut commands = runner.commands();
-    commands.write_timestamp(0);
     commands.upload(&memory);
-    binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
+    let mut pass = commands.compute_pass(0, 1);
+    binding.record(&mut pass, &code, n_clip as u32);
+    pass.end();
     commands.download(&memory);
-    commands.write_timestamp(1);
     runner.submit(commands);
     let dst = memory.map_read(..);
     if let Some(failure) = data.verify(&dst) {
diff --git a/tests/src/draw.rs b/tests/src/draw.rs
index 4372da4..dc82572 100644
--- a/tests/src/draw.rs
+++ b/tests/src/draw.rs
@@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
     let n_iter = config.n_iter;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding, n_tag);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);
diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs
index 5767806..e24adcb 100644
--- a/tests/src/linkedlist.rs
+++ b/tests/src/linkedlist.rs
@@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
     for i in 0..n_iter {
         let mut commands = runner.commands();
         // Might clear only buckets to save time.
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&mem_buf);
@@ -107,12 +105,14 @@ impl LinkedListStage {
         commands.cmd_buf.clear_buffer(out_buf, None);
         commands.cmd_buf.memory_barrier();
         let n_workgroups = N_BUCKETS / WG_SIZE;
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
+        pass.end();
     }
 }
 
diff --git a/tests/src/message_passing.rs b/tests/src/message_passing.rs
index c5d989b..39e71dc 100644
--- a/tests/src/message_passing.rs
+++ b/tests/src/message_passing.rs
@@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test(
     let mut failures = 0;
     for _ in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
-        commands.write_timestamp(1);
         commands.cmd_buf.memory_barrier();
         commands.download(&out_buf);
         total_elapsed += runner.submit(commands);
@@ -128,11 +126,13 @@ impl MessagePassingStage {
         commands.cmd_buf.clear_buffer(&self.data_buf, None);
         commands.cmd_buf.clear_buffer(out_buf, None);
         commands.cmd_buf.memory_barrier();
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (256, 1, 1),
             (256, 1, 1),
         );
+        pass.end();
     }
 }
diff --git a/tests/src/path.rs b/tests/src/path.rs
index bf72c68..9d794e1 100644
--- a/tests/src/path.rs
+++ b/tests/src/path.rs
@@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
         let mut commands = runner.commands();
         commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
         commands.cmd_buf.memory_barrier();
-        commands.write_timestamp(0);
+        let mut pass = commands.compute_pass(0, 1);
         stage.record(
-            &mut commands.cmd_buf,
+            &mut pass,
             &code,
             &binding,
             path_data.n_path,
             path_data.tags.len() as u32,
         );
-        commands.write_timestamp(1);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);
diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs
index 4174d8d..dbaf256 100644
--- a/tests/src/prefix.rs
+++ b/tests/src/prefix.rs
@@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test(
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -159,12 +157,14 @@ impl PrefixStage {
         let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
         commands.cmd_buf.clear_buffer(&self.state_buf, None);
         commands.cmd_buf.memory_barrier();
-        commands.cmd_buf.dispatch(
+        let mut pass = commands.compute_pass(0, 1);
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
+        pass.end();
         // One thing that's missing here is registering the buffers so
         // they can be safely dropped by Rust code before the execution
         // of the command buffer completes.
diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs
index 24be2af..3c9c813 100644
--- a/tests/src/prefix_tree.rs
+++ b/tests/src/prefix_tree.rs
@@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
         let mut commands = runner.commands();
         commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
         commands.cmd_buf.memory_barrier();
-        commands.write_timestamp(0);
         stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -175,33 +173,35 @@ impl PrefixTreeStage {
         code: &PrefixTreeCode,
         bindings: &PrefixTreeBinding,
     ) {
+        let mut pass = commands.compute_pass(0, 1);
         let n = self.tmp_bufs.len();
         for i in 0..n {
             let n_workgroups = self.sizes[i + 1];
-            commands.cmd_buf.dispatch(
+            pass.dispatch(
                 &code.reduce_pipeline,
                 &bindings.descriptor_sets[i],
                 (n_workgroups as u32, 1, 1),
                 (WG_SIZE as u32, 1, 1),
             );
-            commands.cmd_buf.memory_barrier();
+            pass.memory_barrier();
         }
-        commands.cmd_buf.dispatch(
+        pass.dispatch(
             &code.root_pipeline,
             &bindings.descriptor_sets[n],
             (1, 1, 1),
             (WG_SIZE as u32, 1, 1),
         );
         for i in (0..n).rev() {
-            commands.cmd_buf.memory_barrier();
+            pass.memory_barrier();
             let n_workgroups = self.sizes[i + 1];
-            commands.cmd_buf.dispatch(
+            pass.dispatch(
                 &code.scan_pipeline,
                 &bindings.descriptor_sets[2 * n - i],
                 (n_workgroups as u32, 1, 1),
                 (WG_SIZE as u32, 1, 1),
             );
         }
+        pass.end();
     }
 }
 
diff --git a/tests/src/transform.rs b/tests/src/transform.rs
index 6edcc3f..43bfc67 100644
--- a/tests/src/transform.rs
+++ b/tests/src/transform.rs
@@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult
     let n_iter = config.n_iter;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding, n_elements);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&memory);

From 867b97f41d3c66124d211c8018889b7af941461f Mon Sep 17 00:00:00 2001
From: chad <cbrokaw@gmail.com>
Date: Mon, 25 Apr 2022 02:34:19 -0400
Subject: [PATCH 09/12] Detect metal counter sampling style

Use MTLDevice::supports_counter_sampling() to select the appropriate counter style.
---
 piet-gpu-hal/src/metal.rs | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index b2189e4..307def8 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -55,7 +55,7 @@ pub struct MtlDevice {
 /// Type of counter sampling.
 ///
 /// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
 enum CounterStyle {
     None,
     Stage,
@@ -244,12 +244,18 @@ impl MtlDevice {
         // Timer stuff
         let timer_set = CounterSet::get_timer_counter_set(&device);
         let counter_style = if timer_set.is_some() {
-            // TODO: M1 is stage style, but should do proper runtime detection.
-            CounterStyle::Stage
+            if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
+                CounterStyle::Stage
+            } else if device
+                .supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
+            {
+                CounterStyle::Command
+            } else {
+                CounterStyle::None
+            }
         } else {
             CounterStyle::None
         };
-
         MtlDevice {
             device,
             cmd_queue: Arc::new(Mutex::new(cmd_queue)),

From 41316ccd4b83eb0672af2f037a8b5005749f3678 Mon Sep 17 00:00:00 2001
From: Chad Brokaw <cbrokaw@gmail.com>
Date: Wed, 4 May 2022 01:56:16 -0400
Subject: [PATCH 10/12] Add debug label functions to ComputePass

---
 piet-gpu-hal/src/hub.rs | 10 ++++++++++
 piet-gpu/src/lib.rs     | 19 ++++++-------------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 627b97b..5eafd0b 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -752,6 +752,16 @@ impl<'a> ComputePass<'a> {
         self.cmd_buf.memory_barrier();
     }
 
+    /// Begin a labeled section for debugging and profiling purposes.
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        self.cmd_buf.begin_debug_label(label);
+    }
+
+    /// End a section opened by `begin_debug_label`.
+    pub unsafe fn end_debug_label(&mut self) {
+        self.cmd_buf.end_debug_label();
+    }
+
     pub unsafe fn end(self) {
         self.cmd_buf.cmd_buf().end_compute_pass();
     }
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index acbd28d..3d28e4a 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -437,32 +437,30 @@ impl Renderer {
         );
         pass.end();
         cmd_buf.end_debug_label();
-        // cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
-        cmd_buf.begin_debug_label("Clip bounding box calculation");
         let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
+        pass.begin_debug_label("Clip bounding box calculation");
         self.clip_binding
             .record(&mut pass, &self.clip_code, self.n_clip as u32);
-        // cmd_buf.end_debug_label();
-        // cmd_buf.begin_debug_label("Element binning");
+        pass.end_debug_label();
+        pass.begin_debug_label("Element binning");
         pass.dispatch(
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        // cmd_buf.end_debug_label();
+        pass.end_debug_label();
         pass.memory_barrier();
-        // cmd_buf.begin_debug_label("Tile allocation");
+        pass.begin_debug_label("Tile allocation");
         pass.dispatch(
             &self.tile_pipeline,
             &self.tile_ds[buf_ix],
             (((self.n_paths + 255) / 256) as u32, 1, 1),
             (256, 1, 1),
         );
-        // cmd_buf.end_debug_label();
+        pass.end_debug_label();
         pass.end();
-        // cmd_buf.write_timestamp(&query_pool, 2);
         cmd_buf.begin_debug_label("Path flattening");
         cmd_buf.memory_barrier();
         let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
@@ -473,7 +471,6 @@ impl Renderer {
             (32, 1, 1),
         );
         pass.end();
-        // cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.end_debug_label();
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Backdrop propagation");
@@ -485,10 +482,8 @@ impl Renderer {
             (256, self.backdrop_y, 1),
         );
         pass.end();
-        // cmd_buf.write_timestamp(&query_pool, 4);
         cmd_buf.end_debug_label();
         // TODO: redo query accounting
-        // cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Coarse raster");
         let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
@@ -504,7 +499,6 @@ impl Renderer {
         );
         pass.end();
         cmd_buf.end_debug_label();
-        // cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
         cmd_buf.begin_debug_label("Fine raster");
         let mut pass =
@@ -521,7 +515,6 @@ impl Renderer {
         );
         pass.end();
         cmd_buf.end_debug_label();
-        // cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }

From 60bca997e5acd0a84dd8e9c771bab99540ded681 Mon Sep 17 00:00:00 2001
From: Chad Brokaw <cbrokaw@gmail.com>
Date: Wed, 4 May 2022 01:59:49 -0400
Subject: [PATCH 11/12] Remove deprecated functions

* remove CmdBuff::dispatch() which was moved to ComputePass

* remove CmdBuff::write_timestamp() which is replaced by timestamp index pair in ComputePassDescriptor
---
 piet-gpu-hal/src/hub.rs | 36 ------------------------------------
 tests/src/runner.rs     |  5 -----
 2 files changed, 41 deletions(-)

diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 5eafd0b..ea17754 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -491,31 +491,6 @@ impl CmdBuf {
         ComputePass { cmd_buf: self }
     }
 
-    /// Dispatch a compute shader.
-    ///
-    /// Request a compute shader to be run, using the pipeline to specify the
-    /// code, and the descriptor set to address the resources read and written.
-    ///
-    /// Both the workgroup count (number of workgroups) and the workgroup size
-    /// (number of threads in a workgroup) must be specified here, though not
-    /// all back-ends require the latter info.
-    ///
-    /// This version is deprecated because (a) you do not get timer queries and
-    /// (b) it doesn't aggregate multiple dispatches into a single compute
-    /// pass, which is a performance concern.
-    #[deprecated(note = "moving to ComputePass")]
-    pub unsafe fn dispatch(
-        &mut self,
-        pipeline: &Pipeline,
-        descriptor_set: &DescriptorSet,
-        workgroup_count: (u32, u32, u32),
-        workgroup_size: (u32, u32, u32),
-    ) {
-        let mut pass = self.begin_compute_pass(&Default::default());
-        pass.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
-        pass.end();
-    }
-
     /// Insert an execution and memory barrier.
     ///
     /// Compute kernels (and other actions) after this barrier may read from buffers
@@ -608,17 +583,6 @@ impl CmdBuf {
         self.cmd_buf().reset_query_pool(pool);
     }
 
-    /// Write a timestamp.
-    ///
-    /// The query index must be less than the size of the query pool on creation.
-    ///
-    /// Deprecation: for greater portability, set timestamp queries on compute
-    /// passes instead.
-    #[deprecated(note = "use compute pass descriptor instead")]
-    pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        self.cmd_buf().write_timestamp(pool, query);
-    }
-
     /// Prepare the timestamps for reading. This isn't required on Vulkan but
     /// is required on (at least) DX12.
     ///
diff --git a/tests/src/runner.rs b/tests/src/runner.rs
index f97e15a..3ba8223 100644
--- a/tests/src/runner.rs
+++ b/tests/src/runner.rs
@@ -118,11 +118,6 @@ impl Runner {
 }
 
 impl Commands {
-    #[deprecated(note = "use compute_pass instead")]
-    pub unsafe fn write_timestamp(&mut self, query: u32) {
-        self.cmd_buf.write_timestamp(&self.query_pool, query);
-    }
-
     /// Start a compute pass with timer queries.
     pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
         self.cmd_buf

From f2e8c32876a5b4194407531d71fc1ec423ca51b5 Mon Sep 17 00:00:00 2001
From: Chad Brokaw <cbrokaw@gmail.com>
Date: Wed, 4 May 2022 02:03:25 -0400
Subject: [PATCH 12/12] Erase remaining commented write_timestamp()

---
 piet-gpu/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 3d28e4a..aca6efd 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -425,7 +425,6 @@ impl Renderer {
         cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.begin_debug_label("Element bounding box calculation");
         let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
-        // cmd_buf.write_timestamp(&query_pool, 0);
         self.element_stage.record(
             &mut pass,
             &self.element_code,