rebase on timer query patch

Chad Brokaw 2022-05-09 22:39:59 -04:00
parent ba7f85731c
commit c749addf6c
45 changed files with 887 additions and 494 deletions

Cargo.lock generated
View file

@ -925,6 +925,7 @@ dependencies = [
"block",
"bytemuck",
"cocoa-foundation",
"foreign-types",
"metal",
"objc",
"raw-window-handle 0.3.4",

View file

@ -28,3 +28,4 @@ metal = "0.22"
objc = "0.2.5"
block = "0.1.6"
cocoa-foundation = "0.1"
foreign-types = "0.3.2"

View file

@ -1,4 +1,4 @@
use piet_gpu_hal::{include_shader, BindType};
use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
fn main() {
@ -20,9 +20,9 @@ fn main() {
let mut cmd_buf = session.cmd_buf().unwrap();
cmd_buf.begin();
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
cmd_buf.write_timestamp(&query_pool, 1);
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
pass.end();
cmd_buf.finish_timestamps(&query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();

View file

@ -17,7 +17,8 @@
//! The generic trait for backends to implement.
use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
MapMode, SamplerParams,
};
pub trait Device: Sized {
@ -159,14 +160,32 @@ pub trait Device: Sized {
unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
}
/// The trait implemented by backend command buffer implementations.
///
/// Valid encoding is represented by a state machine (currently not validated
/// but it is easy to imagine there might be at least debug validation). Most
/// methods are only valid in a particular state, and some move it to another
/// state.
pub trait CmdBuf<D: Device> {
/// Begin encoding.
///
/// State: init -> ready
unsafe fn begin(&mut self);
/// State: ready -> finished
unsafe fn finish(&mut self);
/// Return true if the command buffer is suitable for reuse.
unsafe fn reset(&mut self) -> bool;
/// Begin a compute pass.
///
/// State: ready -> in_compute_pass
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
/// Dispatch
///
/// State: in_compute_pass
unsafe fn dispatch(
&mut self,
pipeline: &D::Pipeline,
@ -175,6 +194,9 @@ pub trait CmdBuf<D: Device> {
workgroup_size: (u32, u32, u32),
);
/// State: in_compute_pass -> ready
unsafe fn end_compute_pass(&mut self);
/// Insert an execution and memory barrier.
///
/// Compute kernels (and other actions) after this barrier may read from buffers
@ -202,16 +224,16 @@ pub trait CmdBuf<D: Device> {
/// This is readily supported in Vulkan, but for portability it is remarkably
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
/// kernel, or organize the code not to need it.
unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
// low portability, dx12 doesn't support it natively
unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
/// Reset the query pool.
///
@ -227,7 +249,7 @@ pub trait CmdBuf<D: Device> {
unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
/// Begin a labeled section for debugging and profiling purposes.
unsafe fn begin_debug_label(&mut self, label: &str) {}
unsafe fn begin_debug_label(&mut self, _label: &str) {}
/// End a section opened by `begin_debug_label`.
unsafe fn end_debug_label(&mut self) {}
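The state machine documented on the trait maps directly onto the call order a backend sees. A minimal sketch, assuming `cb` is some concrete backend `CmdBuf` and that a pipeline, descriptor set, and pass descriptor already exist (the names here are illustrative, not part of this commit):

unsafe {
    cb.begin();                                   // init -> ready
    cb.begin_compute_pass(&desc);                 // ready -> in_compute_pass
    cb.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    cb.end_compute_pass();                        // in_compute_pass -> ready
    cb.finish();                                  // ready -> finished
}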

View file

@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use smallvec::SmallVec;
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
use self::{
descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@ -76,6 +76,7 @@ pub struct CmdBuf {
c: wrappers::GraphicsCommandList,
allocator: CommandAllocator,
needs_reset: bool,
end_query: Option<(wrappers::QueryHeap, u32)>,
}
pub struct Pipeline {
@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
c,
allocator,
needs_reset: false,
end_query: None,
})
}
}
@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
let ts0 = buf[0];
let tsp = (self.ts_freq as f64).recip();
let result = buf[1..]
let result = buf
.iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
.map(|ts| *ts as f64 * tsp)
.collect();
Ok(result)
}
@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Dx12(pool) = pool {
self.write_timestamp(pool, *start);
self.end_query = Some((pool.heap.clone(), *end));
}
}
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
.dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
}
unsafe fn end_compute_pass(&mut self) {
if let Some((heap, end)) = self.end_query.take() {
self.c.end_timing_query(&heap, end);
}
}
unsafe fn memory_barrier(&mut self) {
// See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
// The "proper" way to do this would be to name the actual buffers participating
@ -666,7 +683,7 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.memory_barrier();
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
let (gpu_ref, heap) = buffer
.gpu_ref
@ -684,23 +701,23 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
// TODO: consider using copy_resource here (if sizes match)
let size = src.size.min(dst.size);
self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
self.c
.copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
self.c
.copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
self.c.copy_resource(&src.resource, &dst.resource);
}

View file

@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
#[derive(Clone)]
pub struct ShaderByteCode {
pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
blob: Option<Blob>,
}
#[derive(Clone)]
@ -741,7 +740,6 @@ impl ShaderByteCode {
BytecodeLength: blob.0.GetBufferSize(),
pShaderBytecode: blob.0.GetBufferPointer(),
},
blob: Some(blob),
}
}
@ -810,7 +808,6 @@ impl ShaderByteCode {
BytecodeLength: bytecode.len(),
pShaderBytecode: bytecode.as_ptr() as *const _,
},
blob: None,
}
}
}

View file

@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak};
use bytemuck::Pod;
use smallvec::SmallVec;
use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> {
size: u64,
}
/// A sub-object of a command buffer for a sequence of compute dispatches.
pub struct ComputePass<'a> {
cmd_buf: &'a mut CmdBuf,
}
impl Session {
/// Create a new session, choosing the best backend.
pub fn new(device: mux::Device) -> Session {
@ -370,8 +375,17 @@ impl Session {
///
/// This should be called after waiting on the command buffer that wrote the
/// timer queries.
///
/// The returned vector is one shorter than the number of timer queries in the
/// pool; the first value is subtracted off. It would likely be better to return
/// the raw timestamps, but that change should be made consistently.
pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
self.0.device.fetch_query_pool(pool)
let result = self.0.device.fetch_query_pool(pool)?;
// Subtract off first timestamp.
Ok(result[1..]
.iter()
.map(|ts| *ts as f64 - result[0])
.collect())
}
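In other words, the hub subtracts the first raw timestamp before returning. A tiny illustration with made-up values (not part of the API): raw readings of 10.0, 10.5, and 11.25 seconds come back as 0.5 and 1.25.

// Hypothetical helper mirroring the subtraction above, for illustration only.
fn relative_to_first(raw: &[f64]) -> Vec<f64> {
    raw[1..].iter().map(|ts| ts - raw[0]).collect()
}
assert_eq!(relative_to_first(&[10.0, 10.5, 11.25]), vec![0.5, 1.25]);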
#[doc(hidden)]
@ -471,23 +485,10 @@ impl CmdBuf {
self.cmd_buf().finish();
}
/// Dispatch a compute shader.
///
/// Request a compute shader to be run, using the pipeline to specify the
/// code, and the descriptor set to address the resources read and written.
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
/// Begin a compute pass.
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
self.cmd_buf().begin_compute_pass(desc);
ComputePass { cmd_buf: self }
}
/// Insert an execution and memory barrier.
@ -582,13 +583,6 @@ impl CmdBuf {
self.cmd_buf().reset_query_pool(pool);
}
/// Write a timestamp.
///
/// The query index must be less than the size of the query pool on creation.
pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
self.cmd_buf().write_timestamp(pool, query);
}
/// Prepare the timestamps for reading. This isn't required on Vulkan but
/// is required on (at least) DX12.
///
@ -692,6 +686,51 @@ impl Drop for SubmittedCmdBuf {
}
}
impl<'a> ComputePass<'a> {
/// Dispatch a compute shader.
///
/// Request a compute shader to be run, using the pipeline to specify the
/// code, and the descriptor set to address the resources read and written.
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf
.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
}
/// Add a memory barrier.
///
/// Inserts a memory barrier in the compute encoder. This is a convenience
/// function for calling the same function on the underlying command buffer,
/// avoiding borrow check issues.
pub unsafe fn memory_barrier(&mut self) {
self.cmd_buf.memory_barrier();
}
/// Begin a labeled section for debugging and profiling purposes.
pub unsafe fn begin_debug_label(&mut self, label: &str) {
self.cmd_buf.begin_debug_label(label);
}
/// End a section opened by `begin_debug_label`.
pub unsafe fn end_debug_label(&mut self) {
self.cmd_buf.end_debug_label();
}
pub unsafe fn end(self) {
self.cmd_buf.cmd_buf().end_compute_pass();
}
}
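Putting the pieces together at the hub level, recording a timed pass follows the same shape as the example change at the top of this commit. A minimal sketch, assuming a `session`, `pipeline`, `descriptor_set`, and `query_pool` are already set up:

let mut cmd_buf = session.cmd_buf()?;
unsafe {
    cmd_buf.begin();
    cmd_buf.reset_query_pool(&query_pool);
    // Queries 0 and 1 bracket the pass.
    let mut pass =
        cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    pass.end();
    cmd_buf.finish_timestamps(&query_pool);
    cmd_buf.host_barrier();
    cmd_buf.finish();
}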
impl Drop for BufferInner {
fn drop(&mut self) {
if let Some(session) = Weak::upgrade(&self.session) {

View file

@ -21,8 +21,8 @@ pub use crate::mux::{
};
pub use bufwrite::BufWrite;
pub use hub::{
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
Session, SubmittedCmdBuf,
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
RetainResource, Session, SubmittedCmdBuf,
};
// TODO: because these are conditionally included, "cargo fmt" does not
@ -189,3 +189,23 @@ pub struct WorkgroupLimits {
/// dimension.
pub max_invocations: u32,
}
/// Options for creating a compute pass.
#[derive(Default)]
pub struct ComputePassDescriptor<'a> {
// Maybe label should go here? It does in wgpu and wgpu_hal.
/// Timer query parameters.
///
/// To record timer queries for a compute pass, set the query pool, start
/// query index, and end query index here. The indices must be less than
/// the size of the query pool.
timer_queries: Option<(&'a QueryPool, u32, u32)>,
}
impl<'a> ComputePassDescriptor<'a> {
pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
ComputePassDescriptor {
timer_queries: Some((pool, start_query, end_query)),
}
}
}
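Since the descriptor derives `Default`, a pass that records no timestamps can be opened with a default descriptor, while a timed pass supplies the pool and the two query indices. A short sketch (illustrative use, assuming a `query_pool` exists):

// Untimed pass:
let plain = ComputePassDescriptor::default();
// Timed pass: the start query is written when the pass begins,
// the end query when it ends.
let timed = ComputePassDescriptor::timer(&query_pool, 0, 1);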

View file

@ -15,25 +15,32 @@
// Also licensed under MIT license, at your choice.
mod clear;
mod timer;
mod util;
use std::mem;
use std::sync::{Arc, Mutex};
use block::Block;
use cocoa_foundation::base::id;
use cocoa_foundation::foundation::{NSInteger, NSUInteger};
use foreign_types::ForeignType;
use objc::rc::autoreleasepool;
use objc::runtime::{Object, BOOL, YES};
use objc::{class, msg_send, sel, sel_impl};
use metal::{CGFloat, MTLFeatureSet};
use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
use crate::{
BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
};
use util::*;
use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
pub struct MtlInstance;
pub struct MtlDevice {
@ -41,6 +48,18 @@ pub struct MtlDevice {
cmd_queue: Arc<Mutex<metal::CommandQueue>>,
gpu_info: GpuInfo,
helpers: Arc<Helpers>,
timer_set: Option<CounterSet>,
counter_style: CounterStyle,
}
/// Type of counter sampling.
///
/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum CounterStyle {
None,
Stage,
Command,
}
pub struct MtlSurface {
@ -81,9 +100,22 @@ pub struct Semaphore;
pub struct CmdBuf {
cmd_buf: metal::CommandBuffer,
helpers: Arc<Helpers>,
cur_encoder: Encoder,
time_calibration: Arc<Mutex<TimeCalibration>>,
counter_style: CounterStyle,
}
pub struct QueryPool;
enum Encoder {
None,
Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
Blit(metal::BlitCommandEncoder),
}
#[derive(Default)]
pub struct QueryPool {
counter_sample_buf: Option<CounterSampleBuffer>,
calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
}
pub struct Pipeline(metal::ComputePipelineState);
@ -209,18 +241,43 @@ impl MtlDevice {
let helpers = Arc::new(Helpers {
clear_pipeline: clear::make_clear_pipeline(&device),
});
// Timer stuff
let timer_set = CounterSet::get_timer_counter_set(&device);
let counter_style = if timer_set.is_some() {
if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
CounterStyle::Stage
} else if device
.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
{
CounterStyle::Command
} else {
CounterStyle::None
}
} else {
CounterStyle::None
};
MtlDevice {
device,
cmd_queue: Arc::new(Mutex::new(cmd_queue)),
gpu_info,
helpers,
timer_set,
counter_style,
}
}
pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
let cmd_buf = raw_cmd_buf;
let helpers = self.helpers.clone();
CmdBuf { cmd_buf, helpers }
let cur_encoder = Encoder::None;
let time_calibration = Default::default();
CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
}
}
pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@ -330,11 +387,35 @@ impl crate::backend::Device for MtlDevice {
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
let cmd_queue = self.cmd_queue.lock().unwrap();
// A discussion about autorelease pools.
//
// Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
// you can have any two of correctness, ergonomics, and performance. Here we've
// chosen the first two, using the pattern of a fine grained autorelease pool
// to give the Obj-C object Rust-like lifetime semantics whenever objects are
// created as autorelease (by convention, this is any object creation with an
// Obj-C method name that doesn't begin with "new" or "alloc").
//
// To gain back some of the performance, we'd need a way to wrap an autorelease
// pool around a larger chunk of work - that could be one frame of rendering,
// but for tests that iterate a number of command buffer submissions, it would
// need to wrap that whole loop.
//
// In any case, this way, the caller doesn't need to worry, and the performance
// hit might not be so bad (perhaps we should measure).
// consider new_command_buffer_with_unretained_references for performance
let cmd_buf = cmd_queue.new_command_buffer();
let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
let helpers = self.helpers.clone();
Ok(CmdBuf { cmd_buf, helpers })
let cur_encoder = Encoder::None;
let time_calibration = Default::default();
Ok(CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
})
}
unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
@ -342,12 +423,31 @@ impl crate::backend::Device for MtlDevice {
}
fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
// TODO
Ok(QueryPool)
if let Some(timer_set) = &self.timer_set {
let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
.ok_or("error creating timer query pool")?;
return Ok(QueryPool {
counter_sample_buf: Some(pool),
calibration: Default::default(),
});
}
Ok(QueryPool::default())
}
unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
// TODO
if let Some(raw) = &pool.counter_sample_buf {
let resolved = raw.resolve();
let calibration = pool.calibration.lock().unwrap();
if let Some(calibration) = &*calibration {
let calibration = calibration.lock().unwrap();
let result = resolved
.iter()
.map(|time_ns| calibration.correlate(*time_ns))
.collect();
return Ok(result);
}
}
// Maybe should return None indicating it wasn't successful? But that might break.
Ok(Vec::new())
}
@ -358,7 +458,37 @@ impl crate::backend::Device for MtlDevice {
_signal_semaphores: &[&Self::Semaphore],
fence: Option<&mut Self::Fence>,
) -> Result<(), Error> {
unsafe fn add_scheduled_handler(
cmd_buf: &metal::CommandBufferRef,
block: &Block<(&CommandBufferRef,), ()>,
) {
msg_send![cmd_buf, addScheduledHandler: block]
}
for cmd_buf in cmd_bufs {
let time_calibration = cmd_buf.time_calibration.clone();
let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
// TODO: only do this if supported.
let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
let time_calibration = cmd_buf.time_calibration.clone();
let completed_block =
block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
// TODO: only do this if supported.
let () =
msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
cmd_buf.cmd_buf.add_completed_handler(&completed_block);
cmd_buf.cmd_buf.commit();
}
if let Some(last_cmd_buf) = cmd_bufs.last() {
@ -439,12 +569,70 @@ impl crate::backend::Device for MtlDevice {
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
unsafe fn begin(&mut self) {}
unsafe fn finish(&mut self) {}
unsafe fn finish(&mut self) {
self.flush_encoder();
}
unsafe fn reset(&mut self) -> bool {
false
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
// TODO: we might want to get better about validation but the following
// assert is likely to trigger, and also a case can be made that
// validation should be done at the hub level, for consistency.
//debug_assert!(matches!(self.cur_encoder, Encoder::None));
self.flush_encoder();
autoreleasepool(|| {
let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
(Some(queries), CounterStyle::Stage) => {
let descriptor: id =
msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
let attachments: id = msg_send![descriptor, sampleBufferAttachments];
let index: NSUInteger = 0;
let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
// Here we break the hub/mux separation a bit, for expedience
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
}
}
let start_index = queries.1 as NSUInteger;
let end_index = queries.2 as NSInteger;
let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
(
msg_send![
self.cmd_buf,
computeCommandEncoderWithDescriptor: descriptor
],
None,
)
}
(Some(queries), CounterStyle::Command) => {
let encoder = self.cmd_buf.new_compute_command_encoder();
#[allow(irrefutable_let_patterns)]
let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let sample_index = queries.1 as NSUInteger;
let sample_buf = sample_buf.id();
let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
Some((sample_buf, queries.2))
} else {
None
}
} else {
None
};
(encoder, end_query)
}
_ => (self.cmd_buf.new_compute_command_encoder(), None),
};
self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
});
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -452,7 +640,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
let encoder = self.cmd_buf.new_compute_command_encoder();
let encoder = self.compute_command_encoder();
encoder.set_compute_pipeline_state(&pipeline.0);
let mut buf_ix = 0;
for buffer in &descriptor_set.buffers {
@ -475,7 +663,11 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
depth: workgroup_size.2 as u64,
};
encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
encoder.end_encoding();
}
unsafe fn end_compute_pass(&mut self) {
// TODO: might validate that we are in a compute encoder state
self.flush_encoder();
}
unsafe fn memory_barrier(&mut self) {
@ -494,22 +686,23 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
// I think these are being tracked.
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let size = size.unwrap_or(buffer.size);
let encoder = self.cmd_buf.new_compute_command_encoder();
clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size);
encoder.end_encoding()
let _ = self.compute_command_encoder();
// Getting this directly is a workaround for a borrow checker issue.
if let Encoder::Compute(e, _) = &self.cur_encoder {
clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
}
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let encoder = self.blit_command_encoder();
let size = src.size.min(dst.size);
encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
encoder.end_encoding();
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let encoder = self.blit_command_encoder();
assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
let bytes_per_row = (src.width * 4) as NSUInteger;
let src_size = metal::MTLSize {
@ -530,11 +723,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
bytes_per_row * src.height as NSUInteger,
metal::MTLBlitOption::empty(),
);
encoder.end_encoding();
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let encoder = self.blit_command_encoder();
assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
let bytes_per_row = (dst.width * 4) as NSUInteger;
let src_size = metal::MTLSize {
@ -555,11 +747,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
origin,
metal::MTLBlitOption::empty(),
);
encoder.end_encoding();
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let encoder = self.blit_command_encoder();
let src_size = metal::MTLSize {
width: src.width.min(dst.width) as NSUInteger,
height: src.width.min(dst.height) as NSUInteger,
@ -577,15 +768,79 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
0,
origin,
);
encoder.end_encoding();
}
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
let mut calibration = pool.calibration.lock().unwrap();
*calibration = Some(self.time_calibration.clone());
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
// TODO
// This is really a PITA because it's pretty different from Vulkan.
// See https://developer.apple.com/documentation/metal/counter_sampling
if let Some(buf) = &pool.counter_sample_buf {
if matches!(self.cur_encoder, Encoder::None) {
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
let sample_index = query as NSUInteger;
if self.counter_style == CounterStyle::Command {
match &self.cur_encoder {
Encoder::Compute(e, _) => {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
}
Encoder::None => unreachable!(),
_ => todo!(),
}
} else if self.counter_style == CounterStyle::Stage {
match &self.cur_encoder {
Encoder::Compute(_e, _) => {
println!("write_timestamp is not supported for stage-style encoders");
}
_ => (),
}
}
}
}
}
impl CmdBuf {
fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Compute(..)) {
self.flush_encoder();
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
if let Encoder::Compute(e, _) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Blit(_)) {
self.flush_encoder();
self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
}
if let Encoder::Blit(e) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn flush_encoder(&mut self) {
match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
Encoder::Compute(e, Some((sample_buf, end_query))) => {
let sample_index = end_query as NSUInteger;
unsafe {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
}
e.end_encoding();
}
Encoder::Compute(e, None) => e.end_encoding(),
Encoder::Blit(e) => e.end_encoding(),
Encoder::None => (),
}
}
}

View file

@ -0,0 +1,172 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Support for timer queries.
//!
//! Likely some of this should be upstreamed into metal-rs.
use std::{ffi::CStr, ptr::null_mut};
use cocoa_foundation::{
base::id,
foundation::{NSRange, NSUInteger},
};
use metal::{DeviceRef, MTLStorageMode};
use objc::{class, msg_send, sel, sel_impl};
pub struct CounterSampleBuffer {
id: id,
count: u64,
}
pub struct CounterSet {
id: id,
}
#[derive(Default)]
pub struct TimeCalibration {
pub cpu_start_ts: u64,
pub gpu_start_ts: u64,
pub cpu_end_ts: u64,
pub gpu_end_ts: u64,
}
impl Drop for CounterSampleBuffer {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl Clone for CounterSampleBuffer {
fn clone(&self) -> CounterSampleBuffer {
unsafe {
CounterSampleBuffer {
id: msg_send![self.id, retain],
count: self.count,
}
}
}
}
impl CounterSampleBuffer {
pub fn id(&self) -> id {
self.id
}
}
impl Drop for CounterSet {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl CounterSet {
pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
unsafe {
// TODO: version check
let sets: id = msg_send!(device, counterSets);
let count: NSUInteger = msg_send![sets, count];
for i in 0..count {
let set: id = msg_send![sets, objectAtIndex: i];
let name: id = msg_send![set, name];
let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
if name_cstr.to_bytes() == b"timestamp" {
return Some(CounterSet { id: set });
}
}
None
}
}
}
// copied from metal-rs; should be in common utilities maybe?
fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
let bytes = unsafe {
let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
bytes as *const u8
};
let len: NSUInteger = unsafe { msg_send![nsstr, length] };
unsafe {
let bytes = std::slice::from_raw_parts(bytes, len as usize);
std::str::from_utf8(bytes).unwrap()
}
}
impl CounterSampleBuffer {
pub fn new(
device: &DeviceRef,
count: u64,
counter_set: &CounterSet,
) -> Option<CounterSampleBuffer> {
unsafe {
let desc_cls = class!(MTLCounterSampleBufferDescriptor);
let descriptor: id = msg_send![desc_cls, alloc];
let _: id = msg_send![descriptor, init];
let count = count as NSUInteger;
let () = msg_send![descriptor, setSampleCount: count];
let () = msg_send![descriptor, setCounterSet: counter_set.id];
let () = msg_send![
descriptor,
setStorageMode: MTLStorageMode::Shared as NSUInteger
];
let mut error: id = null_mut();
let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
let () = msg_send![descriptor, release];
if !error.is_null() {
let description = msg_send![error, localizedDescription];
println!(
"error allocating sample buffer, code = {}",
nsstring_as_str(description)
);
let () = msg_send![error, release];
return None;
}
Some(CounterSampleBuffer { id: buf, count })
}
}
// Read the timestamps.
//
// Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
pub unsafe fn resolve(&self) -> &[u64] {
let range = NSRange::new(0, self.count);
let data: id = msg_send![self.id, resolveCounterRange: range];
if data.is_null() {
&[]
} else {
let bytes: *const u64 = msg_send![data, bytes];
std::slice::from_raw_parts(bytes, self.count as usize)
}
}
}
impl TimeCalibration {
/// Convert GPU timestamp into CPU time base.
///
/// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
pub fn correlate(&self, raw_ts: u64) -> f64 {
let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
let adj_ts = if delta_gpu > 0 {
let scale = delta_cpu as f64 / delta_gpu as f64;
self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
} else {
// Default is ns on Apple Silicon; on other hardware this will be wrong
raw_ts as f64
};
adj_ts * 1e-9
}
}
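A small worked example of the interpolation, with made-up calibration values: here the GPU counter ticks twice per nanosecond of CPU time, so a raw GPU timestamp halfway through the calibration span maps to the midpoint of the CPU span and is then scaled from nanoseconds to seconds.

let cal = TimeCalibration {
    cpu_start_ts: 1_000_000,
    gpu_start_ts: 0,
    cpu_end_ts: 2_000_000,
    gpu_end_ts: 2_000_000,
};
// 1_000_000 GPU ticks -> 1_500_000 ns of CPU time -> 1.5e-3 s
assert!((cal.correlate(1_000_000) - 1.5e-3).abs() < 1e-12);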

View file

@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::backend::Device as DeviceTrait;
use crate::BackendType;
use crate::BindType;
use crate::ComputePassDescriptor;
use crate::ImageFormat;
use crate::MapMode;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@ -658,6 +659,14 @@ impl CmdBuf {
}
}
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
mux_match! { self;
CmdBuf::Vk(c) => c.begin_compute_pass(desc),
CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
}
}
/// Dispatch a compute shader.
///
/// Note that both the number of workgroups (`workgroup_count`) and the number of
@ -680,6 +689,14 @@ impl CmdBuf {
}
}
pub unsafe fn end_compute_pass(&mut self) {
mux_match! { self;
CmdBuf::Vk(c) => c.end_compute_pass(),
CmdBuf::Dx12(c) => c.end_compute_pass(),
CmdBuf::Mtl(c) => c.end_compute_pass(),
}
}
pub unsafe fn memory_barrier(&mut self) {
mux_match! { self;
CmdBuf::Vk(c) => c.memory_barrier(),

View file

@ -15,7 +15,7 @@ use smallvec::SmallVec;
use crate::backend::Device as DeviceTrait;
use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
WorkgroupLimits,
WorkgroupLimits, ComputePassDescriptor,
};
pub struct VkInstance {
@ -92,6 +92,7 @@ pub struct CmdBuf {
cmd_buf: vk::CommandBuffer,
cmd_pool: vk::CommandPool,
device: Arc<RawDevice>,
end_query: Option<(vk::QueryPool, u32)>,
}
pub struct QueryPool {
@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
cmd_buf,
cmd_pool,
device: self.device.clone(),
end_query: None,
})
}
}
@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
// results (Windows 10, AMD 5700 XT).
let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
let ts0 = buf[0];
let tsp = self.timestamp_period as f64 * 1e-9;
let result = buf[1..]
let result = buf
.iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
.map(|ts| *ts as f64 * tsp)
.collect();
Ok(result)
}
@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
true
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Vk(pool) = pool {
self.write_timestamp_raw(pool.pool, *start);
self.end_query = Some((pool.pool, *end));
}
}
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -931,6 +942,12 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn end_compute_pass(&mut self) {
if let Some((pool, end)) = self.end_query.take() {
self.write_timestamp_raw(pool, end);
}
}
/// Insert a pipeline barrier for all memory accesses.
unsafe fn memory_barrier(&mut self) {
let device = &self.device.device;
@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device;
let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let device = &self.device.device;
let size = src.size.min(dst.size);
device.cmd_copy_buffer(
@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let device = &self.device.device;
device.cmd_copy_image_to_buffer(
self.cmd_buf,
@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let device = &self.device.device;
device.cmd_copy_buffer_to_image(
self.cmd_buf,
@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let device = &self.device.device;
device.cmd_blit_image(
self.cmd_buf,
@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool.pool,
query,
);
self.write_timestamp_raw(pool.pool, query);
}
unsafe fn begin_debug_label(&mut self, label: &str) {
@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
}
}
impl CmdBuf {
unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool,
query,
);
}
}
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.buffers.extend(buffers.iter().map(|b| b.buffer));

View file

@ -6,7 +6,7 @@ use clap::{App, Arg};
use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
const WIDTH: usize = 2048;
const HEIGHT: usize = 1536;
@ -243,7 +243,11 @@ fn main() -> Result<(), Error> {
if matches.is_present("flip") {
scale = -scale;
}
test_scenes::render_svg(&mut ctx, input, scale);
let xml_str = std::fs::read_to_string(input).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
test_scenes::render_svg(&mut ctx, &svg);
} else {
test_scenes::render_scene(&mut ctx);
}

View file

@ -2,7 +2,7 @@ use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
use clap::{App, Arg};
@ -29,6 +29,25 @@ fn main() -> Result<(), Error> {
)
.get_matches();
// Collect SVG if input
let svg = match matches.value_of("INPUT") {
Some(file) => {
let mut scale = matches
.value_of("scale")
.map(|scale| scale.parse().unwrap())
.unwrap_or(8.0);
if matches.is_present("flip") {
scale = -scale;
}
let xml_str = std::fs::read_to_string(file).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
Some(svg)
}
None => None,
};
let event_loop = EventLoop::new();
let window = WindowBuilder::new()
.with_inner_size(winit::dpi::LogicalSize {
@ -51,7 +70,7 @@ fn main() -> Result<(), Error> {
.map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(8))
.map(|_| session.create_query_pool(12))
.collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@ -93,29 +112,23 @@ fn main() -> Result<(), Error> {
if !ts.is_empty() {
info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[6] * 1e3,
ts[10] * 1e3,
ts[0] * 1e3,
(ts[1] - ts[0]) * 1e3,
(ts[2] - ts[1]) * 1e3,
(ts[3] - ts[2]) * 1e3,
(ts[4] - ts[3]) * 1e3,
(ts[5] - ts[4]) * 1e3,
(ts[6] - ts[5]) * 1e3,
(ts[8] - ts[7]) * 1e3,
(ts[10] - ts[9]) * 1e3,
);
}
}
let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") {
let mut scale = matches
.value_of("scale")
.map(|scale| scale.parse().unwrap())
.unwrap_or(8.0);
if matches.is_present("flip") {
scale = -scale;
}
test_scenes::render_svg(&mut ctx, input, scale);
} else {
let test_blend = false;
if let Some(svg) = &svg {
test_scenes::render_svg(&mut ctx, svg);
} else if test_blend {
use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
let blends = [
Blend::new(Normal, SrcOver),
@ -151,6 +164,8 @@ fn main() -> Result<(), Error> {
let blend = blends[mode % blends.len()];
test_scenes::render_blend_test(&mut ctx, current_frame, blend);
info_string = format!("{:?}", blend);
} else {
test_scenes::render_anim_frame(&mut ctx, current_frame);
}
render_info_string(&mut ctx, &info_string);
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {

View file

@ -306,7 +306,7 @@ void main() {
is_blend = (blend != BlendComp_default);
}
include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
|| (is_clip && is_blend);
|| is_blend;
}
if (include_tile) {
uint el_slice = el_ix / 32;

Binary file not shown.

Binary file not shown.

View file

@ -931,23 +931,14 @@ void comp_main()
{
_1701 = _1692;
}
bool _1708;
if (!_1701)
{
_1708 = is_clip && is_blend;
}
else
{
_1708 = _1701;
}
include_tile = _1708;
include_tile = _1701 || is_blend;
}
if (include_tile)
{
uint el_slice = el_ix / 32u;
uint el_mask = 1u << (el_ix & 31u);
uint _1728;
InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1728);
uint _1723;
InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
}
}
GroupMemoryBarrierWithGroupSync();
@ -976,9 +967,9 @@ void comp_main()
{
uint param_25 = element_ref_ix;
bool param_26 = mem_ok;
TileRef _1805 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
Alloc param_27 = read_tile_alloc(param_25, param_26);
TileRef param_28 = _1805;
TileRef param_28 = _1800;
Tile tile_1 = Tile_read(param_27, param_28);
uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@ -993,11 +984,11 @@ void comp_main()
Alloc param_29 = cmd_alloc;
CmdRef param_30 = cmd_ref;
uint param_31 = cmd_limit;
bool _1853 = alloc_cmd(param_29, param_30, param_31);
bool _1848 = alloc_cmd(param_29, param_30, param_31);
cmd_alloc = param_29;
cmd_ref = param_30;
cmd_limit = param_31;
if (!_1853)
if (!_1848)
{
break;
}
@ -1008,10 +999,10 @@ void comp_main()
write_fill(param_32, param_33, param_34, param_35);
cmd_ref = param_33;
uint rgba = _1372.Load(dd_1 * 4 + 0);
CmdColor _1876 = { rgba };
CmdColor _1871 = { rgba };
Alloc param_36 = cmd_alloc;
CmdRef param_37 = cmd_ref;
CmdColor param_38 = _1876;
CmdColor param_38 = _1871;
Cmd_Color_write(param_36, param_37, param_38);
cmd_ref.offset += 8u;
break;
@ -1021,11 +1012,11 @@ void comp_main()
Alloc param_39 = cmd_alloc;
CmdRef param_40 = cmd_ref;
uint param_41 = cmd_limit;
bool _1894 = alloc_cmd(param_39, param_40, param_41);
bool _1889 = alloc_cmd(param_39, param_40, param_41);
cmd_alloc = param_39;
cmd_ref = param_40;
cmd_limit = param_41;
if (!_1894)
if (!_1889)
{
break;
}
@ -1052,11 +1043,11 @@ void comp_main()
Alloc param_49 = cmd_alloc;
CmdRef param_50 = cmd_ref;
uint param_51 = cmd_limit;
bool _1958 = alloc_cmd(param_49, param_50, param_51);
bool _1953 = alloc_cmd(param_49, param_50, param_51);
cmd_alloc = param_49;
cmd_ref = param_50;
cmd_limit = param_51;
if (!_1958)
if (!_1953)
{
break;
}
@ -1086,11 +1077,11 @@ void comp_main()
Alloc param_59 = cmd_alloc;
CmdRef param_60 = cmd_ref;
uint param_61 = cmd_limit;
bool _2064 = alloc_cmd(param_59, param_60, param_61);
bool _2059 = alloc_cmd(param_59, param_60, param_61);
cmd_alloc = param_59;
cmd_ref = param_60;
cmd_limit = param_61;
if (!_2064)
if (!_2059)
{
break;
}
@ -1103,27 +1094,27 @@ void comp_main()
uint index = _1372.Load(dd_1 * 4 + 0);
uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
CmdImage _2103 = { index, offset_1 };
CmdImage _2098 = { index, offset_1 };
Alloc param_66 = cmd_alloc;
CmdRef param_67 = cmd_ref;
CmdImage param_68 = _2103;
CmdImage param_68 = _2098;
Cmd_Image_write(param_66, param_67, param_68);
cmd_ref.offset += 12u;
break;
}
case 5u:
{
bool _2117 = tile_1.tile.offset == 0u;
bool _2123;
if (_2117)
bool _2112 = tile_1.tile.offset == 0u;
bool _2118;
if (_2112)
{
_2123 = tile_1.backdrop == 0;
_2118 = tile_1.backdrop == 0;
}
else
{
_2123 = _2117;
_2118 = _2112;
}
if (_2123)
if (_2118)
{
clip_zero_depth = clip_depth + 1u;
}
@ -1132,11 +1123,11 @@ void comp_main()
Alloc param_69 = cmd_alloc;
CmdRef param_70 = cmd_ref;
uint param_71 = cmd_limit;
bool _2135 = alloc_cmd(param_69, param_70, param_71);
bool _2130 = alloc_cmd(param_69, param_70, param_71);
cmd_alloc = param_69;
cmd_ref = param_70;
cmd_limit = param_71;
if (!_2135)
if (!_2130)
{
break;
}
@ -1154,11 +1145,11 @@ void comp_main()
Alloc param_74 = cmd_alloc;
CmdRef param_75 = cmd_ref;
uint param_76 = cmd_limit;
bool _2163 = alloc_cmd(param_74, param_75, param_76);
bool _2158 = alloc_cmd(param_74, param_75, param_76);
cmd_alloc = param_74;
cmd_ref = param_75;
cmd_limit = param_76;
if (!_2163)
if (!_2158)
{
break;
}
@ -1169,10 +1160,10 @@ void comp_main()
write_fill(param_77, param_78, param_79, param_80);
cmd_ref = param_78;
uint blend_1 = _1372.Load(dd_1 * 4 + 0);
CmdEndClip _2186 = { blend_1 };
CmdEndClip _2181 = { blend_1 };
Alloc param_81 = cmd_alloc;
CmdRef param_82 = cmd_ref;
CmdEndClip param_83 = _2186;
CmdEndClip param_83 = _2181;
Cmd_EndClip_write(param_81, param_82, param_83);
cmd_ref.offset += 8u;
break;
@ -1207,17 +1198,17 @@ void comp_main()
break;
}
}
bool _2233 = (bin_tile_x + tile_x) < _1005.Load(8);
bool _2242;
if (_2233)
bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
bool _2237;
if (_2228)
{
_2242 = (bin_tile_y + tile_y) < _1005.Load(12);
_2237 = (bin_tile_y + tile_y) < _1005.Load(12);
}
else
{
_2242 = _2233;
_2237 = _2228;
}
if (_2242)
if (_2237)
{
Alloc param_84 = cmd_alloc;
CmdRef param_85 = cmd_ref;

View file

@ -954,22 +954,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
{
_1701 = _1692;
}
bool _1708;
if (!_1701)
{
_1708 = is_clip && is_blend;
}
else
{
_1708 = _1701;
}
include_tile = _1708;
include_tile = _1701 || is_blend;
}
if (include_tile)
{
uint el_slice = el_ix / 32u;
uint el_mask = 1u << (el_ix & 31u);
uint _1728 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
@ -1014,11 +1005,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_29 = cmd_alloc;
CmdRef param_30 = cmd_ref;
uint param_31 = cmd_limit;
bool _1853 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
cmd_alloc = param_29;
cmd_ref = param_30;
cmd_limit = param_31;
if (!_1853)
if (!_1848)
{
break;
}
@ -1041,11 +1032,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_39 = cmd_alloc;
CmdRef param_40 = cmd_ref;
uint param_41 = cmd_limit;
bool _1894 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
cmd_alloc = param_39;
cmd_ref = param_40;
cmd_limit = param_41;
if (!_1894)
if (!_1889)
{
break;
}
@ -1072,11 +1063,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_49 = cmd_alloc;
CmdRef param_50 = cmd_ref;
uint param_51 = cmd_limit;
bool _1958 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
cmd_alloc = param_49;
cmd_ref = param_50;
cmd_limit = param_51;
if (!_1958)
if (!_1953)
{
break;
}
@ -1106,11 +1097,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_59 = cmd_alloc;
CmdRef param_60 = cmd_ref;
uint param_61 = cmd_limit;
bool _2064 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
cmd_alloc = param_59;
cmd_ref = param_60;
cmd_limit = param_61;
if (!_2064)
if (!_2059)
{
break;
}
@ -1132,17 +1123,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
}
case 5u:
{
bool _2117 = tile_1.tile.offset == 0u;
bool _2123;
if (_2117)
bool _2112 = tile_1.tile.offset == 0u;
bool _2118;
if (_2112)
{
_2123 = tile_1.backdrop == 0;
_2118 = tile_1.backdrop == 0;
}
else
{
_2123 = _2117;
_2118 = _2112;
}
if (_2123)
if (_2118)
{
clip_zero_depth = clip_depth + 1u;
}
@ -1151,11 +1142,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_69 = cmd_alloc;
CmdRef param_70 = cmd_ref;
uint param_71 = cmd_limit;
bool _2135 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
cmd_alloc = param_69;
cmd_ref = param_70;
cmd_limit = param_71;
if (!_2135)
if (!_2130)
{
break;
}
@ -1173,11 +1164,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_74 = cmd_alloc;
CmdRef param_75 = cmd_ref;
uint param_76 = cmd_limit;
bool _2163 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
cmd_alloc = param_74;
cmd_ref = param_75;
cmd_limit = param_76;
if (!_2163)
if (!_2158)
{
break;
}
@ -1225,17 +1216,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
break;
}
}
bool _2233 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
bool _2242;
if (_2233)
bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
bool _2237;
if (_2228)
{
_2242 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
_2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
}
else
{
_2242 = _2233;
_2237 = _2228;
}
if (_2242)
if (_2237)
{
Alloc param_84 = cmd_alloc;
CmdRef param_85 = cmd_ref;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -37,147 +37,6 @@ pub struct Encoder {
n_clip: u32,
}
#[derive(Copy, Clone, Debug)]
pub struct EncodedSceneRef<'a, T: Copy + Pod> {
pub transform_stream: &'a [T],
pub tag_stream: &'a [u8],
pub pathseg_stream: &'a [u8],
pub linewidth_stream: &'a [f32],
pub drawtag_stream: &'a [u32],
pub drawdata_stream: &'a [u8],
pub n_path: u32,
pub n_pathseg: u32,
pub n_clip: u32,
pub ramp_data: &'a [u32],
}
impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
/// Return a config for the element processing pipeline.
///
/// This does not include further pipeline processing. Also returns the
/// beginning of free memory.
pub fn stage_config(&self) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj();
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
}
pub fn write_scene(&self, buf: &mut BufWrite) {
buf.extend_slice(&self.drawtag_stream);
let n_drawobj = self.drawtag_stream.len();
buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE);
buf.extend_slice(&self.drawdata_stream);
buf.extend_slice(&self.transform_stream);
let n_trans = self.transform_stream.len();
buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE);
buf.extend_slice(&self.linewidth_stream);
buf.extend_slice(&self.tag_stream);
let n_pathtag = self.tag_stream.len();
buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
buf.extend_slice(&self.pathseg_stream);
}
/// The number of draw objects in the draw object stream.
pub(crate) fn n_drawobj(&self) -> usize {
self.drawtag_stream.len()
}
/// The number of paths.
pub(crate) fn n_path(&self) -> u32 {
self.n_path
}
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
}
}
/// A scene fragment encoding a glyph.
///
/// This is a reduced version of the full encoder.
@ -471,21 +330,6 @@ impl Encoder {
self.n_path += glyph.n_path;
self.n_pathseg += glyph.n_pathseg;
}
pub(crate) fn scene_ref(&self) -> EncodedSceneRef<stages::Transform> {
EncodedSceneRef {
transform_stream: &self.transform_stream,
tag_stream: &self.tag_stream,
pathseg_stream: &self.pathseg_stream,
linewidth_stream: &self.linewidth_stream,
drawtag_stream: &self.drawtag_stream,
drawdata_stream: &self.drawdata_stream,
n_path: self.n_path,
n_pathseg: self.n_pathseg,
n_clip: self.n_clip,
ramp_data: &[],
}
}
}
fn align_up(x: usize, align: usize) -> usize {

View file

@ -10,10 +10,7 @@ mod text;
use std::convert::TryInto;
use bytemuck::Pod;
pub use blend::{Blend, BlendMode, CompositionMode};
pub use encoder::EncodedSceneRef;
pub use render_ctx::PietGpuRenderContext;
pub use gradient::Colrv1RadialGradient;
@@ -21,11 +18,11 @@ use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext};
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image,
ImageLayout, Pipeline, QueryPool, Session,
include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
Error, Image, ImageLayout, Pipeline, QueryPool, Session,
};
use pico_svg::PicoSvg;
pub use pico_svg::PicoSvg;
use stages::{ClipBinding, ElementBinding, ElementCode};
use crate::stages::{ClipCode, Config, ElementStage};
@@ -358,27 +355,16 @@ impl Renderer {
render_ctx: &mut PietGpuRenderContext,
buf_ix: usize,
) -> Result<(), Error> {
let mut scene = render_ctx.encoded_scene();
let ramp_data = render_ctx.get_ramp_data();
scene.ramp_data = &ramp_data;
self.upload_scene(&scene, buf_ix)
}
pub fn upload_scene<T: Copy + Pod>(
&mut self,
scene: &EncodedSceneRef<T>,
buf_ix: usize,
) -> Result<(), Error> {
let (mut config, mut alloc) = scene.stage_config();
let n_drawobj = scene.n_drawobj();
let (mut config, mut alloc) = render_ctx.stage_config();
let n_drawobj = render_ctx.n_drawobj();
// TODO: be more consistent in size types
let n_path = scene.n_path() as usize;
let n_path = render_ctx.n_path() as usize;
self.n_paths = n_path;
self.n_transform = scene.n_transform();
self.n_drawobj = scene.n_drawobj();
self.n_pathseg = scene.n_pathseg() as usize;
self.n_pathtag = scene.n_pathtag();
self.n_clip = scene.n_clip();
self.n_transform = render_ctx.n_transform();
self.n_drawobj = render_ctx.n_drawobj();
self.n_pathseg = render_ctx.n_pathseg() as usize;
self.n_pathtag = render_ctx.n_pathtag();
self.n_clip = render_ctx.n_clip();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
@@ -402,18 +388,19 @@ impl Renderer {
// TODO: reallocate scene buffer if size is inadequate
{
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
scene.write_scene(&mut mapped_scene);
render_ctx.write_scene(&mut mapped_scene);
}
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data.
if !scene.ramp_data.is_empty() {
let ramp_data = render_ctx.get_ramp_data();
if !ramp_data.is_empty() {
assert!(
self.gradient_bufs[buf_ix].size() as usize
>= std::mem::size_of_val(&*scene.ramp_data)
>= std::mem::size_of_val(&*ramp_data)
);
self.gradient_bufs[buf_ix].write(scene.ramp_data)?;
self.gradient_bufs[buf_ix].write(&ramp_data)?;
}
}
Ok(())
@@ -437,10 +424,10 @@ impl Renderer {
cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.begin_debug_label("Element bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
self.element_stage.record(
cmd_buf,
&mut pass,
&self.element_code,
&self.element_bindings[buf_ix],
self.n_transform as u64,
@@ -448,56 +435,59 @@ impl Renderer {
self.n_pathtag as u32,
self.n_drawobj as u64,
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Clip bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
pass.begin_debug_label("Clip bounding box calculation");
self.clip_binding
.record(cmd_buf, &self.clip_code, self.n_clip as u32);
cmd_buf.end_debug_label();
cmd_buf.begin_debug_label("Element binning");
cmd_buf.dispatch(
.record(&mut pass, &self.clip_code, self.n_clip as u32);
pass.end_debug_label();
pass.begin_debug_label("Element binning");
pass.dispatch(
&self.bin_pipeline,
&self.bin_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1),
);
cmd_buf.end_debug_label();
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Tile allocation");
cmd_buf.dispatch(
pass.end_debug_label();
pass.memory_barrier();
pass.begin_debug_label("Tile allocation");
pass.dispatch(
&self.tile_pipeline,
&self.tile_ds[buf_ix],
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1),
);
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
pass.end_debug_label();
pass.end();
cmd_buf.begin_debug_label("Path flattening");
cmd_buf.dispatch(
cmd_buf.memory_barrier();
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
pass.dispatch(
&self.path_pipeline,
&self.path_ds,
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
(32, 1, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Backdrop propagation");
cmd_buf.dispatch(
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
pass.dispatch(
&self.backdrop_pipeline,
&self.backdrop_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, self.backdrop_y, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 4);
// TODO: redo query accounting
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Coarse raster");
cmd_buf.dispatch(
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
pass.dispatch(
&self.coarse_pipeline,
&self.coarse_ds[buf_ix],
(
@@ -507,11 +497,13 @@ impl Renderer {
),
(256, 1, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Fine raster");
cmd_buf.dispatch(
let mut pass =
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
pass.dispatch(
&self.k4_pipeline,
&self.k4_ds,
(
@@ -521,8 +513,8 @@ impl Renderer {
),
(8, 4, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 7);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
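// Illustrative sketch, not part of this patch, condensing the pattern introduced
// above: each stage now runs in its own compute pass, and the timer queries are
// supplied through the pass descriptor instead of explicit write_timestamp calls.
// The pipelines, descriptor sets, and workgroup counts below are placeholders,
// not the renderer's real fields.
unsafe fn record_two_timed_stages(
    cmd_buf: &mut CmdBuf,
    query_pool: &QueryPool,
    first: (&Pipeline, &DescriptorSet),
    second: (&Pipeline, &DescriptorSet),
    n_wg: u32,
) {
    // First stage: queries 0 and 1 bracket the whole pass.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(query_pool, 0, 1));
    pass.dispatch(first.0, first.1, (n_wg, 1, 1), (256, 1, 1));
    pass.end();
    // Make the first stage's writes visible to the second.
    cmd_buf.memory_barrier();
    // Second stage: queries 2 and 3.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(query_pool, 2, 3));
    pass.dispatch(second.0, second.1, (n_wg, 1, 1), (256, 1, 1));
    pass.end();
}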

View file

@@ -1,6 +1,6 @@
use std::borrow::Cow;
use crate::encoder::{EncodedSceneRef, GlyphEncoder};
use crate::encoder::GlyphEncoder;
use crate::stages::{Config, Transform};
use crate::MAX_BLEND_STACK;
use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
@@ -97,10 +97,6 @@ impl PietGpuRenderContext {
self.new_encoder.stage_config()
}
pub fn encoded_scene(&self) -> EncodedSceneRef<crate::stages::Transform> {
self.new_encoder.scene_ref()
}
/// Number of draw objects.
///
/// This is for the new element processing pipeline. It's not necessarily the

View file

@@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable};
pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
use piet_gpu_hal::{Buffer, CmdBuf, Session};
use piet_gpu_hal::{Buffer, ComputePass, Session};
pub use transform::{
Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
};
@@ -140,7 +140,7 @@ impl ElementStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &ElementCode,
binding: &ElementBinding,
n_transform: u64,
@@ -149,14 +149,14 @@
n_drawobj: u64,
) {
self.transform_stage.record(
cmd_buf,
pass,
&code.transform_code,
&binding.transform_binding,
n_transform,
);
// No memory barrier needed here; path has at least one before pathseg
self.path_stage.record(
cmd_buf,
pass,
&code.path_code,
&binding.path_binding,
n_paths,
@@ -164,6 +164,6 @@
);
// No memory barrier needed here; draw has at least one before draw_leaf
self.draw_stage
.record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
.record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
}
}

View file

@@ -16,7 +16,7 @@
//! The clip processing stage (includes substages).
use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
// Note that this isn't the code/stage/binding pattern of most of the other stages
// in the new element processing pipeline. We want to move those temporary buffers
@@ -69,26 +69,26 @@ impl ClipBinding {
/// Record the clip dispatches.
///
/// Assumes memory barrier on entry. Provides memory barrier on exit.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
if n_wg_reduce > 0 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&self.reduce_ds,
(n_wg_reduce, 1, 1),
(CLIP_PART_SIZE, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
if n_wg > 0 {
cmd_buf.dispatch(
pass.dispatch(
&code.leaf_pipeline,
&self.leaf_ds,
(n_wg, 1, 1),
(CLIP_PART_SIZE, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
}
}
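// Illustrative sketch, not part of this patch, of the caller's side of the barrier
// contract documented above: a memory barrier is issued before the pass begins (not
// shown), and because record() ends with its own barrier, a following dispatch in
// the same pass can read the clip results directly. The consumer pipeline,
// descriptor set, and workgroup count are placeholders.
unsafe fn clip_then_consume(
    pass: &mut ComputePass,
    binding: &ClipBinding,
    code: &ClipCode,
    n_clip: u32,
    consumer_pipeline: &Pipeline,
    consumer_ds: &DescriptorSet,
    n_wg: u32,
) {
    binding.record(pass, code, n_clip);
    // No extra barrier needed here: record() provides one on exit.
    pass.dispatch(consumer_pipeline, consumer_ds, (n_wg, 1, 1), (256, 1, 1));
}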

View file

@@ -19,7 +19,7 @@
use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
/// The output element of the draw object stage.
@@ -130,7 +130,7 @@ impl DrawStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &DrawCode,
binding: &DrawBinding,
size: u64,
@@ -140,22 +140,22 @@
}
let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
if n_workgroups > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_workgroups as u32, 1, 1),
(DRAW_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.root_pipeline,
&self.root_ds,
(1, 1, 1),
(DRAW_WG as u32, 1, 1),
);
}
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.leaf_pipeline,
&binding.leaf_ds,
(n_workgroups as u32, 1, 1),

View file

@@ -17,7 +17,7 @@
//! The path stage (includes substages).
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
pub struct PathCode {
@@ -148,7 +148,7 @@ impl PathStage {
/// those are consumed. Result is written without barrier.
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &PathCode,
binding: &PathBinding,
n_paths: u32,
@@ -166,15 +166,15 @@ impl PathStage {
let reduce_part_tags = REDUCE_PART_SIZE * 4;
let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
if n_wg_tag_reduce > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_wg_tag_reduce, 1, 1),
(REDUCE_WG, 1, 1),
);
// I think we can skip root if n_wg_tag_reduce == 2
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.tag_root_pipeline,
&self.tag_root_ds,
(1, 1, 1),
@@ -183,15 +183,15 @@ impl PathStage {
// No barrier needed here; clear doesn't depend on path tags
}
let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
cmd_buf.dispatch(
pass.dispatch(
&code.clear_pipeline,
&binding.clear_ds,
(n_wg_clear, 1, 1),
(CLEAR_WG, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
cmd_buf.dispatch(
pass.dispatch(
&code.pathseg_pipeline,
&binding.path_ds,
(n_wg_pathseg, 1, 1),

View file

@@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable};
use piet::kurbo::Affine;
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
/// An affine transform.
@@ -132,7 +132,7 @@ impl TransformStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &TransformCode,
binding: &TransformBinding,
size: u64,
@@ -142,22 +142,22 @@
}
let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
if n_workgroups > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_workgroups as u32, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.root_pipeline,
&self.root_ds,
(1, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
cmd_buf.dispatch(
pass.dispatch(
&code.leaf_pipeline,
&binding.leaf_ds,
(n_workgroups as u32, 1, 1),

View file

@@ -21,12 +21,7 @@ pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend)
rc.restore().unwrap();
}
pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
let xml_str = std::fs::read_to_string(filename).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
pub fn render_svg(rc: &mut impl RenderContext, svg: &PicoSvg) {
let start = std::time::Instant::now();
svg.render(rc);
println!("flattening and encoding time: {:?}", start.elapsed());

View file

@@ -16,11 +16,11 @@
//! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::runner::Runner;
use crate::test_result::TestResult;
const WG_SIZE: u64 = 256;
@@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@@ -108,17 +108,12 @@ impl ClearStage {
ClearBinding { descriptor_set }
}
pub unsafe fn record(
&self,
commands: &mut Commands,
code: &ClearCode,
bindings: &ClearBinding,
) {
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
// An issue: for clearing large buffers (>16M), we need to check the
// number of workgroups against the (dynamically detected) limit, and
// potentially issue multiple dispatches.
commands.cmd_buf.dispatch(
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),

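// Illustrative sketch, not part of this patch, of the follow-up flagged in the
// comment above: once n_workgroups can exceed the device's per-dispatch limit,
// the clear has to be split into several dispatches. The limit value and the
// per-chunk offset plumbing are hypothetical; the current clear shader takes no
// offset, so this shows the shape of the fix rather than a drop-in implementation.
unsafe fn dispatch_clear_chunked(
    pass: &mut ComputePass,
    pipeline: &Pipeline,
    descriptor_set: &DescriptorSet,
    n_elements: u64,
    max_wg_per_dispatch: u64,
) {
    let total_wg = (n_elements + WG_SIZE - 1) / WG_SIZE;
    let mut first_wg = 0;
    while first_wg < total_wg {
        let chunk_wg = (total_wg - first_wg).min(max_wg_per_dispatch);
        // A real version would also bind this chunk's element offset here.
        pass.dispatch(
            pipeline,
            descriptor_set,
            (chunk_wg as u32, 1, 1),
            (WG_SIZE as u32, 1, 1),
        );
        first_wg += chunk_wg;
    }
}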
View file

@@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
let mut commands = runner.commands();
commands.write_timestamp(0);
commands.upload(&memory);
binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
let mut pass = commands.compute_pass(0, 1);
binding.record(&mut pass, &code, n_clip as u32);
pass.end();
commands.download(&memory);
commands.write_timestamp(1);
runner.submit(commands);
let dst = memory.map_read(..);
if let Some(failure) = data.verify(&dst) {

View file

@@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding, n_tag);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);

View file

@@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestResult
for i in 0..n_iter {
let mut commands = runner.commands();
// Might clear only buckets to save time.
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&mem_buf);
@@ -107,12 +105,14 @@ impl LinkedListStage {
commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier();
let n_workgroups = N_BUCKETS / WG_SIZE;
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
pass.end();
}
}

View file

@@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test(
let mut failures = 0;
for _ in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
commands.write_timestamp(1);
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
total_elapsed += runner.submit(commands);
@@ -128,11 +126,13 @@ impl MessagePassingStage {
commands.cmd_buf.clear_buffer(&self.data_buf, None);
commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(256, 1, 1),
(256, 1, 1),
);
pass.end();
}
}

View file

@@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
let mut pass = commands.compute_pass(0, 1);
stage.record(
&mut commands.cmd_buf,
&mut pass,
&code,
&binding,
path_data.n_path,
path_data.tags.len() as u32,
);
commands.write_timestamp(1);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);

View file

@@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test(
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@@ -159,12 +157,14 @@ impl PrefixStage {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
commands.cmd_buf.clear_buffer(&self.state_buf, None);
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
pass.end();
// One thing that's missing here is registering the buffers so
// they can be safely dropped by Rust code before the execution
// of the command buffer completes.

View file

@@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResult
let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@@ -175,33 +173,35 @@ impl PrefixTreeStage {
code: &PrefixTreeCode,
bindings: &PrefixTreeBinding,
) {
let mut pass = commands.compute_pass(0, 1);
let n = self.tmp_bufs.len();
for i in 0..n {
let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&bindings.descriptor_sets[i],
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
commands.cmd_buf.memory_barrier();
pass.memory_barrier();
}
commands.cmd_buf.dispatch(
pass.dispatch(
&code.root_pipeline,
&bindings.descriptor_sets[n],
(1, 1, 1),
(WG_SIZE as u32, 1, 1),
);
for i in (0..n).rev() {
commands.cmd_buf.memory_barrier();
pass.memory_barrier();
let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch(
pass.dispatch(
&code.scan_pipeline,
&bindings.descriptor_sets[2 * n - i],
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
}
pass.end();
}
}
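// Illustrative sketch, not part of this patch: record() above is a classic tree
// scan (reduce up the levels, run the root, then scan back down), with sizes[i]
// holding the element count at level i. The construction below is one plausible
// way such a table is built; the real code may differ, and elems_per_wg is a
// placeholder for however many elements one workgroup reduces.
fn level_sizes_sketch(n_elements: u64, elems_per_wg: u64) -> Vec<u64> {
    let mut sizes = vec![n_elements];
    while *sizes.last().unwrap() > 1 {
        let last = *sizes.last().unwrap();
        // Each level holds one partial result per workgroup of the level below.
        sizes.push((last + elems_per_wg - 1) / elems_per_wg);
    }
    sizes
}
// For example, level_sizes_sketch(1 << 24, 512) == [16777216, 32768, 64, 1]:
// two reduce dispatches on the way up, one root dispatch, two scans on the way down.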

View file

@@ -20,8 +20,8 @@ use std::ops::RangeBounds;
use bytemuck::Pod;
use piet_gpu_hal::{
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
QueryPool, Session,
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
};
pub struct Runner {
@@ -118,8 +118,14 @@ impl Runner {
}
impl Commands {
pub unsafe fn write_timestamp(&mut self, query: u32) {
self.cmd_buf.write_timestamp(&self.query_pool, query);
/// Start a compute pass with timer queries.
pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
self.cmd_buf
.begin_compute_pass(&ComputePassDescriptor::timer(
&self.query_pool,
start_query,
end_query,
))
}
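// Illustrative sketch, not part of this patch, of how the tests use this helper:
// open a timed pass, record work into it, and end the pass before download and
// submit. Pipeline and DescriptorSet are assumed to be in scope here; the pipeline,
// descriptor set, and workgroup counts come from the individual test.
unsafe fn timed_dispatch_example(
    commands: &mut Commands,
    pipeline: &Pipeline,
    descriptor_set: &DescriptorSet,
    n_wg: u32,
) {
    let mut pass = commands.compute_pass(0, 1);
    pass.dispatch(pipeline, descriptor_set, (n_wg, 1, 1), (256, 1, 1));
    pass.end();
    // runner.submit(commands) then returns the measured time, which the tests
    // accumulate into total_elapsed.
}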
pub unsafe fn upload(&mut self, buf: &BufStage) {

View file

@@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding, n_elements);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);