API changes and cleanup
Add workgroup size to the dispatch call (needed by Metal). Change all fence references to mutable for consistency. Move the backend traits to a separate file (out of the top-level namespace, in preparation for the hub types going there, to make the public API nicer). Add a method and macro for automatically choosing shader code, and change the collatz example to generate all three kinds of shader code on build.
This commit is contained in:
parent
641891b01f
commit
7d7c86c44b
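
A minimal sketch of the two call sites that change, excerpted (not a complete program) from the collatz example below; `session`, `cmd_buf`, `descriptor_set`, and the shader path are all taken from that example:

    // Select SPIR-V, HLSL, or MSL at compile time for the active backend. The macro
    // expects the collatz.spv/.hlsl/.msl files that the ninja build writes to shader/gen.
    let code = include_shader!(&session, "./shader/gen/collatz");
    let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap();

    // dispatch now takes both the workgroup count and the workgroup size; the size
    // is needed by Metal, while Vulkan and DX12 bake it into the shader and ignore it.
    cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));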
@@ -1,6 +1,7 @@
use piet_gpu_hal::hub;
use piet_gpu_hal::mux::{Instance, ShaderCode};
use piet_gpu_hal::mux::Instance;
use piet_gpu_hal::BufferUsage;
use piet_gpu_hal::include_shader;

fn main() {
    let (instance, _) = Instance::new(None).unwrap();
@@ -10,7 +11,7 @@ fn main() {
    let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE;
    let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
    let buffer = session.create_buffer_init(&src, usage).unwrap();
    let code = ShaderCode::Msl(include_str!("./shader/collatz.msl"));
    let code = include_shader!(&session, "./shader/gen/collatz");
    let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap();
    let descriptor_set = session
        .create_simple_descriptor_set(&pipeline, &[&buffer])
@@ -20,7 +21,7 @@ fn main() {
    cmd_buf.begin();
    cmd_buf.reset_query_pool(&query_pool);
    cmd_buf.write_timestamp(&query_pool, 0);
    cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1));
    cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    cmd_buf.write_timestamp(&query_pool, 1);
    cmd_buf.host_barrier();
    cmd_buf.finish();
@@ -2,7 +2,8 @@
//! This will probably go away when it's fully implemented and we can
//! just use the hub.

use piet_gpu_hal::{dx12, BufferUsage, CmdBuf, Device, Error};
use piet_gpu_hal::{dx12, BufferUsage, Error};
use piet_gpu_hal::backend::{CmdBuf, Device};

const SHADER_CODE: &str = r#"RWByteAddressBuffer _53 : register(u0, space0);
@@ -78,7 +79,7 @@ fn toy() -> Result<(), Error> {
    cmd_buf.copy_buffer(&buf, &dev_buf);
    cmd_buf.memory_barrier();
    cmd_buf.write_timestamp(&query_pool, 0);
    cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1));
    cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1), (256, 1, 1));
    cmd_buf.write_timestamp(&query_pool, 1);
    cmd_buf.memory_barrier();
    cmd_buf.copy_buffer(&dev_buf, &buf);
@@ -1,25 +0,0 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! An example to exercise the Metal backend. Once that becomes
//! functional, this file will go away.

use piet_gpu_hal::metal;

fn main() {
    let instance = metal::MetalInstance;
    println!("hello metal");
}
@@ -1,10 +1,19 @@
# Build file for shaders.

# You must have glslangValidator in your path, or patch here.
# You must have Vulkan tools in your path, or patch here.

glslang_validator = glslangValidator
spirv_cross = spirv-cross

rule glsl
  command = $glslang_validator -V -o $out $in

build collatz.spv: glsl collatz.comp
rule hlsl
  command = $spirv_cross --hlsl $in --output $out

rule msl
  command = $spirv_cross --msl $in --output $out

build gen/collatz.spv: glsl collatz.comp
build gen/collatz.hlsl: hlsl gen/collatz.spv
build gen/collatz.msl: msl gen/collatz.spv
piet-gpu-hal/examples/shader/gen/collatz.hlsl (new file, 62 lines)
@@ -0,0 +1,62 @@
static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u);

RWByteAddressBuffer _57 : register(u0);

static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};

float mod(float x, float y)
{
    return x - y * floor(x / y);
}

float2 mod(float2 x, float2 y)
{
    return x - y * floor(x / y);
}

float3 mod(float3 x, float3 y)
{
    return x - y * floor(x / y);
}

float4 mod(float4 x, float4 y)
{
    return x - y * floor(x / y);
}

uint collatz_iterations(inout uint n)
{
    uint i = 0u;
    while (n != 1u)
    {
        if (mod(float(n), 2.0f) == 0.0f)
        {
            n /= 2u;
        }
        else
        {
            n = (3u * n) + 1u;
        }
        i++;
    }
    return i;
}

void comp_main()
{
    uint index = gl_GlobalInvocationID.x;
    uint param = _57.Load(index * 4 + 0);
    uint _65 = collatz_iterations(param);
    _57.Store(index * 4 + 0, _65);
}

[numthreads(1, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
    comp_main();
}
piet-gpu-hal/examples/shader/gen/collatz.msl (new file, 48 lines)
@@ -0,0 +1,48 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

struct PrimeIndices
{
    uint indices[1];
};

constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u);

// Implementation of the GLSL mod() function, which is slightly different than Metal fmod()
template<typename Tx, typename Ty>
inline Tx mod(Tx x, Ty y)
{
    return x - y * floor(x / y);
}

static inline __attribute__((always_inline))
uint collatz_iterations(thread uint& n)
{
    uint i = 0u;
    while (n != 1u)
    {
        if (mod(float(n), 2.0) == 0.0)
        {
            n /= 2u;
        }
        else
        {
            n = (3u * n) + 1u;
        }
        i++;
    }
    return i;
}

kernel void main0(device PrimeIndices& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
    uint index = gl_GlobalInvocationID.x;
    uint param = _57.indices[index];
    uint _65 = collatz_iterations(param);
    _57.indices[index] = _65;
}
Binary file not shown.
piet-gpu-hal/src/backend.rs (new file, 265 lines)
@@ -0,0 +1,265 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.

//! The generic trait for backends to implement.

use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, mux::ShaderCode};

pub trait Device: Sized {
    type Buffer: 'static;
    type Image;
    type Pipeline;
    type DescriptorSet;
    type QueryPool;
    type CmdBuf: CmdBuf<Self>;
    type Fence;
    type Semaphore;
    type PipelineBuilder: PipelineBuilder<Self>;
    type DescriptorSetBuilder: DescriptorSetBuilder<Self>;
    type Sampler;
    type ShaderSource: ?Sized;

    /// Query the GPU info.
    ///
    /// This method may be expensive, so the hub should call it once and retain
    /// the info.
    fn query_gpu_info(&self) -> GpuInfo;

    fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error>;

    /// Destroy a buffer.
    ///
    /// The same safety requirements hold as in Vulkan: the buffer cannot be used
    /// after this call, and all commands referencing this buffer must have completed.
    ///
    /// Maybe doesn't need result return?
    unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>;

    unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Self::Image, Error>;

    /// Destroy an image.
    ///
    /// The same safety requirements hold as in Vulkan: the image cannot be used
    /// after this call, and all commands referencing this image must have completed.
    ///
    /// Use this only with images we created, not for swapchain images.
    ///
    /// Maybe doesn't need result return?
    unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>;

    /// Start building a pipeline.
    ///
    /// A pipeline is a bit of shader IR plus a signature for what kinds of resources
    /// it expects.
    unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder;

    /// Start building a descriptor set.
    ///
    /// A descriptor set is a binding of resources for a given pipeline.
    unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder;

    /// Create a simple compute pipeline that operates on buffers and storage images.
    ///
    /// This is provided as a convenience but will probably go away, as the functionality
    /// is subsumed by the builder.
    unsafe fn create_simple_compute_pipeline(
        &self,
        code: &Self::ShaderSource,
        n_buffers: u32,
        n_images: u32,
    ) -> Result<Self::Pipeline, Error> {
        let mut builder = self.pipeline_builder();
        builder.add_buffers(n_buffers);
        builder.add_images(n_images);
        builder.create_compute_pipeline(self, code)
    }

    /// Create a descriptor set for a given pipeline, binding buffers and images.
    ///
    /// This is provided as a convenience but will probably go away, as the functionality
    /// is subsumed by the builder.
    unsafe fn create_descriptor_set(
        &self,
        pipeline: &Self::Pipeline,
        bufs: &[&Self::Buffer],
        images: &[&Self::Image],
    ) -> Result<Self::DescriptorSet, Error> {
        let mut builder = self.descriptor_set_builder();
        builder.add_buffers(bufs);
        builder.add_images(images);
        builder.build(self, pipeline)
    }

    fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error>;

    fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error>;

    /// Get results from query pool, destroying it in the process.
    ///
    /// The returned vector is one less than the number of queries; the first is used as
    /// a baseline.
    ///
    /// # Safety
    /// All submitted commands that refer to this query pool must have completed.
    unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error>;

    unsafe fn run_cmd_bufs(
        &self,
        cmd_buf: &[&Self::CmdBuf],
        wait_semaphores: &[&Self::Semaphore],
        signal_semaphores: &[&Self::Semaphore],
        fence: Option<&mut Self::Fence>,
    ) -> Result<(), Error>;

    /// Copy data from the buffer to memory.
    ///
    /// Discussion question: add offset?
    ///
    /// # Safety
    ///
    /// The buffer must be valid to access. The destination memory must be valid to
    /// write to. The ranges must not overlap. The offset + size must be within
    /// the buffer's allocation, and size within the destination.
    unsafe fn read_buffer(
        &self,
        buffer: &Self::Buffer,
        dst: *mut u8,
        offset: u64,
        size: u64,
    ) -> Result<(), Error>;

    /// Copy data from memory to the buffer.
    ///
    /// # Safety
    ///
    /// The buffer must be valid to access. The source memory must be valid to
    /// read from. The ranges must not overlap. The offset + size must be within
    /// the buffer's allocation, and size within the source.
    unsafe fn write_buffer(
        &self,
        buffer: &Self::Buffer,
        contents: *const u8,
        offset: u64,
        size: u64,
    ) -> Result<(), Error>;

    unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error>;
    unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error>;
    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>;
    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error>;

    unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
}

pub trait CmdBuf<D: Device> {
    unsafe fn begin(&mut self);

    unsafe fn finish(&mut self);

    unsafe fn dispatch(
        &mut self,
        pipeline: &D::Pipeline,
        descriptor_set: &D::DescriptorSet,
        workgroup_count: (u32, u32, u32),
        workgroup_size: (u32, u32, u32),
    );

    /// Insert an execution and memory barrier.
    ///
    /// Compute kernels (and other actions) after this barrier may read from buffers
    /// that were written before this barrier.
    unsafe fn memory_barrier(&mut self);

    /// Insert a barrier for host access to buffers.
    ///
    /// The host may read buffers written before this barrier, after the fence for
    /// the command buffer is signaled.
    ///
    /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
    /// ("Host memory reads") for an explanation of this barrier.
    unsafe fn host_barrier(&mut self);

    unsafe fn image_barrier(
        &mut self,
        image: &D::Image,
        src_layout: ImageLayout,
        dst_layout: ImageLayout,
    );

    /// Clear the buffer.
    ///
    /// This is readily supported in Vulkan, but for portability it is remarkably
    /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
    /// kernel, or organize the code not to need it.
    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);

    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);

    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);

    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);

    // low portability, dx12 doesn't support it natively
    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);

    /// Reset the query pool.
    ///
    /// The query pool must be reset before each use, to avoid validation errors.
    /// This is annoying, and we could tweak the API to make it implicit, doing
    /// the reset before the first timestamp write.
    unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool);

    unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32);

    /// Prepare the timestamps for reading. This isn't required on Vulkan but
    /// is required on (at least) DX12.
    unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
}

/// A builder for pipelines with more complex layouts.
pub trait PipelineBuilder<D: Device> {
    /// Add buffers to the pipeline. Each has its own binding.
    fn add_buffers(&mut self, n_buffers: u32);
    /// Add storage images to the pipeline. Each has its own binding.
    fn add_images(&mut self, n_images: u32);
    /// Add a binding with a variable-size array of textures.
    fn add_textures(&mut self, max_textures: u32);
    unsafe fn create_compute_pipeline(
        self,
        device: &D,
        code: &D::ShaderSource,
    ) -> Result<D::Pipeline, Error>;
}

/// A builder for descriptor sets with more complex layouts.
///
/// Note: the order needs to match the pipeline building, and it also needs to
/// be buffers, then images, then textures.
pub trait DescriptorSetBuilder<D: Device> {
    fn add_buffers(&mut self, buffers: &[&D::Buffer]);
    /// Add an array of storage images.
    ///
    /// The images need to be in `ImageLayout::General` layout.
    fn add_images(&mut self, images: &[&D::Image]);
    /// Add an array of textures.
    ///
    /// The images need to be in `ImageLayout::ShaderRead` layout.
    ///
    /// The same sampler is used for all textures, which is not very sophisticated;
    /// we should have a way to vary the sampler.
    fn add_textures(&mut self, images: &[&D::Image]);
    unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
}
@@ -224,7 +224,7 @@ impl Dx12Instance {
    }
}

impl crate::Device for Dx12Device {
impl crate::backend::Device for Dx12Device {
    type Buffer = Buffer;

    type Image = Image;
@@ -413,7 +413,7 @@ impl crate::Device for Dx12Device {
        Ok(())
    }

    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error> {
    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
        let fence_val = fence.fence.get_value();
        Ok(fence_val == fence.val.get())
    }
@@ -451,7 +451,7 @@ impl Dx12Device {
    }
}

impl crate::CmdBuf<Dx12Device> for CmdBuf {
impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
    unsafe fn begin(&mut self) {}

    unsafe fn finish(&mut self) {
@@ -468,7 +468,8 @@ impl crate::CmdBuf<Dx12Device> for CmdBuf {
        &mut self,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        size: (u32, u32, u32),
        workgroup_count: (u32, u32, u32),
        _workgroup_size: (u32, u32, u32),
    ) {
        self.c.set_pipeline_state(&pipeline.pipeline_state);
        self.c
@@ -478,7 +479,8 @@ impl crate::CmdBuf<Dx12Device> for CmdBuf {
            0,
            descriptor_set.0.get_gpu_descriptor_handle_at_offset(0),
        );
        self.c.dispatch(size.0, size.1, size.2);
        self.c
            .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
    }

    unsafe fn memory_barrier(&mut self) {
@@ -554,7 +556,7 @@ impl crate::CmdBuf<Dx12Device> for CmdBuf {
    }
}

impl crate::PipelineBuilder<Dx12Device> for PipelineBuilder {
impl crate::backend::PipelineBuilder<Dx12Device> for PipelineBuilder {
    fn add_buffers(&mut self, n_buffers: u32) {
        if n_buffers != 0 {
            self.ranges.push(d3d12::D3D12_DESCRIPTOR_RANGE {
@@ -630,7 +632,7 @@ impl crate::PipelineBuilder<Dx12Device> for PipelineBuilder {
    }
}

impl crate::DescriptorSetBuilder<Dx12Device> for DescriptorSetBuilder {
impl crate::backend::DescriptorSetBuilder<Dx12Device> for DescriptorSetBuilder {
    fn add_buffers(&mut self, buffers: &[&Buffer]) {
        // Note: we could get rid of the clone here (which is an AddRef)
        // and store a raw pointer, as it's a safety precondition that
@@ -123,7 +123,7 @@ impl Session {
        unsafe {
            let mut i = 0;
            while i < pending.len() {
                if let Ok(true) = self.0.device.get_fence_status(&pending[i].fence) {
                if let Ok(true) = self.0.device.get_fence_status(&mut pending[i].fence) {
                    let mut item = pending.swap_remove(i);
                    // TODO: wait is superfluous, can just reset
                    let _ = self.0.device.wait_and_reset(vec![&mut item.fence]);
@@ -295,6 +295,11 @@ impl Session {
    pub fn gpu_info(&self) -> &GpuInfo {
        &self.0.gpu_info
    }

    /// Choose shader code from the available choices.
    pub fn choose_shader<'a>(&self, spv: &'a [u8], hlsl: &'a str, msl: &'a str) -> ShaderCode<'a> {
        self.0.device.choose_shader(spv, hlsl, msl)
    }
}

impl CmdBuf {
@@ -4,6 +4,7 @@
/// In time, it may go away and be replaced by either gfx-hal or wgpu.
use bitflags::bitflags;

pub mod backend;
pub mod hub;

#[macro_use]
@@ -26,9 +27,13 @@ mux_cfg! {
#[cfg(target_os = "macos")]
pub mod metal;

/// This isn't great but is expedient.
/// The common error type for the crate.
///
/// This keeps things simple and can be expanded later.
pub type Error = Box<dyn std::error::Error>;

pub use crate::backend::CmdBuf;

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ImageLayout {
    Undefined,
@@ -92,248 +97,3 @@ pub struct SubgroupSize {
    min: u32,
    max: u32,
}

pub trait Device: Sized {
    type Buffer: 'static;
    type Image;
    type Pipeline;
    type DescriptorSet;
    type QueryPool;
    type CmdBuf: CmdBuf<Self>;
    type Fence;
    type Semaphore;
    type PipelineBuilder: PipelineBuilder<Self>;
    type DescriptorSetBuilder: DescriptorSetBuilder<Self>;
    type Sampler;
    type ShaderSource: ?Sized;

    /// Query the GPU info.
    ///
    /// This method may be expensive, so the hub should call it once and retain
    /// the info.
    fn query_gpu_info(&self) -> GpuInfo;

    fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error>;

    /// Destroy a buffer.
    ///
    /// The same safety requirements hold as in Vulkan: the buffer cannot be used
    /// after this call, and all commands referencing this buffer must have completed.
    ///
    /// Maybe doesn't need result return?
    unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>;

    unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Self::Image, Error>;

    /// Destroy an image.
    ///
    /// The same safety requirements hold as in Vulkan: the image cannot be used
    /// after this call, and all commands referencing this image must have completed.
    ///
    /// Use this only with images we created, not for swapchain images.
    ///
    /// Maybe doesn't need result return?
    unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>;

    /// Start building a pipeline.
    ///
    /// A pipeline is a bit of shader IR plus a signature for what kinds of resources
    /// it expects.
    unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder;

    /// Start building a descriptor set.
    ///
    /// A descriptor set is a binding of resources for a given pipeline.
    unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder;

    /// Create a simple compute pipeline that operates on buffers and storage images.
    ///
    /// This is provided as a convenience but will probably go away, as the functionality
    /// is subsumed by the builder.
    unsafe fn create_simple_compute_pipeline(
        &self,
        code: &Self::ShaderSource,
        n_buffers: u32,
        n_images: u32,
    ) -> Result<Self::Pipeline, Error> {
        let mut builder = self.pipeline_builder();
        builder.add_buffers(n_buffers);
        builder.add_images(n_images);
        builder.create_compute_pipeline(self, code)
    }

    /// Create a descriptor set for a given pipeline, binding buffers and images.
    ///
    /// This is provided as a convenience but will probably go away, as the functionality
    /// is subsumed by the builder.
    unsafe fn create_descriptor_set(
        &self,
        pipeline: &Self::Pipeline,
        bufs: &[&Self::Buffer],
        images: &[&Self::Image],
    ) -> Result<Self::DescriptorSet, Error> {
        let mut builder = self.descriptor_set_builder();
        builder.add_buffers(bufs);
        builder.add_images(images);
        builder.build(self, pipeline)
    }

    fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error>;

    fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error>;

    /// Get results from query pool, destroying it in the process.
    ///
    /// The returned vector is one less than the number of queries; the first is used as
    /// a baseline.
    ///
    /// # Safety
    /// All submitted commands that refer to this query pool must have completed.
    unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error>;

    unsafe fn run_cmd_bufs(
        &self,
        cmd_buf: &[&Self::CmdBuf],
        wait_semaphores: &[&Self::Semaphore],
        signal_semaphores: &[&Self::Semaphore],
        fence: Option<&mut Self::Fence>,
    ) -> Result<(), Error>;

    /// Copy data from the buffer to memory.
    ///
    /// Discussion question: add offset?
    ///
    /// # Safety
    ///
    /// The buffer must be valid to access. The destination memory must be valid to
    /// write to. The ranges must not overlap. The offset + size must be within
    /// the buffer's allocation, and size within the destination.
    unsafe fn read_buffer(
        &self,
        buffer: &Self::Buffer,
        dst: *mut u8,
        offset: u64,
        size: u64,
    ) -> Result<(), Error>;

    /// Copy data from memory to the buffer.
    ///
    /// # Safety
    ///
    /// The buffer must be valid to access. The source memory must be valid to
    /// read from. The ranges must not overlap. The offset + size must be within
    /// the buffer's allocation, and size within the source.
    unsafe fn write_buffer(
        &self,
        buffer: &Self::Buffer,
        contents: *const u8,
        offset: u64,
        size: u64,
    ) -> Result<(), Error>;

    unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error>;
    unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error>;
    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>;
    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error>;

    unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
}

pub trait CmdBuf<D: Device> {
    unsafe fn begin(&mut self);

    unsafe fn finish(&mut self);

    unsafe fn dispatch(
        &mut self,
        pipeline: &D::Pipeline,
        descriptor_set: &D::DescriptorSet,
        size: (u32, u32, u32),
    );

    /// Insert an execution and memory barrier.
    ///
    /// Compute kernels (and other actions) after this barrier may read from buffers
    /// that were written before this barrier.
    unsafe fn memory_barrier(&mut self);

    /// Insert a barrier for host access to buffers.
    ///
    /// The host may read buffers written before this barrier, after the fence for
    /// the command buffer is signaled.
    ///
    /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
    /// ("Host memory reads") for an explanation of this barrier.
    unsafe fn host_barrier(&mut self);

    unsafe fn image_barrier(
        &mut self,
        image: &D::Image,
        src_layout: ImageLayout,
        dst_layout: ImageLayout,
    );

    /// Clear the buffer.
    ///
    /// This is readily supported in Vulkan, but for portability it is remarkably
    /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
    /// kernel, or organize the code not to need it.
    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);

    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);

    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);

    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);

    // low portability, dx12 doesn't support it natively
    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);

    /// Reset the query pool.
    ///
    /// The query pool must be reset before each use, to avoid validation errors.
    /// This is annoying, and we could tweak the API to make it implicit, doing
    /// the reset before the first timestamp write.
    unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool);

    unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32);

    /// Prepare the timestamps for reading. This isn't required on Vulkan but
    /// is required on (at least) DX12.
    unsafe fn finish_timestamps(&mut self, pool: &D::QueryPool) {}
}

/// A builder for pipelines with more complex layouts.
pub trait PipelineBuilder<D: Device> {
    /// Add buffers to the pipeline. Each has its own binding.
    fn add_buffers(&mut self, n_buffers: u32);
    /// Add storage images to the pipeline. Each has its own binding.
    fn add_images(&mut self, n_images: u32);
    /// Add a binding with a variable-size array of textures.
    fn add_textures(&mut self, max_textures: u32);
    unsafe fn create_compute_pipeline(
        self,
        device: &D,
        code: &D::ShaderSource,
    ) -> Result<D::Pipeline, Error>;
}

/// A builder for descriptor sets with more complex layouts.
///
/// Note: the order needs to match the pipeline building, and it also needs to
/// be buffers, then images, then textures.
pub trait DescriptorSetBuilder<D: Device> {
    fn add_buffers(&mut self, buffers: &[&D::Buffer]);
    /// Add an array of storage images.
    ///
    /// The images need to be in `ImageLayout::General` layout.
    fn add_images(&mut self, images: &[&D::Image]);
    /// Add an array of textures.
    ///
    /// The images need to be in `ImageLayout::ShaderRead` layout.
    ///
    /// The same sampler is used for all textures, which is not very sophisticated;
    /// we should have a way to vary the sampler.
    fn add_textures(&mut self, images: &[&D::Image]);
    unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
}
@@ -117,9 +117,9 @@ macro_rules! mux_device_enum {
    $crate::mux_enum! {
        $(#[$outer])*
        pub enum $assoc_type {
            Vk(<$crate::vulkan::VkDevice as $crate::Device>::$assoc_type),
            Dx12(<$crate::dx12::Dx12Device as $crate::Device>::$assoc_type),
            Mtl(<$crate::metal::MtlDevice as $crate::Device>::$assoc_type),
            Vk(<$crate::vulkan::VkDevice as $crate::backend::Device>::$assoc_type),
            Dx12(<$crate::dx12::Dx12Device as $crate::backend::Device>::$assoc_type),
            Mtl(<$crate::metal::MtlDevice as $crate::backend::Device>::$assoc_type),
        }
    }
}
@@ -154,3 +154,15 @@ macro_rules! mux_match {
        }
    };
}

/// A convenience macro for selecting a shader from included files.
#[macro_export]
macro_rules! include_shader {
    ( $device:expr, $path_base:expr) => {
        $device.choose_shader(
            include_bytes!(concat!($path_base, ".spv")),
            include_str!(concat!($path_base, ".hlsl")),
            include_str!(concat!($path_base, ".msl")),
        )
    };
}
@@ -102,7 +102,11 @@ impl MtlInstance {
                has_memory_model: false,
                use_staging_buffers: use_staging_buffers,
            };
            Ok(MtlDevice { device, cmd_queue, gpu_info })
            Ok(MtlDevice {
                device,
                cmd_queue,
                gpu_info,
            })
        } else {
            Err("can't create system default Metal device".into())
        }
@@ -119,7 +123,7 @@ impl MtlInstance {
    }
}

impl crate::Device for MtlDevice {
impl crate::backend::Device for MtlDevice {
    type Buffer = Buffer;

    type Image = Image;
@@ -282,9 +286,13 @@ impl crate::Device for MtlDevice {
        Ok(())
    }

    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error> {
        // fence need to be mutable here :/
        todo!()
    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
        match fence {
            Fence::Idle => Ok(true),
            Fence::CmdBufPending(cmd_buf) => {
                Ok(cmd_buf.status() == metal::MTLCommandBufferStatus::Completed)
            }
        }
    }

    unsafe fn create_sampler(&self, params: crate::SamplerParams) -> Result<Self::Sampler, Error> {
@@ -292,18 +300,17 @@ impl crate::Device for MtlDevice {
    }
}

impl crate::CmdBuf<MtlDevice> for CmdBuf {
    unsafe fn begin(&mut self) {
    }
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
    unsafe fn begin(&mut self) {}

    unsafe fn finish(&mut self) {
    }
    unsafe fn finish(&mut self) {}

    unsafe fn dispatch(
        &mut self,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        size: (u32, u32, u32),
        workgroup_count: (u32, u32, u32),
        workgroup_size: (u32, u32, u32),
    ) {
        let encoder = self.cmd_buf.new_compute_command_encoder();
        encoder.set_compute_pipeline_state(&pipeline.0);
@@ -313,19 +320,17 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
            ix += 1;
        }
        // TODO: set images
        let work_group_count = metal::MTLSize {
            width: size.0 as u64,
            height: size.1 as u64,
            depth: size.2 as u64,
        let workgroup_count = metal::MTLSize {
            width: workgroup_count.0 as u64,
            height: workgroup_count.1 as u64,
            depth: workgroup_count.2 as u64,
        };
        // TODO: we need to pass this in explicitly. In gfx-hal, this is parsed from
        // the spv before translation.
        let work_group_size = metal::MTLSize {
            width: 1,
            height: 1,
            depth: 1,
        let workgroup_size = metal::MTLSize {
            width: workgroup_size.0 as u64,
            height: workgroup_size.1 as u64,
            depth: workgroup_size.2 as u64,
        };
        encoder.dispatch_thread_groups(work_group_count, work_group_size);
        encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
        encoder.end_encoding();
    }

@@ -334,8 +339,7 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
        // Metal's own tracking.
    }

    unsafe fn host_barrier(&mut self) {
    }
    unsafe fn host_barrier(&mut self) {}

    unsafe fn image_barrier(
        &mut self,
@@ -366,9 +370,7 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
        todo!()
    }

    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {

    }
    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}

    unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
        // TODO
@@ -377,17 +379,15 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
    }
}

impl crate::PipelineBuilder<MtlDevice> for PipelineBuilder {
impl crate::backend::PipelineBuilder<MtlDevice> for PipelineBuilder {
    fn add_buffers(&mut self, _n_buffers: u32) {
        // My understanding is that Metal infers the pipeline layout from
        // the source.
    }

    fn add_images(&mut self, _n_images: u32) {
    }
    fn add_images(&mut self, _n_images: u32) {}

    fn add_textures(&mut self, _max_textures: u32) {
    }
    fn add_textures(&mut self, _max_textures: u32) {}

    unsafe fn create_compute_pipeline(
        self,
@@ -399,12 +399,14 @@ impl crate::PipelineBuilder<MtlDevice> for PipelineBuilder {
        let library = device.device.new_library_with_source(code, &options)?;
        // This seems to be the default name from spirv-cross, but we may need to tweak.
        let function = library.get_function("main0", None)?;
        let pipeline = device.device.new_compute_pipeline_state_with_function(&function)?;
        let pipeline = device
            .device
            .new_compute_pipeline_state_with_function(&function)?;
        Ok(Pipeline(pipeline))
    }
}

impl crate::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
impl crate::backend::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
    fn add_buffers(&mut self, buffers: &[&Buffer]) {
        self.0.buffers.extend(buffers.iter().copied().cloned());
    }
@@ -30,10 +30,10 @@ mux_cfg! {
    #[cfg(mtl)]
    use crate::metal;
}
use crate::CmdBuf as CmdBufTrait;
use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::Device as DeviceTrait;
use crate::PipelineBuilder as PipelineBuilderTrait;
use crate::backend::CmdBuf as CmdBufTrait;
use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::backend::Device as DeviceTrait;
use crate::backend::PipelineBuilder as PipelineBuilderTrait;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout};

mux_enum! {
@@ -255,11 +255,11 @@ impl Device {
        }
    }

    pub unsafe fn get_fence_status(&self, fence: &Fence) -> Result<bool, Error> {
    pub unsafe fn get_fence_status(&self, fence: &mut Fence) -> Result<bool, Error> {
        mux_match! { self;
            Device::Vk(d) => d.get_fence_status(fence.vk()),
            Device::Dx12(d) => d.get_fence_status(fence.dx12()),
            Device::Mtl(d) => d.get_fence_status(fence.mtl()),
            Device::Vk(d) => d.get_fence_status(fence.vk_mut()),
            Device::Dx12(d) => d.get_fence_status(fence.dx12_mut()),
            Device::Mtl(d) => d.get_fence_status(fence.mtl_mut()),
        }
    }

@@ -400,6 +400,15 @@ impl Device {
            Device::Mtl(d) => d.write_buffer(buffer.mtl(), contents, offset, size),
        }
    }

    /// Choose shader code from the available choices.
    pub fn choose_shader<'a>(&self, _spv: &'a [u8], _hlsl: &'a str, _msl: &'a str) -> ShaderCode<'a> {
        mux_match! { self;
            Device::Vk(_d) => ShaderCode::Spv(_spv),
            Device::Dx12(_d) => ShaderCode::Hlsl(_hlsl),
            Device::Mtl(_d) => ShaderCode::Msl(_msl),
        }
    }
}

impl PipelineBuilder {
@@ -578,16 +587,25 @@ impl CmdBuf {
        }
    }

    /// Dispatch a compute shader.
    ///
    /// Note that both the number of workgroups (`workgroup_count`) and the number of
    /// threads in a workgroup (`workgroup_size`) are given. The latter is needed on
    /// Metal, while it's baked into the shader on Vulkan and DX12.
    ///
    /// Perhaps we'll have a mechanism to plumb the latter value to configure the size
    /// of a workgroup using specialization constants in the future.
    pub unsafe fn dispatch(
        &mut self,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        size: (u32, u32, u32),
        workgroup_count: (u32, u32, u32),
        workgroup_size: (u32, u32, u32),
    ) {
        mux_match! { self;
            CmdBuf::Vk(c) => c.dispatch(pipeline.vk(), descriptor_set.vk(), size),
            CmdBuf::Dx12(c) => c.dispatch(pipeline.dx12(), descriptor_set.dx12(), size),
            CmdBuf::Mtl(c) => c.dispatch(pipeline.mtl(), descriptor_set.mtl(), size),
            CmdBuf::Vk(c) => c.dispatch(pipeline.vk(), descriptor_set.vk(), workgroup_count, workgroup_size),
            CmdBuf::Dx12(c) => c.dispatch(pipeline.dx12(), descriptor_set.dx12(), workgroup_count, workgroup_size),
            CmdBuf::Mtl(c) => c.dispatch(pipeline.mtl(), descriptor_set.mtl(), workgroup_count, workgroup_size),
        }
    }

@@ -13,8 +13,10 @@ use ash::{vk, Device, Entry, Instance};
use smallvec::SmallVec;

use crate::{
    BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
    BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
};
use crate::backend::Device as DeviceTrait;

pub struct VkInstance {
    /// Retain the dynamic lib.
@@ -455,7 +457,7 @@ impl VkInstance {
    }
}

impl crate::Device for VkDevice {
impl crate::backend::Device for VkDevice {
    type Buffer = Buffer;
    type Image = Image;
    type CmdBuf = CmdBuf;
@@ -621,16 +623,13 @@ impl crate::Device for VkDevice {

    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error> {
        let device = &self.device.device;
        let fences = fences
            .iter()
            .map(|f| **f)
            .collect::<SmallVec<[_; 4]>>();
        let fences = fences.iter().map(|f| **f).collect::<SmallVec<[_; 4]>>();
        device.wait_for_fences(&fences, true, !0)?;
        device.reset_fences(&fences)?;
        Ok(())
    }

    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error> {
    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
        let device = &self.device.device;
        Ok(device.get_fence_status(*fence)?)
    }
@@ -843,7 +842,8 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
        &mut self,
        pipeline: &Pipeline,
        descriptor_set: &DescriptorSet,
        size: (u32, u32, u32),
        workgroup_count: (u32, u32, u32),
        _workgroup_size: (u32, u32, u32),
    ) {
        let device = &self.device.device;
        device.cmd_bind_pipeline(
@@ -859,7 +859,12 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
            &[descriptor_set.descriptor_set],
            &[],
        );
        device.cmd_dispatch(self.cmd_buf, size.0, size.1, size.2);
        device.cmd_dispatch(
            self.cmd_buf,
            workgroup_count.0,
            workgroup_count.1,
            workgroup_count.2,
        );
    }

    /// Insert a pipeline barrier for all memory accesses.
@@ -1047,7 +1052,7 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
    }
}

impl crate::PipelineBuilder<VkDevice> for PipelineBuilder {
impl crate::backend::PipelineBuilder<VkDevice> for PipelineBuilder {
    fn add_buffers(&mut self, n_buffers: u32) {
        let start = self.bindings.len() as u32;
        for i in 0..n_buffers {
@@ -1153,7 +1158,7 @@ impl crate::PipelineBuilder<VkDevice> for PipelineBuilder {
    }
}

impl crate::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
    fn add_buffers(&mut self, buffers: &[&Buffer]) {
        self.buffers.extend(buffers.iter().map(|b| b.buffer));
    }
@@ -1307,7 +1312,11 @@ impl VkSwapchain {
        image_idx: usize,
        semaphores: &[&vk::Semaphore],
    ) -> Result<bool, Error> {
        let semaphores = semaphores.iter().copied().copied().collect::<SmallVec<[_; 4]>>();
        let semaphores = semaphores
            .iter()
            .copied()
            .copied()
            .collect::<SmallVec<[_; 4]>>();
        Ok(self.swapchain_fn.queue_present(
            self.present_queue,
            &vk::PresentInfoKHR::builder()
@@ -12,7 +12,7 @@ use ndk::native_window::NativeWindow;
use ndk_glue::Event;

use piet_gpu_hal::hub;
use piet_gpu_hal::mux::{QueryPool, Instance, Surface, Swapchain};
use piet_gpu_hal::mux::{Instance, QueryPool, Surface, Swapchain};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout};

use piet_gpu::{render_scene, PietGpuRenderContext, Renderer};
@@ -403,6 +403,7 @@ impl Renderer {
            &self.el_pipeline,
            &self.el_ds,
            (((self.n_elements + 127) / 128) as u32, 1, 1),
            (128, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 1);
        cmd_buf.memory_barrier();
@@ -410,6 +411,7 @@ impl Renderer {
            &self.tile_pipeline,
            &self.tile_ds,
            (((self.n_paths + 255) / 256) as u32, 1, 1),
            (256, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 2);
        cmd_buf.memory_barrier();
@@ -417,6 +419,7 @@ impl Renderer {
            &self.path_pipeline,
            &self.path_ds,
            (((self.n_pathseg + 31) / 32) as u32, 1, 1),
            (32, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 3);
        cmd_buf.memory_barrier();
@@ -424,6 +427,7 @@ impl Renderer {
            &self.backdrop_pipeline,
            &self.backdrop_ds,
            (((self.n_paths + 255) / 256) as u32, 1, 1),
            (256, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 4);
        // Note: this barrier is not needed as an actual dependency between
@@ -434,6 +438,7 @@ impl Renderer {
            &self.bin_pipeline,
            &self.bin_ds,
            (((self.n_paths + 255) / 256) as u32, 1, 1),
            (256, 1, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 5);
        cmd_buf.memory_barrier();
@@ -441,6 +446,7 @@ impl Renderer {
            &self.coarse_pipeline,
            &self.coarse_ds,
            ((WIDTH as u32 + 255) / 256, (HEIGHT as u32 + 255) / 256, 1),
            (256, 256, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 6);
        cmd_buf.memory_barrier();
@@ -448,6 +454,7 @@ impl Renderer {
            &self.k4_pipeline,
            &self.k4_ds,
            ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
            (8, 4, 1),
        );
        cmd_buf.write_timestamp(&query_pool, 7);
        cmd_buf.memory_barrier();