diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index 4220148..d31bb4c 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -1,6 +1,7 @@
 use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::{Instance, ShaderCode};
+use piet_gpu_hal::mux::Instance;
 use piet_gpu_hal::BufferUsage;
+use piet_gpu_hal::include_shader;
 
 fn main() {
     let (instance, _) = Instance::new(None).unwrap();
@@ -10,7 +11,7 @@ fn main() {
         let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE;
         let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
         let buffer = session.create_buffer_init(&src, usage).unwrap();
-        let code = ShaderCode::Msl(include_str!("./shader/collatz.msl"));
+        let code = include_shader!(&session, "./shader/gen/collatz");
         let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap();
         let descriptor_set = session
             .create_simple_descriptor_set(&pipeline, &[&buffer])
@@ -20,7 +21,7 @@ fn main() {
         cmd_buf.begin();
         cmd_buf.reset_query_pool(&query_pool);
         cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1));
+        cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.host_barrier();
         cmd_buf.finish();
diff --git a/piet-gpu-hal/examples/dx12_toy.rs b/piet-gpu-hal/examples/dx12_toy.rs
index 2ad9dfe..d8df466 100644
--- a/piet-gpu-hal/examples/dx12_toy.rs
+++ b/piet-gpu-hal/examples/dx12_toy.rs
@@ -2,7 +2,8 @@
 //! This will probably go away when it's fully implemented and we can
 //! just use the hub.
 
-use piet_gpu_hal::{dx12, BufferUsage, CmdBuf, Device, Error};
+use piet_gpu_hal::{dx12, BufferUsage, Error};
+use piet_gpu_hal::backend::{CmdBuf, Device};
 
 const SHADER_CODE: &str = r#"RWByteAddressBuffer _53 : register(u0, space0);
 
@@ -78,7 +79,7 @@ fn toy() -> Result<(), Error> {
         cmd_buf.copy_buffer(&buf, &dev_buf);
         cmd_buf.memory_barrier();
         cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1));
+        cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1), (256, 1, 1));
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
         cmd_buf.copy_buffer(&dev_buf, &buf);
diff --git a/piet-gpu-hal/examples/metal_toy.rs b/piet-gpu-hal/examples/metal_toy.rs
deleted file mode 100644
index a20a5af..0000000
--- a/piet-gpu-hal/examples/metal_toy.rs
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2021 The piet-gpu authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// Also licensed under MIT license, at your choice.
-
-//! An example to exercise the Metal backend. Once that becomes
-//! functional, this file will go away.
-
-use piet_gpu_hal::metal;
-
-fn main() {
-    let instance = metal::MetalInstance;
-    println!("hello metal");
-}
diff --git a/piet-gpu-hal/examples/shader/build.ninja b/piet-gpu-hal/examples/shader/build.ninja
index 848637a..f1c6328 100644
--- a/piet-gpu-hal/examples/shader/build.ninja
+++ b/piet-gpu-hal/examples/shader/build.ninja
@@ -1,10 +1,19 @@
 # Build file for shaders.
 
-# You must have glslangValidator in your path, or patch here.
+# You must have Vulkan tools in your path, or patch here.
 
 glslang_validator = glslangValidator
+spirv_cross = spirv-cross
 
 rule glsl
   command = $glslang_validator -V -o $out $in
 
-build collatz.spv: glsl collatz.comp
+rule hlsl
+  command = $spirv_cross --hlsl $in --output $out
+
+rule msl
+  command = $spirv_cross --msl $in --output $out
+
+build gen/collatz.spv: glsl collatz.comp
+build gen/collatz.hlsl: hlsl gen/collatz.spv
+build gen/collatz.msl: msl gen/collatz.spv
diff --git a/piet-gpu-hal/examples/shader/gen/collatz.hlsl b/piet-gpu-hal/examples/shader/gen/collatz.hlsl
new file mode 100644
index 0000000..d5fcd56
--- /dev/null
+++ b/piet-gpu-hal/examples/shader/gen/collatz.hlsl
@@ -0,0 +1,62 @@
+static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u);
+
+RWByteAddressBuffer _57 : register(u0);
+
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+float mod(float x, float y)
+{
+    return x - y * floor(x / y);
+}
+
+float2 mod(float2 x, float2 y)
+{
+    return x - y * floor(x / y);
+}
+
+float3 mod(float3 x, float3 y)
+{
+    return x - y * floor(x / y);
+}
+
+float4 mod(float4 x, float4 y)
+{
+    return x - y * floor(x / y);
+}
+
+uint collatz_iterations(inout uint n)
+{
+    uint i = 0u;
+    while (n != 1u)
+    {
+        if (mod(float(n), 2.0f) == 0.0f)
+        {
+            n /= 2u;
+        }
+        else
+        {
+            n = (3u * n) + 1u;
+        }
+        i++;
+    }
+    return i;
+}
+
+void comp_main()
+{
+    uint index = gl_GlobalInvocationID.x;
+    uint param = _57.Load(index * 4 + 0);
+    uint _65 = collatz_iterations(param);
+    _57.Store(index * 4 + 0, _65);
+}
+
+[numthreads(1, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu-hal/examples/shader/gen/collatz.msl b/piet-gpu-hal/examples/shader/gen/collatz.msl
new file mode 100644
index 0000000..c2592c0
--- /dev/null
+++ b/piet-gpu-hal/examples/shader/gen/collatz.msl
@@ -0,0 +1,48 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct PrimeIndices
+{
+    uint indices[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u);
+
+// Implementation of the GLSL mod() function, which is slightly different than Metal fmod()
+template<typename Tx, typename Ty>
+inline Tx mod(Tx x, Ty y)
+{
+    return x - y * floor(x / y);
+}
+
+static inline __attribute__((always_inline))
+uint collatz_iterations(thread uint& n)
+{
+    uint i = 0u;
+    while (n != 1u)
+    {
+        if (mod(float(n), 2.0) == 0.0)
+        {
+            n /= 2u;
+        }
+        else
+        {
+            n = (3u * n) + 1u;
+        }
+        i++;
+    }
+    return i;
+}
+
+kernel void main0(device PrimeIndices& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
+{
+    uint index = gl_GlobalInvocationID.x;
+    uint param = _57.indices[index];
+    uint _65 = collatz_iterations(param);
+    _57.indices[index] = _65;
+}
+
diff --git a/piet-gpu-hal/examples/shader/collatz.spv b/piet-gpu-hal/examples/shader/gen/collatz.spv
similarity index 91%
rename from piet-gpu-hal/examples/shader/collatz.spv
rename to piet-gpu-hal/examples/shader/gen/collatz.spv
index 21e4e92..886797e 100644
Binary files a/piet-gpu-hal/examples/shader/collatz.spv and b/piet-gpu-hal/examples/shader/gen/collatz.spv differ
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
new file mode 100644
index 0000000..abdb077
--- /dev/null
+++ b/piet-gpu-hal/src/backend.rs
@@ -0,0 +1,265 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! The generic trait for backends to implement.
+
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, mux::ShaderCode};
+
+pub trait Device: Sized {
+    type Buffer: 'static;
+    type Image;
+    type Pipeline;
+    type DescriptorSet;
+    type QueryPool;
+    type CmdBuf: CmdBuf<Self>;
+    type Fence;
+    type Semaphore;
+    type PipelineBuilder: PipelineBuilder<Self>;
+    type DescriptorSetBuilder: DescriptorSetBuilder<Self>;
+    type Sampler;
+    type ShaderSource: ?Sized;
+
+    /// Query the GPU info.
+    ///
+    /// This method may be expensive, so the hub should call it once and retain
+    /// the info.
+    fn query_gpu_info(&self) -> GpuInfo;
+
+    fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error>;
+
+    /// Destroy a buffer.
+    ///
+    /// The same safety requirements hold as in Vulkan: the buffer cannot be used
+    /// after this call, and all commands referencing this buffer must have completed.
+    ///
+    /// Maybe doesn't need result return?
+    unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>;
+
+    unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Self::Image, Error>;
+
+    /// Destroy an image.
+    ///
+    /// The same safety requirements hold as in Vulkan: the image cannot be used
+    /// after this call, and all commands referencing this image must have completed.
+    ///
+    /// Use this only with images we created, not for swapchain images.
+    ///
+    /// Maybe doesn't need result return?
+    unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>;
+
+    /// Start building a pipeline.
+    ///
+    /// A pipeline is a bit of shader IR plus a signature for what kinds of resources
+    /// it expects.
+    unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder;
+
+    /// Start building a descriptor set.
+    ///
+    /// A descriptor set is a binding of resources for a given pipeline.
+    unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder;
+
+    /// Create a simple compute pipeline that operates on buffers and storage images.
+    ///
+    /// This is provided as a convenience but will probably go away, as the functionality
+    /// is subsumed by the builder.
+    unsafe fn create_simple_compute_pipeline(
+        &self,
+        code: &Self::ShaderSource,
+        n_buffers: u32,
+        n_images: u32,
+    ) -> Result<Self::Pipeline, Error> {
+        let mut builder = self.pipeline_builder();
+        builder.add_buffers(n_buffers);
+        builder.add_images(n_images);
+        builder.create_compute_pipeline(self, code)
+    }
+
+    /// Create a descriptor set for a given pipeline, binding buffers and images.
+    ///
+    /// This is provided as a convenience but will probably go away, as the functionality
+    /// is subsumed by the builder.
+    unsafe fn create_descriptor_set(
+        &self,
+        pipeline: &Self::Pipeline,
+        bufs: &[&Self::Buffer],
+        images: &[&Self::Image],
+    ) -> Result<Self::DescriptorSet, Error> {
+        let mut builder = self.descriptor_set_builder();
+        builder.add_buffers(bufs);
+        builder.add_images(images);
+        builder.build(self, pipeline)
+    }
+
+    fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error>;
+
+    fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error>;
+
+    /// Get results from query pool, destroying it in the process.
+    ///
+    /// The returned vector is one less than the number of queries; the first is used as
+    /// a baseline.
+    ///
+    /// # Safety
+    /// All submitted commands that refer to this query pool must have completed.
+    unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error>;
+
+    unsafe fn run_cmd_bufs(
+        &self,
+        cmd_buf: &[&Self::CmdBuf],
+        wait_semaphores: &[&Self::Semaphore],
+        signal_semaphores: &[&Self::Semaphore],
+        fence: Option<&mut Self::Fence>,
+    ) -> Result<(), Error>;
+
+    /// Copy data from the buffer to memory.
+    ///
+    /// Discussion question: add offset?
+    ///
+    /// # Safety
+    ///
+    /// The buffer must be valid to access. The destination memory must be valid to
+    /// write to. The ranges must not overlap. The offset + size must be within
+    /// the buffer's allocation, and size within the destination.
+    unsafe fn read_buffer(
+        &self,
+        buffer: &Self::Buffer,
+        dst: *mut u8,
+        offset: u64,
+        size: u64,
+    ) -> Result<(), Error>;
+
+    /// Copy data from memory to the buffer.
+    ///
+    /// # Safety
+    ///
+    /// The buffer must be valid to access. The source memory must be valid to
+    /// read from. The ranges must not overlap. The offset + size must be within
+    /// the buffer's allocation, and size within the source.
+    unsafe fn write_buffer(
+        &self,
+        buffer: &Self::Buffer,
+        contents: *const u8,
+        offset: u64,
+        size: u64,
+    ) -> Result<(), Error>;
+
+    unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error>;
+    unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error>;
+    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>;
+    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error>;
+
+    unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
+}
+
+pub trait CmdBuf<D: Device> {
+    unsafe fn begin(&mut self);
+
+    unsafe fn finish(&mut self);
+
+    unsafe fn dispatch(
+        &mut self,
+        pipeline: &D::Pipeline,
+        descriptor_set: &D::DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    );
+
+    /// Insert an execution and memory barrier.
+    ///
+    /// Compute kernels (and other actions) after this barrier may read from buffers
+    /// that were written before this barrier.
+    unsafe fn memory_barrier(&mut self);
+
+    /// Insert a barrier for host access to buffers.
+    ///
+    /// The host may read buffers written before this barrier, after the fence for
+    /// the command buffer is signaled.
+    ///
+    /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
+    /// ("Host memory reads") for an explanation of this barrier.
+    unsafe fn host_barrier(&mut self);
+
+    unsafe fn image_barrier(
+        &mut self,
+        image: &D::Image,
+        src_layout: ImageLayout,
+        dst_layout: ImageLayout,
+    );
+
+    /// Clear the buffer.
+    ///
+    /// This is readily supported in Vulkan, but for portability it is remarkably
+    /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
+    /// kernel, or organize the code not to need it.
+    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
+
+    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
+
+    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
+
+    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
+
+    // low portability, dx12 doesn't support it natively
+    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
+
+    /// Reset the query pool.
+    ///
+    /// The query pool must be reset before each use, to avoid validation errors.
+    /// This is annoying, and we could tweak the API to make it implicit, doing
+    /// the reset before the first timestamp write.
+    unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool);
+
+    unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32);
+
+    /// Prepare the timestamps for reading. This isn't required on Vulkan but
+    /// is required on (at least) DX12.
+    unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
+}
+
+/// A builder for pipelines with more complex layouts.
+pub trait PipelineBuilder<D: Device> {
+    /// Add buffers to the pipeline. Each has its own binding.
+    fn add_buffers(&mut self, n_buffers: u32);
+    /// Add storage images to the pipeline. Each has its own binding.
+    fn add_images(&mut self, n_images: u32);
+    /// Add a binding with a variable-size array of textures.
+    fn add_textures(&mut self, max_textures: u32);
+    unsafe fn create_compute_pipeline(
+        self,
+        device: &D,
+        code: &D::ShaderSource,
+    ) -> Result<D::Pipeline, Error>;
+}
+
+/// A builder for descriptor sets with more complex layouts.
+///
+/// Note: the order needs to match the pipeline building, and it also needs to
+/// be buffers, then images, then textures.
+pub trait DescriptorSetBuilder<D: Device> {
+    fn add_buffers(&mut self, buffers: &[&D::Buffer]);
+    /// Add an array of storage images.
+    ///
+    /// The images need to be in `ImageLayout::General` layout.
+    fn add_images(&mut self, images: &[&D::Image]);
+    /// Add an array of textures.
+    ///
+    /// The images need to be in `ImageLayout::ShaderRead` layout.
+    ///
+    /// The same sampler is used for all textures, which is not very sophisticated;
+    /// we should have a way to vary the sampler.
+    fn add_textures(&mut self, images: &[&D::Image]);
+    unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
+}
diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index cf9a0e3..67b751d 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -224,7 +224,7 @@ impl Dx12Instance {
     }
 }
 
-impl crate::Device for Dx12Device {
+impl crate::backend::Device for Dx12Device {
     type Buffer = Buffer;
 
     type Image = Image;
@@ -413,7 +413,7 @@ impl crate::Device for Dx12Device {
         Ok(())
     }
 
-    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error> {
+    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
         let fence_val = fence.fence.get_value();
         Ok(fence_val == fence.val.get())
     }
@@ -451,7 +451,7 @@ impl Dx12Device {
     }
 }
 
-impl crate::CmdBuf<Dx12Device> for CmdBuf {
+impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
     unsafe fn begin(&mut self) {}
 
     unsafe fn finish(&mut self) {
@@ -468,7 +468,8 @@ impl crate::CmdBuf<Dx12Device> for CmdBuf {
         &mut self,
         pipeline: &Pipeline,
         descriptor_set: &DescriptorSet,
-        size: (u32, u32, u32),
+        workgroup_count: (u32, u32, u32),
+        _workgroup_size: (u32, u32, u32),
     ) {
         self.c.set_pipeline_state(&pipeline.pipeline_state);
         self.c
@@ -478,7 +479,8 @@ impl crate::CmdBuf<Dx12Device> for CmdBuf {
             0,
             descriptor_set.0.get_gpu_descriptor_handle_at_offset(0),
         );
-        self.c.dispatch(size.0, size.1, size.2);
+        self.c
+            .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
     }
 
     unsafe fn memory_barrier(&mut self) {
@@ -554,7 +556,7 @@ impl crate::CmdBuf<Dx12Device> for CmdBuf {
     }
 }
 
-impl crate::PipelineBuilder<Dx12Device> for PipelineBuilder {
+impl crate::backend::PipelineBuilder<Dx12Device> for PipelineBuilder {
     fn add_buffers(&mut self, n_buffers: u32) {
         if n_buffers != 0 {
             self.ranges.push(d3d12::D3D12_DESCRIPTOR_RANGE {
@@ -630,7 +632,7 @@ impl crate::PipelineBuilder<Dx12Device> for PipelineBuilder {
     }
 }
 
-impl crate::DescriptorSetBuilder<Dx12Device> for DescriptorSetBuilder {
+impl crate::backend::DescriptorSetBuilder<Dx12Device> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         // Note: we could get rid of the clone here (which is an AddRef)
         // and store a raw pointer, as it's a safety precondition that
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index a53ef2f..8822b8a 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -123,7 +123,7 @@ impl Session {
         unsafe {
             let mut i = 0;
             while i < pending.len() {
-                if let Ok(true) = self.0.device.get_fence_status(&pending[i].fence) {
+                if let Ok(true) = self.0.device.get_fence_status(&mut pending[i].fence) {
                     let mut item = pending.swap_remove(i);
                     // TODO: wait is superfluous, can just reset
                     let _ = self.0.device.wait_and_reset(vec![&mut item.fence]);
@@ -295,6 +295,11 @@ impl Session {
     pub fn gpu_info(&self) -> &GpuInfo {
         &self.0.gpu_info
     }
+
+    /// Choose shader code from the available choices.
+    pub fn choose_shader<'a>(&self, spv: &'a [u8], hlsl: &'a str, msl: &'a str) -> ShaderCode<'a> {
+        self.0.device.choose_shader(spv, hlsl, msl)
+    }
 }
 
 impl CmdBuf {
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 6a66db6..b5bc8b5 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -4,6 +4,7 @@
 /// In time, it may go away and be replaced by either gfx-hal or wgpu.
 use bitflags::bitflags;
 
+pub mod backend;
 pub mod hub;
 
 #[macro_use]
@@ -26,9 +27,13 @@ mux_cfg! {
 #[cfg(target_os = "macos")]
 pub mod metal;
 
-/// This isn't great but is expedient.
+/// The common error type for the crate.
+///
+/// This keeps things imple and can be expanded later.
 pub type Error = Box<dyn std::error::Error>;
 
+pub use crate::backend::CmdBuf;
+
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum ImageLayout {
     Undefined,
@@ -92,248 +97,3 @@ pub struct SubgroupSize {
     min: u32,
     max: u32,
 }
-
-pub trait Device: Sized {
-    type Buffer: 'static;
-    type Image;
-    type Pipeline;
-    type DescriptorSet;
-    type QueryPool;
-    type CmdBuf: CmdBuf<Self>;
-    type Fence;
-    type Semaphore;
-    type PipelineBuilder: PipelineBuilder<Self>;
-    type DescriptorSetBuilder: DescriptorSetBuilder<Self>;
-    type Sampler;
-    type ShaderSource: ?Sized;
-
-    /// Query the GPU info.
-    ///
-    /// This method may be expensive, so the hub should call it once and retain
-    /// the info.
-    fn query_gpu_info(&self) -> GpuInfo;
-
-    fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error>;
-
-    /// Destroy a buffer.
-    ///
-    /// The same safety requirements hold as in Vulkan: the buffer cannot be used
-    /// after this call, and all commands referencing this buffer must have completed.
-    ///
-    /// Maybe doesn't need result return?
-    unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>;
-
-    unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Self::Image, Error>;
-
-    /// Destroy an image.
-    ///
-    /// The same safety requirements hold as in Vulkan: the image cannot be used
-    /// after this call, and all commands referencing this image must have completed.
-    ///
-    /// Use this only with images we created, not for swapchain images.
-    ///
-    /// Maybe doesn't need result return?
-    unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>;
-
-    /// Start building a pipeline.
-    ///
-    /// A pipeline is a bit of shader IR plus a signature for what kinds of resources
-    /// it expects.
-    unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder;
-
-    /// Start building a descriptor set.
-    ///
-    /// A descriptor set is a binding of resources for a given pipeline.
-    unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder;
-
-    /// Create a simple compute pipeline that operates on buffers and storage images.
-    ///
-    /// This is provided as a convenience but will probably go away, as the functionality
-    /// is subsumed by the builder.
-    unsafe fn create_simple_compute_pipeline(
-        &self,
-        code: &Self::ShaderSource,
-        n_buffers: u32,
-        n_images: u32,
-    ) -> Result<Self::Pipeline, Error> {
-        let mut builder = self.pipeline_builder();
-        builder.add_buffers(n_buffers);
-        builder.add_images(n_images);
-        builder.create_compute_pipeline(self, code)
-    }
-
-    /// Create a descriptor set for a given pipeline, binding buffers and images.
-    ///
-    /// This is provided as a convenience but will probably go away, as the functionality
-    /// is subsumed by the builder.
-    unsafe fn create_descriptor_set(
-        &self,
-        pipeline: &Self::Pipeline,
-        bufs: &[&Self::Buffer],
-        images: &[&Self::Image],
-    ) -> Result<Self::DescriptorSet, Error> {
-        let mut builder = self.descriptor_set_builder();
-        builder.add_buffers(bufs);
-        builder.add_images(images);
-        builder.build(self, pipeline)
-    }
-
-    fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error>;
-
-    fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error>;
-
-    /// Get results from query pool, destroying it in the process.
-    ///
-    /// The returned vector is one less than the number of queries; the first is used as
-    /// a baseline.
-    ///
-    /// # Safety
-    /// All submitted commands that refer to this query pool must have completed.
-    unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error>;
-
-    unsafe fn run_cmd_bufs(
-        &self,
-        cmd_buf: &[&Self::CmdBuf],
-        wait_semaphores: &[&Self::Semaphore],
-        signal_semaphores: &[&Self::Semaphore],
-        fence: Option<&mut Self::Fence>,
-    ) -> Result<(), Error>;
-
-    /// Copy data from the buffer to memory.
-    ///
-    /// Discussion question: add offset?
-    ///
-    /// # Safety
-    ///
-    /// The buffer must be valid to access. The destination memory must be valid to
-    /// write to. The ranges must not overlap. The offset + size must be within
-    /// the buffer's allocation, and size within the destination.
-    unsafe fn read_buffer(
-        &self,
-        buffer: &Self::Buffer,
-        dst: *mut u8,
-        offset: u64,
-        size: u64,
-    ) -> Result<(), Error>;
-
-    /// Copy data from memory to the buffer.
-    ///
-    /// # Safety
-    ///
-    /// The buffer must be valid to access. The source memory must be valid to
-    /// read from. The ranges must not overlap. The offset + size must be within
-    /// the buffer's allocation, and size within the source.
-    unsafe fn write_buffer(
-        &self,
-        buffer: &Self::Buffer,
-        contents: *const u8,
-        offset: u64,
-        size: u64,
-    ) -> Result<(), Error>;
-
-    unsafe fn create_semaphore(&self) -> Result<Self::Semaphore, Error>;
-    unsafe fn create_fence(&self, signaled: bool) -> Result<Self::Fence, Error>;
-    unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>;
-    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error>;
-
-    unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
-}
-
-pub trait CmdBuf<D: Device> {
-    unsafe fn begin(&mut self);
-
-    unsafe fn finish(&mut self);
-
-    unsafe fn dispatch(
-        &mut self,
-        pipeline: &D::Pipeline,
-        descriptor_set: &D::DescriptorSet,
-        size: (u32, u32, u32),
-    );
-
-    /// Insert an execution and memory barrier.
-    ///
-    /// Compute kernels (and other actions) after this barrier may read from buffers
-    /// that were written before this barrier.
-    unsafe fn memory_barrier(&mut self);
-
-    /// Insert a barrier for host access to buffers.
-    ///
-    /// The host may read buffers written before this barrier, after the fence for
-    /// the command buffer is signaled.
-    ///
-    /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
-    /// ("Host memory reads") for an explanation of this barrier.
-    unsafe fn host_barrier(&mut self);
-
-    unsafe fn image_barrier(
-        &mut self,
-        image: &D::Image,
-        src_layout: ImageLayout,
-        dst_layout: ImageLayout,
-    );
-
-    /// Clear the buffer.
-    ///
-    /// This is readily supported in Vulkan, but for portability it is remarkably
-    /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
-    /// kernel, or organize the code not to need it.
-    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
-
-    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
-
-    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
-
-    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
-
-    // low portability, dx12 doesn't support it natively
-    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
-
-    /// Reset the query pool.
-    ///
-    /// The query pool must be reset before each use, to avoid validation errors.
-    /// This is annoying, and we could tweak the API to make it implicit, doing
-    /// the reset before the first timestamp write.
-    unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool);
-
-    unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32);
-
-    /// Prepare the timestamps for reading. This isn't required on Vulkan but
-    /// is required on (at least) DX12.
-    unsafe fn finish_timestamps(&mut self, pool: &D::QueryPool) {}
-}
-
-/// A builder for pipelines with more complex layouts.
-pub trait PipelineBuilder<D: Device> {
-    /// Add buffers to the pipeline. Each has its own binding.
-    fn add_buffers(&mut self, n_buffers: u32);
-    /// Add storage images to the pipeline. Each has its own binding.
-    fn add_images(&mut self, n_images: u32);
-    /// Add a binding with a variable-size array of textures.
-    fn add_textures(&mut self, max_textures: u32);
-    unsafe fn create_compute_pipeline(
-        self,
-        device: &D,
-        code: &D::ShaderSource,
-    ) -> Result<D::Pipeline, Error>;
-}
-
-/// A builder for descriptor sets with more complex layouts.
-///
-/// Note: the order needs to match the pipeline building, and it also needs to
-/// be buffers, then images, then textures.
-pub trait DescriptorSetBuilder<D: Device> {
-    fn add_buffers(&mut self, buffers: &[&D::Buffer]);
-    /// Add an array of storage images.
-    ///
-    /// The images need to be in `ImageLayout::General` layout.
-    fn add_images(&mut self, images: &[&D::Image]);
-    /// Add an array of textures.
-    ///
-    /// The images need to be in `ImageLayout::ShaderRead` layout.
-    ///
-    /// The same sampler is used for all textures, which is not very sophisticated;
-    /// we should have a way to vary the sampler.
-    fn add_textures(&mut self, images: &[&D::Image]);
-    unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result<D::DescriptorSet, Error>;
-}
diff --git a/piet-gpu-hal/src/macros.rs b/piet-gpu-hal/src/macros.rs
index 1810a3c..3456342 100644
--- a/piet-gpu-hal/src/macros.rs
+++ b/piet-gpu-hal/src/macros.rs
@@ -117,9 +117,9 @@ macro_rules! mux_device_enum {
         $crate::mux_enum! {
             $(#[$outer])*
             pub enum $assoc_type {
-                Vk(<$crate::vulkan::VkDevice as $crate::Device>::$assoc_type),
-                Dx12(<$crate::dx12::Dx12Device as $crate::Device>::$assoc_type),
-                Mtl(<$crate::metal::MtlDevice as $crate::Device>::$assoc_type),
+                Vk(<$crate::vulkan::VkDevice as $crate::backend::Device>::$assoc_type),
+                Dx12(<$crate::dx12::Dx12Device as $crate::backend::Device>::$assoc_type),
+                Mtl(<$crate::metal::MtlDevice as $crate::backend::Device>::$assoc_type),
             }
         }
     }
@@ -154,3 +154,15 @@ macro_rules! mux_match {
         }
     };
 }
+
+/// A convenience macro for selecting a shader from included files.
+#[macro_export]
+macro_rules! include_shader {
+    ( $device:expr, $path_base:expr) => {
+        $device.choose_shader(
+            include_bytes!(concat!($path_base, ".spv")),
+            include_str!(concat!($path_base, ".hlsl")),
+            include_str!(concat!($path_base, ".msl")),
+        )
+    };
+}
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index 0b3f588..fcf4637 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -102,7 +102,11 @@ impl MtlInstance {
                 has_memory_model: false,
                 use_staging_buffers: use_staging_buffers,
             };
-            Ok(MtlDevice { device, cmd_queue, gpu_info })
+            Ok(MtlDevice {
+                device,
+                cmd_queue,
+                gpu_info,
+            })
         } else {
             Err("can't create system default Metal device".into())
         }
@@ -119,7 +123,7 @@ impl MtlInstance {
     }
 }
 
-impl crate::Device for MtlDevice {
+impl crate::backend::Device for MtlDevice {
     type Buffer = Buffer;
 
     type Image = Image;
@@ -282,9 +286,13 @@ impl crate::Device for MtlDevice {
         Ok(())
     }
 
-    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error> {
-        // fence need to be mutable here :/
-        todo!()
+    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
+        match fence {
+            Fence::Idle => Ok(true),
+            Fence::CmdBufPending(cmd_buf) => {
+                Ok(cmd_buf.status() == metal::MTLCommandBufferStatus::Completed)
+            }
+        }
     }
 
     unsafe fn create_sampler(&self, params: crate::SamplerParams) -> Result<Self::Sampler, Error> {
@@ -292,18 +300,17 @@ impl crate::Device for MtlDevice {
     }
 }
 
-impl crate::CmdBuf<MtlDevice> for CmdBuf {
-    unsafe fn begin(&mut self) {
-    }
+impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
+    unsafe fn begin(&mut self) {}
 
-    unsafe fn finish(&mut self) {
-    }
+    unsafe fn finish(&mut self) {}
 
     unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
         descriptor_set: &DescriptorSet,
-        size: (u32, u32, u32),
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
     ) {
         let encoder = self.cmd_buf.new_compute_command_encoder();
         encoder.set_compute_pipeline_state(&pipeline.0);
@@ -313,19 +320,17 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
             ix += 1;
         }
         // TODO: set images
-        let work_group_count = metal::MTLSize {
-            width: size.0 as u64,
-            height: size.1 as u64,
-            depth: size.2 as u64,
+        let workgroup_count = metal::MTLSize {
+            width: workgroup_count.0 as u64,
+            height: workgroup_count.1 as u64,
+            depth: workgroup_count.2 as u64,
         };
-        // TODO: we need to pass this in explicitly. In gfx-hal, this is parsed from
-        // the spv before translation.
-        let work_group_size = metal::MTLSize {
-            width: 1,
-            height: 1,
-            depth: 1,
+        let workgroup_size = metal::MTLSize {
+            width: workgroup_size.0 as u64,
+            height: workgroup_size.1 as u64,
+            depth: workgroup_size.2 as u64,
         };
-        encoder.dispatch_thread_groups(work_group_count, work_group_size);
+        encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
         encoder.end_encoding();
     }
 
@@ -334,8 +339,7 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
         // Metal's own tracking.
     }
 
-    unsafe fn host_barrier(&mut self) {
-    }
+    unsafe fn host_barrier(&mut self) {}
 
     unsafe fn image_barrier(
         &mut self,
@@ -366,9 +370,7 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
         todo!()
     }
 
-    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
-        
-    }
+    unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
 
     unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
         // TODO
@@ -377,17 +379,15 @@ impl crate::CmdBuf<MtlDevice> for CmdBuf {
     }
 }
 
-impl crate::PipelineBuilder<MtlDevice> for PipelineBuilder {
+impl crate::backend::PipelineBuilder<MtlDevice> for PipelineBuilder {
     fn add_buffers(&mut self, _n_buffers: u32) {
         // My understanding is that Metal infers the pipeline layout from
         // the source.
     }
 
-    fn add_images(&mut self, _n_images: u32) {
-    }
+    fn add_images(&mut self, _n_images: u32) {}
 
-    fn add_textures(&mut self, _max_textures: u32) {
-    }
+    fn add_textures(&mut self, _max_textures: u32) {}
 
     unsafe fn create_compute_pipeline(
         self,
@@ -399,12 +399,14 @@ impl crate::PipelineBuilder<MtlDevice> for PipelineBuilder {
         let library = device.device.new_library_with_source(code, &options)?;
         // This seems to be the default name from spirv-cross, but we may need to tweak.
         let function = library.get_function("main0", None)?;
-        let pipeline = device.device.new_compute_pipeline_state_with_function(&function)?;
+        let pipeline = device
+            .device
+            .new_compute_pipeline_state_with_function(&function)?;
         Ok(Pipeline(pipeline))
     }
 }
 
-impl crate::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
+impl crate::backend::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.0.buffers.extend(buffers.iter().copied().cloned());
     }
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index e2c0988..7a83ae9 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -30,10 +30,10 @@ mux_cfg! {
     #[cfg(mtl)]
     use crate::metal;
 }
-use crate::CmdBuf as CmdBufTrait;
-use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait;
-use crate::Device as DeviceTrait;
-use crate::PipelineBuilder as PipelineBuilderTrait;
+use crate::backend::CmdBuf as CmdBufTrait;
+use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
+use crate::backend::Device as DeviceTrait;
+use crate::backend::PipelineBuilder as PipelineBuilderTrait;
 use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
 
 mux_enum! {
@@ -255,11 +255,11 @@ impl Device {
         }
     }
 
-    pub unsafe fn get_fence_status(&self, fence: &Fence) -> Result<bool, Error> {
+    pub unsafe fn get_fence_status(&self, fence: &mut Fence) -> Result<bool, Error> {
         mux_match! { self;
-            Device::Vk(d) => d.get_fence_status(fence.vk()),
-            Device::Dx12(d) => d.get_fence_status(fence.dx12()),
-            Device::Mtl(d) => d.get_fence_status(fence.mtl()),
+            Device::Vk(d) => d.get_fence_status(fence.vk_mut()),
+            Device::Dx12(d) => d.get_fence_status(fence.dx12_mut()),
+            Device::Mtl(d) => d.get_fence_status(fence.mtl_mut()),
         }
     }
 
@@ -400,6 +400,15 @@ impl Device {
             Device::Mtl(d) => d.write_buffer(buffer.mtl(), contents, offset, size),
         }
     }
+
+    /// Choose shader code from the available choices.
+    pub fn choose_shader<'a>(&self, _spv: &'a [u8], _hlsl: &'a str, _msl: &'a str) -> ShaderCode<'a> {
+        mux_match! { self;
+            Device::Vk(_d) => ShaderCode::Spv(_spv),
+            Device::Dx12(_d) => ShaderCode::Hlsl(_hlsl),
+            Device::Mtl(_d) => ShaderCode::Msl(_msl),
+        }
+    }
 }
 
 impl PipelineBuilder {
@@ -578,16 +587,25 @@ impl CmdBuf {
         }
     }
 
+    /// Dispatch a compute shader.
+    ///
+    /// Note that both the number of workgroups (`workgroup_count`) and the number of
+    /// threads in a workgroup (`workgroup_size`) are given. The latter is needed on
+    /// Metal, while it's baked into the shader on Vulkan and DX12.
+    ///
+    /// Perhaps we'll have a mechanism to plumb the latter value to configure the size
+    /// of a workgroup using specialization constants in the future.
     pub unsafe fn dispatch(
         &mut self,
         pipeline: &Pipeline,
         descriptor_set: &DescriptorSet,
-        size: (u32, u32, u32),
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
     ) {
         mux_match! { self;
-            CmdBuf::Vk(c) => c.dispatch(pipeline.vk(), descriptor_set.vk(), size),
-            CmdBuf::Dx12(c) => c.dispatch(pipeline.dx12(), descriptor_set.dx12(), size),
-            CmdBuf::Mtl(c) => c.dispatch(pipeline.mtl(), descriptor_set.mtl(), size),
+            CmdBuf::Vk(c) => c.dispatch(pipeline.vk(), descriptor_set.vk(), workgroup_count, workgroup_size),
+            CmdBuf::Dx12(c) => c.dispatch(pipeline.dx12(), descriptor_set.dx12(), workgroup_count, workgroup_size),
+            CmdBuf::Mtl(c) => c.dispatch(pipeline.mtl(), descriptor_set.mtl(), workgroup_count, workgroup_size),
         }
     }
 
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 8cdddcd..1084505 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -13,8 +13,10 @@ use ash::{vk, Device, Entry, Instance};
 use smallvec::SmallVec;
 
 use crate::{
-    BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
+    BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
 };
+use crate::backend::Device as DeviceTrait;
+
 
 pub struct VkInstance {
     /// Retain the dynamic lib.
@@ -455,7 +457,7 @@ impl VkInstance {
     }
 }
 
-impl crate::Device for VkDevice {
+impl crate::backend::Device for VkDevice {
     type Buffer = Buffer;
     type Image = Image;
     type CmdBuf = CmdBuf;
@@ -621,16 +623,13 @@ impl crate::Device for VkDevice {
 
     unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error> {
         let device = &self.device.device;
-        let fences = fences
-            .iter()
-            .map(|f| **f)
-            .collect::<SmallVec<[_; 4]>>();
+        let fences = fences.iter().map(|f| **f).collect::<SmallVec<[_; 4]>>();
         device.wait_for_fences(&fences, true, !0)?;
         device.reset_fences(&fences)?;
         Ok(())
     }
 
-    unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result<bool, Error> {
+    unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result<bool, Error> {
         let device = &self.device.device;
         Ok(device.get_fence_status(*fence)?)
     }
@@ -843,7 +842,8 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
         &mut self,
         pipeline: &Pipeline,
         descriptor_set: &DescriptorSet,
-        size: (u32, u32, u32),
+        workgroup_count: (u32, u32, u32),
+        _workgroup_size: (u32, u32, u32),
     ) {
         let device = &self.device.device;
         device.cmd_bind_pipeline(
@@ -859,7 +859,12 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
             &[descriptor_set.descriptor_set],
             &[],
         );
-        device.cmd_dispatch(self.cmd_buf, size.0, size.1, size.2);
+        device.cmd_dispatch(
+            self.cmd_buf,
+            workgroup_count.0,
+            workgroup_count.1,
+            workgroup_count.2,
+        );
     }
 
     /// Insert a pipeline barrier for all memory accesses.
@@ -1047,7 +1052,7 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
     }
 }
 
-impl crate::PipelineBuilder<VkDevice> for PipelineBuilder {
+impl crate::backend::PipelineBuilder<VkDevice> for PipelineBuilder {
     fn add_buffers(&mut self, n_buffers: u32) {
         let start = self.bindings.len() as u32;
         for i in 0..n_buffers {
@@ -1153,7 +1158,7 @@ impl crate::PipelineBuilder<VkDevice> for PipelineBuilder {
     }
 }
 
-impl crate::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
+impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
     fn add_buffers(&mut self, buffers: &[&Buffer]) {
         self.buffers.extend(buffers.iter().map(|b| b.buffer));
     }
@@ -1307,7 +1312,11 @@ impl VkSwapchain {
         image_idx: usize,
         semaphores: &[&vk::Semaphore],
     ) -> Result<bool, Error> {
-        let semaphores = semaphores.iter().copied().copied().collect::<SmallVec<[_; 4]>>();
+        let semaphores = semaphores
+            .iter()
+            .copied()
+            .copied()
+            .collect::<SmallVec<[_; 4]>>();
         Ok(self.swapchain_fn.queue_present(
             self.present_queue,
             &vk::PresentInfoKHR::builder()
diff --git a/piet-gpu/bin/android.rs b/piet-gpu/bin/android.rs
index 63ee91a..ce37503 100644
--- a/piet-gpu/bin/android.rs
+++ b/piet-gpu/bin/android.rs
@@ -12,7 +12,7 @@ use ndk::native_window::NativeWindow;
 use ndk_glue::Event;
 
 use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::{QueryPool, Instance, Surface, Swapchain};
+use piet_gpu_hal::mux::{Instance, QueryPool, Surface, Swapchain};
 use piet_gpu_hal::{CmdBuf, Error, ImageLayout};
 
 use piet_gpu::{render_scene, PietGpuRenderContext, Renderer};
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 2b8ed0f..b815655 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -403,6 +403,7 @@ impl Renderer {
             &self.el_pipeline,
             &self.el_ds,
             (((self.n_elements + 127) / 128) as u32, 1, 1),
+            (128, 1, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 1);
         cmd_buf.memory_barrier();
@@ -410,6 +411,7 @@ impl Renderer {
             &self.tile_pipeline,
             &self.tile_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
+            (256, 1, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 2);
         cmd_buf.memory_barrier();
@@ -417,6 +419,7 @@ impl Renderer {
             &self.path_pipeline,
             &self.path_ds,
             (((self.n_pathseg + 31) / 32) as u32, 1, 1),
+            (32, 1, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 3);
         cmd_buf.memory_barrier();
@@ -424,6 +427,7 @@ impl Renderer {
             &self.backdrop_pipeline,
             &self.backdrop_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
+            (256, 1, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 4);
         // Note: this barrier is not needed as an actual dependency between
@@ -434,6 +438,7 @@ impl Renderer {
             &self.bin_pipeline,
             &self.bin_ds,
             (((self.n_paths + 255) / 256) as u32, 1, 1),
+            (256, 1, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 5);
         cmd_buf.memory_barrier();
@@ -441,6 +446,7 @@ impl Renderer {
             &self.coarse_pipeline,
             &self.coarse_ds,
             ((WIDTH as u32 + 255) / 256, (HEIGHT as u32 + 255) / 256, 1),
+            (256, 256, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 6);
         cmd_buf.memory_barrier();
@@ -448,6 +454,7 @@ impl Renderer {
             &self.k4_pipeline,
             &self.k4_ds,
             ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
+            (8, 4, 1),
         );
         cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();