Merge pull request #126 from linebender/dx12

Improve DX12 portability
Raph Levien 2021-11-12 15:24:24 -08:00 committed by GitHub
commit 4c42da2d46
50 changed files with 1148 additions and 908 deletions

Cargo.lock generated

File diff suppressed because it is too large.

@ -1,8 +1,8 @@
use piet_gpu_hal::include_shader;
use piet_gpu_hal::{BufferUsage, Instance, Session};
use piet_gpu_hal::{include_shader, BindType};
use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
fn main() {
let (instance, _) = Instance::new(None).unwrap();
let (instance, _) = Instance::new(None, InstanceFlags::empty()).unwrap();
unsafe {
let device = instance.device(None).unwrap();
let session = Session::new(device);
@ -10,7 +10,9 @@ fn main() {
let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
let buffer = session.create_buffer_init(&src, usage).unwrap();
let code = include_shader!(&session, "./shader/gen/collatz");
let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap();
let pipeline = session
.create_compute_pipeline(code, &[BindType::Buffer])
.unwrap();
let descriptor_set = session
.create_simple_descriptor_set(&pipeline, &[&buffer])
.unwrap();
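Note: putting the pieces of this hunk together, the updated collatz example reads roughly as below. This is a sketch; the buffer-usage flags are assumed from the surrounding example, not shown in the hunk.

```rust
use piet_gpu_hal::{include_shader, BindType, BufferUsage, Instance, InstanceFlags, Session};

fn main() {
    // InstanceFlags::empty() keeps the default backend order (Vulkan first).
    let (instance, _) = Instance::new(None, InstanceFlags::empty()).unwrap();
    unsafe {
        let device = instance.device(None).unwrap();
        let session = Session::new(device);
        // Assumed flags: the example maps the buffer for readback.
        let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE;
        let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
        let buffer = session.create_buffer_init(&src, usage).unwrap();
        let code = include_shader!(&session, "./shader/gen/collatz");
        // The pipeline builder is gone: one BindType per binding, in binding order.
        let pipeline = session
            .create_compute_pipeline(code, &[BindType::Buffer])
            .unwrap();
        let descriptor_set = session
            .create_simple_descriptor_set(&pipeline, &[&buffer])
            .unwrap();
        // ... record a dispatch and read the results back, as before.
    }
}
```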


@ -4,6 +4,7 @@
glslang_validator = glslangValidator
spirv_cross = spirv-cross
dxc = dxc
rule glsl
command = $glslang_validator -V -o $out $in
@ -11,9 +12,13 @@ rule glsl
rule hlsl
command = $spirv_cross --hlsl $in --output $out
rule dxil
command = $dxc -T cs_6_0 $in -Fo $out
rule msl
command = $spirv_cross --msl $in --output $out
build gen/collatz.spv: glsl collatz.comp
build gen/collatz.hlsl: hlsl gen/collatz.spv
build gen/collatz.dxil: dxil gen/collatz.hlsl
build gen/collatz.msl: msl gen/collatz.spv
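Note: the new dxil rule extends the toolchain to GLSL to SPIR-V to HLSL to DXIL, alongside the existing SPIR-V to MSL path. For reference, the same steps driven from Rust; a minimal sketch assuming glslangValidator, spirv-cross and dxc are on PATH (hypothetical helper, not part of this PR):

```rust
use std::process::Command;

// Hypothetical helper mirroring the ninja rules above.
fn build_collatz() -> std::io::Result<()> {
    // GLSL -> SPIR-V
    Command::new("glslangValidator")
        .args(["-V", "-o", "gen/collatz.spv", "collatz.comp"])
        .status()?;
    // SPIR-V -> HLSL
    Command::new("spirv-cross")
        .args(["--hlsl", "gen/collatz.spv", "--output", "gen/collatz.hlsl"])
        .status()?;
    // HLSL -> DXIL; cs_6_0 matches the dxil rule above
    Command::new("dxc")
        .args(["-T", "cs_6_0", "gen/collatz.hlsl", "-Fo", "gen/collatz.dxil"])
        .status()?;
    // SPIR-V -> MSL
    Command::new("spirv-cross")
        .args(["--msl", "gen/collatz.spv", "--output", "gen/collatz.msl"])
        .status()?;
    Ok(())
}
```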

Binary file not shown.


@ -16,7 +16,7 @@
//! The generic trait for backends to implement.
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
pub trait Device: Sized {
type Buffer: 'static;
@ -27,7 +27,6 @@ pub trait Device: Sized {
type CmdBuf: CmdBuf<Self>;
type Fence;
type Semaphore;
type PipelineBuilder: PipelineBuilder<Self>;
type DescriptorSetBuilder: DescriptorSetBuilder<Self>;
type Sampler;
type ShaderSource: ?Sized;
@ -60,33 +59,21 @@ pub trait Device: Sized {
/// Maybe doesn't need result return?
unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>;
/// Start building a pipeline.
/// Build a compute pipeline.
///
/// A pipeline is a bit of shader IR plus a signature for what kinds of resources
/// it expects.
unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder;
unsafe fn create_compute_pipeline(
&self,
code: &Self::ShaderSource,
bind_types: &[BindType],
) -> Result<Self::Pipeline, Error>;
/// Start building a descriptor set.
///
/// A descriptor set is a binding of resources for a given pipeline.
unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder;
/// Create a simple compute pipeline that operates on buffers and storage images.
///
/// This is provided as a convenience but will probably go away, as the functionality
/// is subsumed by the builder.
unsafe fn create_simple_compute_pipeline(
&self,
code: &Self::ShaderSource,
n_buffers: u32,
n_images: u32,
) -> Result<Self::Pipeline, Error> {
let mut builder = self.pipeline_builder();
builder.add_buffers(n_buffers);
builder.add_images(n_images);
builder.create_compute_pipeline(self, code)
}
/// Create a descriptor set for a given pipeline, binding buffers and images.
///
/// This is provided as a convenience but will probably go away, as the functionality
@ -236,21 +223,6 @@ pub trait CmdBuf<D: Device> {
unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
}
/// A builder for pipelines with more complex layouts.
pub trait PipelineBuilder<D: Device> {
/// Add buffers to the pipeline. Each has its own binding.
fn add_buffers(&mut self, n_buffers: u32);
/// Add storage images to the pipeline. Each has its own binding.
fn add_images(&mut self, n_images: u32);
/// Add a binding with a variable-size array of textures.
fn add_textures(&mut self, max_textures: u32);
unsafe fn create_compute_pipeline(
self,
device: &D,
code: &D::ShaderSource,
) -> Result<D::Pipeline, Error>;
}
/// A builder for descriptor sets with more complex layouts.
///
/// Note: the order needs to match the pipeline building, and it also needs to
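Note: the net effect of this trait change is that the old builder call sequence maps directly onto a slice of BindType. A sketch of the correspondence, using the public Session wrapper from hub.rs (hypothetical helper):

```rust
use piet_gpu_hal::{BindType, Error, Pipeline, Session, ShaderCode};

// Old: session.pipeline_builder().add_buffers(2).add_images(1)
//          .create_compute_pipeline(&session, code)
// New: one BindType entry per binding, in binding order.
unsafe fn two_buffers_one_image(
    session: &Session,
    code: ShaderCode,
) -> Result<Pipeline, Error> {
    session.create_compute_pipeline(
        code,
        &[BindType::Buffer, BindType::Buffer, BindType::Image],
    )
}
```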


@ -3,17 +3,19 @@
mod error;
mod wrappers;
use std::{cell::Cell, convert::TryInto, mem, ptr};
use std::{cell::Cell, convert::{TryFrom, TryInto}, mem, ptr};
use winapi::shared::minwindef::TRUE;
use winapi::shared::{dxgi, dxgi1_2, dxgi1_3, dxgitype};
use winapi::shared::{dxgi, dxgi1_2, dxgitype};
#[allow(unused)]
use winapi::shared::dxgi1_3; // for error reporting in debug mode
use winapi::um::d3d12;
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use smallvec::SmallVec;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};
use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};
@ -83,13 +85,6 @@ pub struct Fence {
/// semaphore is needed for presentation on DX12.
pub struct Semaphore;
#[derive(Default)]
pub struct PipelineBuilder {
ranges: Vec<d3d12::D3D12_DESCRIPTOR_RANGE>,
n_uav: u32,
// TODO: add counters for other resource types
}
// TODO
#[derive(Default)]
pub struct DescriptorSetBuilder {
@ -239,14 +234,13 @@ impl crate::backend::Device for Dx12Device {
type Semaphore = Semaphore;
type PipelineBuilder = PipelineBuilder;
type DescriptorSetBuilder = DescriptorSetBuilder;
type Sampler = ();
// Currently this is HLSL source, but we'll probably change it to IR.
type ShaderSource = str;
// Currently due to type inflexibility this is hardcoded to either HLSL or
// DXIL, but it would be nice to be able to handle both at runtime.
type ShaderSource = [u8];
fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Self::Buffer, Error> {
// TODO: consider supporting BufferUsage::QUERY_RESOLVE here rather than
@ -289,9 +283,7 @@ impl crate::backend::Device for Dx12Device {
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
let list_type = d3d12::D3D12_COMMAND_LIST_TYPE_DIRECT;
let allocator =
unsafe { self.device.create_command_allocator(list_type)? }
;
let allocator = unsafe { self.device.create_command_allocator(list_type)? };
let node_mask = 0;
unsafe {
let c = self
@ -420,8 +412,94 @@ impl crate::backend::Device for Dx12Device {
self.gpu_info.clone()
}
unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder {
PipelineBuilder::default()
unsafe fn create_compute_pipeline(
&self,
code: &Self::ShaderSource,
bind_types: &[BindType],
) -> Result<Pipeline, Error> {
if u32::try_from(bind_types.len()).is_err() {
panic!("bind type length overflow");
}
let mut ranges = Vec::new();
let mut i = 0;
fn map_range_type(bind_type: BindType) -> d3d12::D3D12_DESCRIPTOR_RANGE_TYPE {
match bind_type {
BindType::Buffer | BindType::Image | BindType::ImageRead => d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_UAV,
BindType::BufReadOnly => d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_SRV,
}
}
while i < bind_types.len() {
let range_type = map_range_type(bind_types[i]);
let mut end = i + 1;
while end < bind_types.len() && map_range_type(bind_types[end]) == range_type {
end += 1;
}
let n_descriptors = (end - i) as u32;
ranges.push(d3d12::D3D12_DESCRIPTOR_RANGE {
RangeType: range_type,
NumDescriptors: n_descriptors,
BaseShaderRegister: i as u32,
RegisterSpace: 0,
OffsetInDescriptorsFromTableStart: d3d12::D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND,
});
i = end;
}
// We could always have ShaderSource as [u8] even when it's HLSL, and use the
// magic number to distinguish. In any case, for now it's hardcoded as one or
// the other.
/*
// HLSL code path
#[cfg(debug_assertions)]
let flags = winapi::um::d3dcompiler::D3DCOMPILE_DEBUG
| winapi::um::d3dcompiler::D3DCOMPILE_SKIP_OPTIMIZATION;
#[cfg(not(debug_assertions))]
let flags = 0;
let shader_blob = ShaderByteCode::compile(code, "cs_5_1", "main", flags)?;
let shader = ShaderByteCode::from_blob(shader_blob);
*/
// DXIL code path
let shader = ShaderByteCode::from_slice(code);
let mut root_parameter = d3d12::D3D12_ROOT_PARAMETER {
ParameterType: d3d12::D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE,
ShaderVisibility: d3d12::D3D12_SHADER_VISIBILITY_ALL,
..mem::zeroed()
};
*root_parameter.u.DescriptorTable_mut() = d3d12::D3D12_ROOT_DESCRIPTOR_TABLE {
NumDescriptorRanges: ranges.len() as u32,
pDescriptorRanges: ranges.as_ptr(),
};
let root_signature_desc = d3d12::D3D12_ROOT_SIGNATURE_DESC {
NumParameters: 1,
pParameters: &root_parameter,
NumStaticSamplers: 0,
pStaticSamplers: ptr::null(),
Flags: d3d12::D3D12_ROOT_SIGNATURE_FLAG_NONE,
};
let root_signature_blob = wrappers::RootSignature::serialize_description(
&root_signature_desc,
d3d12::D3D_ROOT_SIGNATURE_VERSION_1,
)?;
let root_signature = self
.device
.create_root_signature(0, root_signature_blob)?;
let desc = d3d12::D3D12_COMPUTE_PIPELINE_STATE_DESC {
pRootSignature: root_signature.0.as_raw(),
CS: shader.bytecode,
NodeMask: 0,
CachedPSO: d3d12::D3D12_CACHED_PIPELINE_STATE {
pCachedBlob: ptr::null(),
CachedBlobSizeInBytes: 0,
},
Flags: d3d12::D3D12_PIPELINE_STATE_FLAG_NONE,
};
let pipeline_state = self.device.create_compute_pipeline_state(&desc)?;
Ok(Pipeline {
pipeline_state,
root_signature,
})
}
unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder {
@ -451,8 +529,7 @@ impl Dx12Device {
impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
unsafe fn begin(&mut self) {
if self.needs_reset {
}
if self.needs_reset {}
}
unsafe fn finish(&mut self) {
@ -559,86 +636,6 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
}
}
impl crate::backend::PipelineBuilder<Dx12Device> for PipelineBuilder {
fn add_buffers(&mut self, n_buffers: u32) {
// Note: if the buffer is readonly, then it needs to be bound
// as an SRV, not a UAV. I think that requires distinguishing
// readonly and read-write cases in pipeline and descriptor set
// creation. For now we punt.
if n_buffers != 0 {
self.ranges.push(d3d12::D3D12_DESCRIPTOR_RANGE {
RangeType: d3d12::D3D12_DESCRIPTOR_RANGE_TYPE_UAV,
NumDescriptors: n_buffers,
BaseShaderRegister: self.n_uav,
RegisterSpace: 0,
OffsetInDescriptorsFromTableStart: d3d12::D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND,
});
self.n_uav += n_buffers;
}
}
fn add_images(&mut self, n_images: u32) {
// These are UAV images, so the descriptor type is the same as buffers.
self.add_buffers(n_images);
}
fn add_textures(&mut self, _max_textures: u32) {
todo!()
}
unsafe fn create_compute_pipeline(
self,
device: &Dx12Device,
code: &str,
) -> Result<Pipeline, Error> {
#[cfg(debug_assertions)]
let flags = winapi::um::d3dcompiler::D3DCOMPILE_DEBUG
| winapi::um::d3dcompiler::D3DCOMPILE_SKIP_OPTIMIZATION;
#[cfg(not(debug_assertions))]
let flags = 0;
let shader_blob = ShaderByteCode::compile(code, "cs_5_1", "main", flags)?;
let shader = ShaderByteCode::from_blob(shader_blob);
let mut root_parameter = d3d12::D3D12_ROOT_PARAMETER {
ParameterType: d3d12::D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE,
ShaderVisibility: d3d12::D3D12_SHADER_VISIBILITY_ALL,
..mem::zeroed()
};
*root_parameter.u.DescriptorTable_mut() = d3d12::D3D12_ROOT_DESCRIPTOR_TABLE {
NumDescriptorRanges: self.ranges.len().try_into()?,
pDescriptorRanges: self.ranges.as_ptr(),
};
let root_signature_desc = d3d12::D3D12_ROOT_SIGNATURE_DESC {
NumParameters: 1,
pParameters: &root_parameter,
NumStaticSamplers: 0,
pStaticSamplers: ptr::null(),
Flags: d3d12::D3D12_ROOT_SIGNATURE_FLAG_NONE,
};
let root_signature_blob = wrappers::RootSignature::serialize_description(
&root_signature_desc,
d3d12::D3D_ROOT_SIGNATURE_VERSION_1,
)?;
let root_signature = device
.device
.create_root_signature(0, root_signature_blob)?;
let desc = d3d12::D3D12_COMPUTE_PIPELINE_STATE_DESC {
pRootSignature: root_signature.0.as_raw(),
CS: shader.bytecode,
NodeMask: 0,
CachedPSO: d3d12::D3D12_CACHED_PIPELINE_STATE {
pCachedBlob: ptr::null(),
CachedBlobSizeInBytes: 0,
},
Flags: d3d12::D3D12_PIPELINE_STATE_FLAG_NONE,
};
let pipeline_state = device.device.create_compute_pipeline_state(&desc)?;
Ok(Pipeline {
pipeline_state,
root_signature,
})
}
}
impl crate::backend::DescriptorSetBuilder<Dx12Device> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) {
// Note: we could get rid of the clone here (which is an AddRef)
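Note on the new create_compute_pipeline: the while loop coalesces consecutive bindings of the same descriptor range type (UAV vs. SRV) into a single D3D12_DESCRIPTOR_RANGE, with each range's base shader register set to the index of its first binding. The same logic in isolation, with simplified types:

```rust
// Standalone sketch of the range-coalescing loop above.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RangeType {
    Uav,
    Srv,
}

#[derive(Debug, PartialEq)]
struct Range {
    range_type: RangeType,
    n_descriptors: u32,
    base_register: u32,
}

fn coalesce(types: &[RangeType]) -> Vec<Range> {
    let mut ranges = Vec::new();
    let mut i = 0;
    while i < types.len() {
        let range_type = types[i];
        // Extend the run while the range type stays the same.
        let mut end = i + 1;
        while end < types.len() && types[end] == range_type {
            end += 1;
        }
        ranges.push(Range {
            range_type,
            n_descriptors: (end - i) as u32,
            base_register: i as u32,
        });
        i = end;
    }
    ranges
}

// coalesce(&[Uav, Uav, Srv, Uav]) yields ranges of sizes 2, 1 and 1,
// with base registers 0, 2 and 3 respectively.
```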


@ -10,9 +10,7 @@ use crate::dx12::error::{self, error_if_failed_else_unit, explain_error, Error};
use std::convert::{TryFrom, TryInto};
use std::sync::atomic::{AtomicPtr, Ordering};
use std::{ffi, mem, ptr};
use winapi::shared::{
dxgi, dxgi1_2, dxgi1_3, dxgi1_4, dxgiformat, dxgitype, minwindef, windef,
};
use winapi::shared::{dxgi, dxgi1_2, dxgi1_3, dxgi1_4, dxgiformat, dxgitype, minwindef, windef};
use winapi::um::d3dcommon::ID3DBlob;
use winapi::um::{
d3d12, d3d12sdklayers, d3dcommon, d3dcompiler, dxgidebug, handleapi, synchapi, winnt,
@ -198,7 +196,7 @@ impl Factory4 {
error_if_failed_else_unit(self.0.EnumAdapters1(id, &mut adapter))?;
let mut desc = mem::zeroed();
(*adapter).GetDesc(&mut desc);
println!("desc: {:?}", desc.Description);
//println!("desc: {:?}", desc.Description);
Ok(Adapter1(ComPtr::from_raw(adapter)))
}
@ -278,6 +276,7 @@ impl SwapChain3 {
}
impl Blob {
#[allow(unused)]
pub unsafe fn print_to_console(blob: &Blob) {
println!("==SHADER COMPILE MESSAGES==");
let message = {
@ -563,7 +562,6 @@ impl Device {
Ok(QueryHeap(ComPtr::from_raw(query_heap)))
}
pub unsafe fn create_buffer(
&self,
buffer_size_in_bytes: u32,
@ -717,13 +715,13 @@ impl RootSignature {
let hresult =
d3d12::D3D12SerializeRootSignature(desc, version, &mut blob, &mut error_blob_ptr);
#[cfg(debug_assertions)]
{
let error_blob = if error_blob_ptr.is_null() {
None
} else {
Some(Blob(ComPtr::from_raw(error_blob_ptr)))
};
#[cfg(debug_assertions)]
{
if let Some(error_blob) = &error_blob {
Blob::print_to_console(error_blob);
}
@ -739,6 +737,7 @@ impl ShaderByteCode {
// `blob` may not be null.
// TODO: this is not super elegant, maybe want to move the get
// operations closer to where they're used.
#[allow(unused)]
pub unsafe fn from_blob(blob: Blob) -> ShaderByteCode {
ShaderByteCode {
bytecode: d3d12::D3D12_SHADER_BYTECODE {
@ -752,6 +751,7 @@ impl ShaderByteCode {
/// Compile a shader from raw HLSL.
///
/// * `target`: example format: `ps_5_1`.
#[allow(unused)]
pub unsafe fn compile(
source: &str,
target: &str,
@ -798,6 +798,24 @@ impl ShaderByteCode {
Ok(Blob(ComPtr::from_raw(shader_blob_ptr)))
}
/// Create bytecode from a slice.
///
/// # Safety
///
/// This call elides the lifetime from the slice. The caller is responsible
/// for making sure the reference remains valid for the lifetime of this
/// object.
#[allow(unused)]
pub unsafe fn from_slice(bytecode: &[u8]) -> ShaderByteCode {
ShaderByteCode {
bytecode: d3d12::D3D12_SHADER_BYTECODE {
BytecodeLength: bytecode.len(),
pShaderBytecode: bytecode.as_ptr() as *const _,
},
blob: None,
}
}
}
impl Fence {
@ -864,7 +882,11 @@ impl GraphicsCommandList {
explain_error(self.0.Close(), "error closing command list")
}
pub unsafe fn reset(&self, allocator: &CommandAllocator, initial_pso: Option<&PipelineState>) -> Result<(), Error> {
pub unsafe fn reset(
&self,
allocator: &CommandAllocator,
initial_pso: Option<&PipelineState>,
) -> Result<(), Error> {
let p_initial_state = initial_pso.map(|p| p.0.as_raw()).unwrap_or(ptr::null_mut());
error::error_if_failed_else_unit(self.0.Reset(allocator.0.as_raw(), p_initial_state))
}
@ -1072,9 +1094,8 @@ pub unsafe fn create_transition_resource_barrier(
resource_barrier
}
#[allow(unused)]
pub unsafe fn enable_debug_layer() -> Result<(), Error> {
println!("enabling debug layer.");
let mut debug_controller: *mut d3d12sdklayers::ID3D12Debug1 = ptr::null_mut();
explain_error(
d3d12::D3D12GetDebugInterface(
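Note on ShaderByteCode::from_slice: the D3D12_SHADER_BYTECODE holds a raw pointer into the slice, so the caller must keep the bytes alive for the lifetime of the object, as the safety comment says. A sketch of a sound call site (hypothetical; in this PR the DXIL is embedded with include_bytes!, so a 'static lifetime satisfies the contract):

```rust
// Bytes baked into the binary live for the whole program, so the raw
// pointer stored in the D3D12_SHADER_BYTECODE can never dangle.
static CLEAR_DXIL: &[u8] = include_bytes!("shader/gen/clear.dxil");

unsafe fn clear_bytecode() -> ShaderByteCode {
    ShaderByteCode::from_slice(CLEAR_DXIL)
}
```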


@ -11,9 +11,9 @@ use std::sync::{Arc, Mutex, Weak};
use smallvec::SmallVec;
use crate::mux;
use crate::{mux, BackendType};
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
pub use crate::mux::{DescriptorSet, Fence, Pipeline, QueryPool, Sampler, Semaphore, ShaderCode};
@ -100,12 +100,6 @@ struct BufferInner {
session: Weak<SessionInner>,
}
/// A builder for creating pipelines.
///
/// Configure the signature (buffers and images accessed) for a pipeline,
/// which is essentially compiled shader code, ready to be dispatched.
pub struct PipelineBuilder(mux::PipelineBuilder);
/// A builder for creating descriptor sets.
///
/// Add bindings to the descriptor set before dispatching a shader.
@ -316,26 +310,16 @@ impl Session {
self.0.device.create_semaphore()
}
/// This creates a pipeline that operates on some buffers and images.
///
/// The descriptor set layout is just some number of storage buffers
/// and storage images (this might change).
pub unsafe fn create_simple_compute_pipeline<'a>(
&self,
code: ShaderCode<'a>,
n_buffers: u32,
) -> Result<Pipeline, Error> {
self.pipeline_builder()
.add_buffers(n_buffers)
.create_compute_pipeline(self, code)
}
/// Start building a pipeline.
/// Create a compute shader pipeline.
///
/// A pipeline is essentially a compiled shader, with more specific
/// details about what resources may be bound to it.
pub unsafe fn pipeline_builder(&self) -> PipelineBuilder {
PipelineBuilder(self.0.device.pipeline_builder())
pub unsafe fn create_compute_pipeline<'a>(
&self,
code: ShaderCode<'a>,
bind_types: &[BindType],
) -> Result<Pipeline, Error> {
self.0.device.create_compute_pipeline(code, bind_types)
}
/// Create a descriptor set for a simple pipeline that just references buffers.
@ -385,8 +369,13 @@ impl Session {
}
/// Choose shader code from the available choices.
pub fn choose_shader<'a>(&self, spv: &'a [u8], hlsl: &'a str, msl: &'a str) -> ShaderCode<'a> {
self.0.device.choose_shader(spv, hlsl, msl)
pub fn choose_shader<'a>(&self, spv: &'a [u8], hlsl: &'a str, dxil: &'a [u8], msl: &'a str) -> ShaderCode<'a> {
self.0.device.choose_shader(spv, hlsl, dxil, msl)
}
/// Report the backend type that was chosen.
pub fn backend_type(&self) -> BackendType {
self.0.device.backend_type()
}
}
@ -729,38 +718,6 @@ impl Buffer {
}
}
impl PipelineBuilder {
/// Add buffers to the pipeline. Each has its own binding.
pub fn add_buffers(mut self, n_buffers: u32) -> Self {
self.0.add_buffers(n_buffers);
self
}
/// Add storage images to the pipeline. Each has its own binding.
pub fn add_images(mut self, n_images: u32) -> Self {
self.0.add_images(n_images);
self
}
/// Add a binding with a variable-size array of textures.
pub fn add_textures(mut self, max_textures: u32) -> Self {
self.0.add_textures(max_textures);
self
}
/// Create the compute pipeline.
///
/// The shader code must be given in an appropriate format for
/// the back-end. See [`Session::choose_shader`] for a helper.
pub unsafe fn create_compute_pipeline<'a>(
self,
session: &Session,
code: ShaderCode<'a>,
) -> Result<Pipeline, Error> {
self.0.create_compute_pipeline(&session.0.device, code)
}
}
impl DescriptorSetBuilder {
pub fn add_buffers<'a>(mut self, buffers: impl IntoRefs<'a, Buffer>) -> Self {
let mux_buffers = buffers
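Note: together, the extended choose_shader (which now also takes a DXIL slice) and the new backend_type accessor let callers stay backend-agnostic while still adapting where needed. A sketch of typical use (hypothetical helper):

```rust
use piet_gpu_hal::{BackendType, Session};

fn log_backend(session: &Session) {
    match session.backend_type() {
        BackendType::Vulkan => println!("backend: Vulkan (SPIR-V shaders)"),
        BackendType::Dx12 => println!("backend: DX12 (DXIL shaders)"),
        BackendType::Metal => println!("backend: Metal (MSL shaders)"),
    }
}
```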


@ -1,7 +1,8 @@
/// The cross-platform abstraction for a GPU device.
///
/// This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu.
/// In time, it may go away and be replaced by either gfx-hal or wgpu.
//! The cross-platform abstraction for a GPU device.
//!
//! This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu.
//! In time, it may go away and be replaced by either gfx-hal or wgpu.
use bitflags::bitflags;
mod backend;
@ -17,8 +18,8 @@ pub use crate::mux::{
Swapchain,
};
pub use hub::{
Buffer, CmdBuf, DescriptorSetBuilder, Image, PipelineBuilder, PlainData, RetainResource,
Session, SubmittedCmdBuf,
Buffer, CmdBuf, DescriptorSetBuilder, Image, PlainData, RetainResource, Session,
SubmittedCmdBuf,
};
// TODO: because these are conditionally included, "cargo fmt" does not
@ -36,9 +37,27 @@ mod metal;
/// The common error type for the crate.
///
/// This keeps things imple and can be expanded later.
/// This keeps things simple and can be expanded later.
pub type Error = Box<dyn std::error::Error>;
bitflags! {
/// Options when creating an instance.
#[derive(Default)]
pub struct InstanceFlags: u32 {
/// Prefer DX12 over Vulkan.
const DX12 = 0x1;
// TODO: discrete vs integrated selection
}
}
/// The GPU backend that was selected.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum BackendType {
Vulkan,
Dx12,
Metal,
}
/// An image layout state.
///
/// An image must be in a particular layout state to be used for
@ -84,10 +103,31 @@ bitflags! {
const STORAGE = 0x80;
/// The buffer can be used to store the results of queries.
const QUERY_RESOLVE = 0x200;
/// The buffer may be cleared.
const CLEAR = 0x8000;
// May add other types.
}
}
/// The type of resource that will be bound to a slot in a shader.
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum BindType {
/// A storage buffer with read/write access.
Buffer,
/// A storage buffer with read only access.
BufReadOnly,
/// A storage image.
Image,
/// A storage image with read only access.
///
/// A note on this. None of the backends are currently making a
/// distinction between Image and ImageRead as far as bindings go,
/// but the `--hlsl-nonwritable-uav-texture-as-srv` option to
/// spirv-cross (marked as unstable) would do so.
ImageRead,
// TODO: Uniform, Sampler, maybe others
}
#[derive(Clone, Debug)]
/// Information about the GPU.
pub struct GpuInfo {
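Note on BindType: the slice passed to create_compute_pipeline is positional, with entry i describing the resource declared at layout(binding = i) in the shader. A sketch of the pairing (hypothetical example):

```rust
use piet_gpu_hal::BindType;

// GLSL:
//   layout(binding = 0) buffer DataBuf { ... };            // read-write
//   layout(binding = 1) readonly buffer ParentBuf { ... }; // read-only
fn scan_bind_types() -> [BindType; 2] {
    [BindType::Buffer, BindType::BufReadOnly]
}
```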


@ -198,6 +198,7 @@ macro_rules! include_shader {
$device.choose_shader(
include_bytes!(concat!($path_base, ".spv")),
include_str!(concat!($path_base, ".hlsl")),
include_bytes!(concat!($path_base, ".dxil")),
include_str!(concat!($path_base, ".msl")),
)
};
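Note: include_shader! now also embeds the .dxil artifact, so all four files (.spv, .hlsl, .dxil, .msl) must exist next to the given path base (see the ninja rules). Typical use, as a sketch with an assumed path:

```rust
use piet_gpu_hal::{include_shader, Session, ShaderCode};

// Expects clear.spv, clear.hlsl, clear.dxil and clear.msl under shader/gen/.
fn load_clear(session: &Session) -> ShaderCode<'static> {
    include_shader!(session, "../shader/gen/clear")
}
```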


@ -82,8 +82,6 @@ pub struct CmdBuf {
pub struct QueryPool;
pub struct PipelineBuilder;
pub struct Pipeline(metal::ComputePipelineState);
#[derive(Default)]
@ -220,8 +218,6 @@ impl crate::backend::Device for MtlDevice {
type Semaphore = Semaphore;
type PipelineBuilder = PipelineBuilder;
type DescriptorSetBuilder = DescriptorSetBuilder;
type Sampler = ();
@ -273,8 +269,18 @@ impl crate::backend::Device for MtlDevice {
todo!()
}
unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder {
PipelineBuilder
unsafe fn create_compute_pipeline(
&self,
code: &Self::ShaderSource,
_bind_types: &[crate::BindType],
) -> Result<Self::Pipeline, Error> {
let options = metal::CompileOptions::new();
let library = self.device.new_library_with_source(code, &options)?;
let function = library.get_function("main0", None)?;
let pipeline = self
.device
.new_compute_pipeline_state_with_function(&function)?;
Ok(Pipeline(pipeline))
}
unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder {
@ -552,33 +558,6 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
}
}
impl crate::backend::PipelineBuilder<MtlDevice> for PipelineBuilder {
fn add_buffers(&mut self, _n_buffers: u32) {
// My understanding is that Metal infers the pipeline layout from
// the source.
}
fn add_images(&mut self, _n_images: u32) {}
fn add_textures(&mut self, _max_textures: u32) {}
unsafe fn create_compute_pipeline(
self,
device: &MtlDevice,
code: &str,
) -> Result<Pipeline, Error> {
let options = metal::CompileOptions::new();
// Probably want to set MSL version here.
let library = device.device.new_library_with_source(code, &options)?;
// This seems to be the default name from spirv-cross, but we may need to tweak.
let function = library.get_function("main0", None)?;
let pipeline = device
.device
.new_compute_pipeline_state_with_function(&function)?;
Ok(Pipeline(pipeline))
}
}
impl crate::backend::DescriptorSetBuilder<MtlDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.0.buffers.extend(buffers.iter().copied().cloned());


@ -33,8 +33,9 @@ mux_cfg! {
use crate::backend::CmdBuf as CmdBufTrait;
use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::backend::Device as DeviceTrait;
use crate::backend::PipelineBuilder as PipelineBuilderTrait;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
use crate::BackendType;
use crate::BindType;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
mux_enum! {
/// An instance, selected from multiple backends.
@ -84,7 +85,6 @@ mux_device_enum! {
/// presentation by the back-end, this may or may not be a "real"
/// semaphore.
Semaphore }
mux_device_enum! { PipelineBuilder }
mux_device_enum! {
/// A pipeline object; basically a compiled shader.
Pipeline }
@ -104,6 +104,8 @@ pub enum ShaderCode<'a> {
Spv(&'a [u8]),
/// HLSL (source)
Hlsl(&'a str),
/// DXIL (DX12 intermediate language)
Dxil(&'a [u8]),
/// Metal Shading Language (source)
Msl(&'a str),
}
@ -118,7 +120,14 @@ impl Instance {
/// work.
pub fn new(
window_handle: Option<&dyn raw_window_handle::HasRawWindowHandle>,
flags: InstanceFlags,
) -> Result<(Instance, Option<Surface>), Error> {
let mut backends = [BackendType::Vulkan, BackendType::Dx12];
if flags.contains(InstanceFlags::DX12) {
backends.swap(0, 1);
}
for backend in backends {
if backend == BackendType::Vulkan {
mux_cfg! {
#[cfg(vk)]
{
@ -128,6 +137,8 @@ impl Instance {
}
}
}
}
if backend == BackendType::Dx12 {
mux_cfg! {
#[cfg(dx12)]
{
@ -137,6 +148,8 @@ impl Instance {
}
}
}
}
}
mux_cfg! {
#[cfg(mtl)]
{
@ -293,11 +306,40 @@ impl Device {
}
}
pub unsafe fn pipeline_builder(&self) -> PipelineBuilder {
pub unsafe fn create_compute_pipeline<'a>(
&self,
code: ShaderCode<'a>,
bind_types: &[BindType],
) -> Result<Pipeline, Error> {
mux_match! { self;
Device::Vk(d) => PipelineBuilder::Vk(d.pipeline_builder()),
Device::Dx12(d) => PipelineBuilder::Dx12(d.pipeline_builder()),
Device::Mtl(d) => PipelineBuilder::Mtl(d.pipeline_builder()),
Device::Vk(d) => {
let shader_code = match code {
ShaderCode::Spv(spv) => spv,
// Panic or return "incompatible shader" error here?
_ => panic!("Vulkan backend requires shader code in SPIR-V format"),
};
d.create_compute_pipeline(shader_code, bind_types)
.map(Pipeline::Vk)
}
Device::Dx12(d) => {
let shader_code = match code {
//ShaderCode::Hlsl(hlsl) => hlsl,
ShaderCode::Dxil(dxil) => dxil,
// Panic or return "incompatible shader" error here?
_ => panic!("DX12 backend requires shader code in DXIL format"),
};
d.create_compute_pipeline(shader_code, bind_types)
.map(Pipeline::Dx12)
}
Device::Mtl(d) => {
let shader_code = match code {
ShaderCode::Msl(msl) => msl,
// Panic or return "incompatible shader" error here?
_ => panic!("Metal backend requires shader code in MSL format"),
};
d.create_compute_pipeline(shader_code, bind_types)
.map(Pipeline::Mtl)
}
}
}
@ -436,74 +478,21 @@ impl Device {
&self,
_spv: &'a [u8],
_hlsl: &'a str,
_dxil: &'a [u8],
_msl: &'a str,
) -> ShaderCode<'a> {
mux_match! { self;
Device::Vk(_d) => ShaderCode::Spv(_spv),
Device::Dx12(_d) => ShaderCode::Hlsl(_hlsl),
Device::Dx12(_d) => ShaderCode::Dxil(_dxil),
Device::Mtl(_d) => ShaderCode::Msl(_msl),
}
}
}
impl PipelineBuilder {
pub fn add_buffers(&mut self, n_buffers: u32) {
pub fn backend_type(&self) -> BackendType {
mux_match! { self;
PipelineBuilder::Vk(x) => x.add_buffers(n_buffers),
PipelineBuilder::Dx12(x) => x.add_buffers(n_buffers),
PipelineBuilder::Mtl(x) => x.add_buffers(n_buffers),
}
}
pub fn add_images(&mut self, n_buffers: u32) {
mux_match! { self;
PipelineBuilder::Vk(x) => x.add_images(n_buffers),
PipelineBuilder::Dx12(x) => x.add_images(n_buffers),
PipelineBuilder::Mtl(x) => x.add_images(n_buffers),
}
}
pub fn add_textures(&mut self, n_buffers: u32) {
mux_match! { self;
PipelineBuilder::Vk(x) => x.add_textures(n_buffers),
PipelineBuilder::Dx12(x) => x.add_textures(n_buffers),
PipelineBuilder::Mtl(x) => x.add_textures(n_buffers),
}
}
pub unsafe fn create_compute_pipeline<'a>(
self,
device: &Device,
code: ShaderCode<'a>,
) -> Result<Pipeline, Error> {
mux_match! { self;
PipelineBuilder::Vk(x) => {
let shader_code = match code {
ShaderCode::Spv(spv) => spv,
// Panic or return "incompatible shader" error here?
_ => panic!("Vulkan backend requires shader code in SPIR-V format"),
};
x.create_compute_pipeline(device.vk(), shader_code)
.map(Pipeline::Vk)
}
PipelineBuilder::Dx12(x) => {
let shader_code = match code {
ShaderCode::Hlsl(hlsl) => hlsl,
// Panic or return "incompatible shader" error here?
_ => panic!("DX12 backend requires shader code in HLSL format"),
};
x.create_compute_pipeline(device.dx12(), shader_code)
.map(Pipeline::Dx12)
}
PipelineBuilder::Mtl(x) => {
let shader_code = match code {
ShaderCode::Msl(msl) => msl,
// Panic or return "incompatible shader" error here?
_ => panic!("Metal backend requires shader code in MSL format"),
};
x.create_compute_pipeline(device.mtl(), shader_code)
.map(Pipeline::Mtl)
}
Device::Vk(_d) => BackendType::Vulkan,
Device::Dx12(_d) => BackendType::Dx12,
Device::Mtl(_d) => BackendType::Metal,
}
}
}
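Note: the backend selection in the new Instance::new reduces to a preference order computed from the flags, with Metal remaining the fallback on platforms where neither Vulkan nor DX12 is compiled in. The ordering logic in isolation (sketch):

```rust
use piet_gpu_hal::{BackendType, InstanceFlags};

// Vulkan is tried first by default; InstanceFlags::DX12 swaps the order.
fn backend_order(flags: InstanceFlags) -> [BackendType; 2] {
    let mut backends = [BackendType::Vulkan, BackendType::Dx12];
    if flags.contains(InstanceFlags::DX12) {
        backends.swap(0, 1);
    }
    backends
}
```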


@ -11,9 +11,11 @@ use ash::{vk, Device, Entry, Instance};
use smallvec::SmallVec;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, WorkgroupLimits};
use crate::backend::Device as DeviceTrait;
use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
WorkgroupLimits,
};
pub struct VkInstance {
/// Retain the dynamic lib.
@ -98,12 +100,6 @@ pub struct QueryPool {
#[derive(Clone, Copy)]
pub struct MemFlags(vk::MemoryPropertyFlags);
pub struct PipelineBuilder {
bindings: Vec<vk::DescriptorSetLayoutBinding>,
binding_flags: Vec<vk::DescriptorBindingFlags>,
max_textures: u32,
}
pub struct DescriptorSetBuilder {
buffers: Vec<vk::Buffer>,
images: Vec<vk::ImageView>,
@ -262,9 +258,9 @@ impl VkInstance {
if vk1_1 {
let mut descriptor_indexing_features =
vk::PhysicalDeviceDescriptorIndexingFeatures::builder();
features2 = features2
.push_next(&mut descriptor_indexing_features);
self.instance.get_physical_device_features2(pdevice, &mut features2);
features2 = features2.push_next(&mut descriptor_indexing_features);
self.instance
.get_physical_device_features2(pdevice, &mut features2);
set_features2 = set_features2.features(features2.features);
has_descriptor_indexing = descriptor_indexing_features
.shader_storage_image_array_non_uniform_indexing
@ -296,14 +292,13 @@ impl VkInstance {
extensions.try_add(vk::KhrMaintenance3Fn::name());
extensions.try_add(vk::ExtDescriptorIndexingFn::name());
}
let has_subgroup_size = vk1_1
&& extensions.try_add(vk::ExtSubgroupSizeControlFn::name());
let has_memory_model = vk1_1
&& extensions.try_add(vk::KhrVulkanMemoryModelFn::name());
let has_subgroup_size = vk1_1 && extensions.try_add(vk::ExtSubgroupSizeControlFn::name());
let has_memory_model = vk1_1 && extensions.try_add(vk::KhrVulkanMemoryModelFn::name());
let mut create_info = vk::DeviceCreateInfo::builder()
.queue_create_infos(&queue_create_infos)
.enabled_extension_names(extensions.as_ptrs());
let mut set_memory_model_features = vk::PhysicalDeviceVulkanMemoryModelFeatures::builder(); if vk1_1 {
let mut set_memory_model_features = vk::PhysicalDeviceVulkanMemoryModelFeatures::builder();
if vk1_1 {
create_info = create_info.push_next(&mut set_features2);
if has_memory_model {
set_memory_model_features = set_memory_model_features
@ -422,7 +417,8 @@ impl VkInstance {
0 => u32::MAX,
x => x,
};
let image_count = PREFERRED_IMAGE_COUNT.clamp(capabilities.min_image_count, max_image_count);
let image_count =
PREFERRED_IMAGE_COUNT.clamp(capabilities.min_image_count, max_image_count);
let mut extent = capabilities.current_extent;
if extent.width == u32::MAX || extent.height == u32::MAX {
// We're deciding the size.
@ -475,7 +471,6 @@ impl crate::backend::Device for VkDevice {
type QueryPool = QueryPool;
type Fence = vk::Fence;
type Semaphore = vk::Semaphore;
type PipelineBuilder = PipelineBuilder;
type DescriptorSetBuilder = DescriptorSetBuilder;
type Sampler = vk::Sampler;
type ShaderSource = [u8];
@ -649,12 +644,65 @@ impl crate::backend::Device for VkDevice {
Ok(device.get_fence_status(*fence)?)
}
unsafe fn pipeline_builder(&self) -> PipelineBuilder {
PipelineBuilder {
bindings: Vec::new(),
binding_flags: Vec::new(),
max_textures: 0,
}
unsafe fn create_compute_pipeline(
&self,
code: &[u8],
bind_types: &[BindType],
) -> Result<Pipeline, Error> {
let device = &self.device.device;
let bindings = bind_types
.iter()
.enumerate()
.map(|(i, bind_type)| {
let descriptor_type = match bind_type {
BindType::Buffer | BindType::BufReadOnly => vk::DescriptorType::STORAGE_BUFFER,
BindType::Image | BindType::ImageRead => vk::DescriptorType::STORAGE_IMAGE,
};
vk::DescriptorSetLayoutBinding::builder()
.binding(i.try_into().unwrap())
.descriptor_type(descriptor_type)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build()
})
.collect::<Vec<_>>();
let descriptor_set_layout = device.create_descriptor_set_layout(
&vk::DescriptorSetLayoutCreateInfo::builder().bindings(&bindings),
None,
)?;
let descriptor_set_layouts = [descriptor_set_layout];
// Create compute pipeline.
let code_u32 = convert_u32_vec(code);
let compute_shader_module = device
.create_shader_module(&vk::ShaderModuleCreateInfo::builder().code(&code_u32), None)?;
let entry_name = CString::new("main").unwrap();
let pipeline_layout = device.create_pipeline_layout(
&vk::PipelineLayoutCreateInfo::builder().set_layouts(&descriptor_set_layouts),
None,
)?;
let pipeline = device
.create_compute_pipelines(
vk::PipelineCache::null(),
&[vk::ComputePipelineCreateInfo::builder()
.stage(
vk::PipelineShaderStageCreateInfo::builder()
.stage(vk::ShaderStageFlags::COMPUTE)
.module(compute_shader_module)
.name(&entry_name)
.build(),
)
.layout(pipeline_layout)
.build()],
None,
)
.map_err(|(_pipeline, err)| err)?[0];
Ok(Pipeline {
pipeline,
pipeline_layout,
descriptor_set_layout,
})
}
unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
@ -715,13 +763,7 @@ impl crate::backend::Device for VkDevice {
// fence should make the query available, but otherwise we get sporadic NOT_READY
// results (Windows 10, AMD 5700 XT).
let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
device.get_query_pool_results(
pool.pool,
0,
pool.n_queries,
&mut buf,
flags,
)?;
device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
let ts0 = buf[0];
let tsp = self.timestamp_period as f64 * 1e-9;
let result = buf[1..]
@ -1080,109 +1122,6 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
}
}
impl crate::backend::PipelineBuilder<VkDevice> for PipelineBuilder {
fn add_buffers(&mut self, n_buffers: u32) {
let start = self.bindings.len() as u32;
for i in 0..n_buffers {
self.bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(start + i)
.descriptor_type(vk::DescriptorType::STORAGE_BUFFER)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
self.binding_flags
.push(vk::DescriptorBindingFlags::default());
}
}
fn add_images(&mut self, n_images: u32) {
let start = self.bindings.len() as u32;
for i in 0..n_images {
self.bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(start + i)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
self.binding_flags
.push(vk::DescriptorBindingFlags::default());
}
}
fn add_textures(&mut self, n_images: u32) {
let start = self.bindings.len() as u32;
for i in 0..n_images {
self.bindings.push(
vk::DescriptorSetLayoutBinding::builder()
.binding(start + i)
.descriptor_type(vk::DescriptorType::STORAGE_IMAGE)
.descriptor_count(1)
.stage_flags(vk::ShaderStageFlags::COMPUTE)
.build(),
);
self.binding_flags
.push(vk::DescriptorBindingFlags::default());
}
self.max_textures += n_images;
}
unsafe fn create_compute_pipeline(
self,
device: &VkDevice,
code: &[u8],
) -> Result<Pipeline, Error> {
let device = &device.device.device;
let descriptor_set_layout = device.create_descriptor_set_layout(
&vk::DescriptorSetLayoutCreateInfo::builder()
.bindings(&self.bindings)
// It might be a slight optimization not to push this if max_textures = 0
.push_next(
&mut vk::DescriptorSetLayoutBindingFlagsCreateInfo::builder()
.binding_flags(&self.binding_flags)
.build(),
),
None,
)?;
let descriptor_set_layouts = [descriptor_set_layout];
// Create compute pipeline.
let code_u32 = convert_u32_vec(code);
let compute_shader_module = device
.create_shader_module(&vk::ShaderModuleCreateInfo::builder().code(&code_u32), None)?;
let entry_name = CString::new("main").unwrap();
let pipeline_layout = device.create_pipeline_layout(
&vk::PipelineLayoutCreateInfo::builder().set_layouts(&descriptor_set_layouts),
None,
)?;
let pipeline = device
.create_compute_pipelines(
vk::PipelineCache::null(),
&[vk::ComputePipelineCreateInfo::builder()
.stage(
vk::PipelineShaderStageCreateInfo::builder()
.stage(vk::ShaderStageFlags::COMPUTE)
.module(compute_shader_module)
.name(&entry_name)
.build(),
)
.layout(pipeline_layout)
.build()],
None,
)
.map_err(|(_pipeline, err)| err)?[0];
Ok(Pipeline {
pipeline,
pipeline_layout,
descriptor_set_layout,
})
}
}
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.buffers.extend(buffers.iter().map(|b| b.buffer));
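Note: convert_u32_vec, used above when creating the shader module, is not shown in this diff. A plausible implementation converting little-endian SPIR-V bytes to words (an assumption, not the verbatim crate code):

```rust
fn convert_u32_vec(src: &[u8]) -> Vec<u32> {
    src.chunks(4)
        .map(|chunk| {
            // Zero-pad a trailing partial chunk, if any.
            let mut buf = [0u8; 4];
            buf[..chunk.len()].copy_from_slice(chunk);
            u32::from_le_bytes(buf)
        })
        .collect()
}
```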


@ -30,7 +30,7 @@ piet = "0.2.0"
png = "0.16.2"
rand = "0.7.3"
roxmltree = "0.13"
winit = "0.23"
winit = "0.25"
clap = "2.33"
swash = "0.1.4"


@ -56,7 +56,7 @@ fn my_main() -> Result<(), Error> {
let width = window.width() as usize;
let height = window.height() as usize;
let handle = get_handle(window);
let (instance, surface) = Instance::new(Some(&handle))?;
let (instance, surface) = Instance::new(Some(&handle), Default::default())?;
gfx_state =
Some(GfxState::new(&instance, surface.as_ref(), width, height)?);
} else {


@ -226,7 +226,7 @@ fn main() -> Result<(), Error> {
.takes_value(true),
)
.get_matches();
let (instance, _) = Instance::new(None)?;
let (instance, _) = Instance::new(None, Default::default())?;
unsafe {
let device = instance.device(None)?;
let session = Session::new(device);


@ -38,7 +38,7 @@ fn main() -> Result<(), Error> {
.with_resizable(false) // currently not supported
.build(&event_loop)?;
let (instance, surface) = Instance::new(Some(&window))?;
let (instance, surface) = Instance::new(Some(&window), Default::default())?;
let mut info_string = "info".to_string();
unsafe {
let device = instance.device(surface.as_ref())?;


@ -14,8 +14,8 @@ use piet::{ImageFormat, RenderContext};
use piet_gpu_types::encoder::Encode;
use piet_gpu_hal::{
Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline, QueryPool,
Session, ShaderCode,
BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline,
QueryPool, Session, ShaderCode,
};
use pico_svg::PicoSvg;
@ -140,7 +140,15 @@ impl Renderer {
let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?;
let el_code = ShaderCode::Spv(include_bytes!("../shader/elements.spv"));
let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?;
let el_pipeline = session.create_compute_pipeline(
el_code,
&[
BindType::Buffer,
BindType::Buffer,
BindType::Buffer,
BindType::Buffer,
],
)?;
let mut el_ds = Vec::with_capacity(n_bufs);
for scene_buf in &scene_bufs {
el_ds.push(session.create_simple_descriptor_set(
@ -150,12 +158,14 @@ impl Renderer {
}
let tile_alloc_code = ShaderCode::Spv(include_bytes!("../shader/tile_alloc.spv"));
let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?;
let tile_pipeline = session
.create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::Buffer])?;
let tile_ds = session
.create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?;
let path_alloc_code = ShaderCode::Spv(include_bytes!("../shader/path_coarse.spv"));
let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?;
let path_pipeline = session
.create_compute_pipeline(path_alloc_code, &[BindType::Buffer, BindType::Buffer])?;
let path_ds = session
.create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
@ -165,18 +175,21 @@ impl Renderer {
println!("using small workgroup backdrop kernel");
ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
};
let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_code, 2)?;
let backdrop_pipeline = session
.create_compute_pipeline(backdrop_code, &[BindType::Buffer, BindType::Buffer])?;
let backdrop_ds = session
.create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
// TODO: constants
let bin_code = ShaderCode::Spv(include_bytes!("../shader/binning.spv"));
let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?;
let bin_pipeline =
session.create_compute_pipeline(bin_code, &[BindType::Buffer, BindType::Buffer])?;
let bin_ds =
session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?;
let coarse_code = ShaderCode::Spv(include_bytes!("../shader/coarse.spv"));
let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?;
let coarse_pipeline =
session.create_compute_pipeline(coarse_code, &[BindType::Buffer, BindType::Buffer])?;
let coarse_ds = session
.create_simple_descriptor_set(&coarse_pipeline, &[&memory_buf_dev, &config_buf])?;
@ -194,17 +207,16 @@ impl Renderer {
let gradients = Self::make_gradient_image(&session);
let k4_code = ShaderCode::Spv(include_bytes!("../shader/kernel4.spv"));
// This is a bit of a stand-in for future development. For now, we assume one
// atlas image for all images, and another image for the gradients. In the future,
// on GPUs that support it, we will probably want to go to descriptor indexing in
// order to cut down on allocation and copying for the atlas image.
let max_textures = 2;
let k4_pipeline = session
.pipeline_builder()
.add_buffers(2)
.add_images(1)
.add_textures(max_textures)
.create_compute_pipeline(&session, k4_code)?;
let k4_pipeline = session.create_compute_pipeline(
k4_code,
&[
BindType::Buffer,
BindType::Buffer,
BindType::Image,
BindType::ImageRead,
BindType::ImageRead,
],
)?;
let k4_ds = session
.descriptor_set_builder()
.add_buffers(&[&memory_buf_dev, &config_buf])


@ -4,6 +4,11 @@
glslang_validator = glslangValidator
spirv_cross = spirv-cross
dxc = dxc
# See https://github.com/KhronosGroup/SPIRV-Cross/issues/1248 for
# why we set this.
msl_flags = --msl-decoration-binding
rule glsl
command = $glslang_validator $flags -V -o $out $in
@ -11,22 +16,34 @@ rule glsl
rule hlsl
command = $spirv_cross --hlsl $in --output $out
rule dxil
command = $dxc -T cs_6_0 $in -Fo $out
rule msl
command = $spirv_cross --msl $in --output $out
command = $spirv_cross --msl $in --output $out $msl_flags
build gen/clear.spv: glsl clear.comp
build gen/clear.hlsl: hlsl gen/clear.spv
build gen/clear.dxil: dxil gen/clear.hlsl
build gen/clear.msl: msl gen/clear.spv
build gen/prefix.spv: glsl prefix.comp
build gen/prefix.hlsl: hlsl gen/prefix.spv
build gen/prefix.dxil: dxil gen/prefix.hlsl
build gen/prefix.msl: msl gen/prefix.spv
build gen/prefix_reduce.spv: glsl prefix_reduce.comp
build gen/prefix_reduce.hlsl: hlsl gen/prefix_reduce.spv
build gen/prefix_reduce.dxil: dxil gen/prefix_reduce.hlsl
build gen/prefix_reduce.msl: msl gen/prefix_reduce.spv
build gen/prefix_root.spv: glsl prefix_scan.comp
flags = -DROOT
build gen/prefix_root.hlsl: hlsl gen/prefix_root.spv
build gen/prefix_root.dxil: dxil gen/prefix_root.hlsl
build gen/prefix_root.msl: msl gen/prefix_root.spv
build gen/prefix_scan.spv: glsl prefix_scan.comp
build gen/prefix_scan.hlsl: hlsl gen/prefix_scan.spv
build gen/prefix_scan.dxil: dxil gen/prefix_scan.hlsl
build gen/prefix_scan.msl: msl gen/prefix_scan.spv

tests/shader/clear.comp Normal file

@ -0,0 +1,26 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Clear a buffer.
#version 450
layout(local_size_x = 256) in;
// This should probably be uniform rather than readonly,
// but we haven't done the binding work yet.
layout(binding = 0) readonly buffer ConfigBuf {
// size is in uint (4 byte) units
uint size;
uint value;
};
layout(binding = 1) buffer TargetBuf {
uint[] data;
};
void main() {
uint ix = gl_GlobalInvocationID.x;
if (ix < size) {
data[ix] = value;
}
}
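With local_size_x = 256, each invocation clears one u32, so a dispatch needs ceil(size / 256) workgroups. A sketch of the arithmetic (hypothetical helper, mirroring the WG_SIZE constant in clear.rs below):

```rust
const WG_SIZE: u64 = 256;

// Number of workgroups needed to cover n_elements words.
fn n_workgroups(n_elements: u64) -> u32 {
    ((n_elements + WG_SIZE - 1) / WG_SIZE) as u32
}
```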

tests/shader/gen/clear.dxil Normal file

Binary file not shown.

@ -0,0 +1,26 @@
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
ByteAddressBuffer _19 : register(t0);
RWByteAddressBuffer _32 : register(u1);
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
void comp_main()
{
uint ix = gl_GlobalInvocationID.x;
if (ix < _19.Load(0))
{
_32.Store(ix * 4 + 0, _19.Load(4));
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}


@ -0,0 +1,27 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct ConfigBuf
{
uint size;
uint value;
};
struct TargetBuf
{
uint data[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
kernel void main0(const device ConfigBuf& _19 [[buffer(0)]], device TargetBuf& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
uint ix = gl_GlobalInvocationID.x;
if (ix < _19.size)
{
_32.data[ix] = _19.value;
}
}

tests/shader/gen/clear.spv Normal file

Binary file not shown.

Binary file not shown.

@ -12,11 +12,11 @@ struct State
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Monoid _187 = { 0u };
static const Monoid _185 = { 0u };
globallycoherent RWByteAddressBuffer _43 : register(u2);
ByteAddressBuffer _67 : register(t0);
RWByteAddressBuffer _374 : register(u1);
RWByteAddressBuffer _372 : register(u1);
static uint3 gl_LocalInvocationID;
struct SPIRV_Cross_Input
@ -64,9 +64,9 @@ void comp_main()
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.x >= uint(1 << int(i_1)))
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);
@ -92,7 +92,7 @@ void comp_main()
}
_43.Store(part_ix * 12 + 4, flag);
}
Monoid exclusive = _187;
Monoid exclusive = _185;
if (part_ix != 0u)
{
uint look_back_ix = part_ix - 1u;
@ -113,9 +113,9 @@ void comp_main()
{
if (gl_LocalInvocationID.x == 511u)
{
Monoid _225;
_225.element = _43.Load(look_back_ix * 12 + 12);
their_prefix.element = _225.element;
Monoid _223;
_223.element = _43.Load(look_back_ix * 12 + 12);
their_prefix.element = _223.element;
Monoid param_4 = their_prefix;
Monoid param_5 = exclusive;
exclusive = combine_monoid(param_4, param_5);
@ -128,9 +128,9 @@ void comp_main()
{
if (gl_LocalInvocationID.x == 511u)
{
Monoid _247;
_247.element = _43.Load(look_back_ix * 12 + 8);
their_agg.element = _247.element;
Monoid _245;
_245.element = _43.Load(look_back_ix * 12 + 8);
their_agg.element = _245.element;
Monoid param_6 = their_agg;
Monoid param_7 = exclusive;
exclusive = combine_monoid(param_6, param_7);
@ -142,9 +142,9 @@ void comp_main()
}
if (gl_LocalInvocationID.x == 511u)
{
Monoid _269;
_269.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
m.element = _269.element;
Monoid _267;
_267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
m.element = _267.element;
if (their_ix == 0u)
{
their_agg = m;
@ -211,7 +211,7 @@ void comp_main()
Monoid param_16 = row;
Monoid param_17 = local[i_2];
Monoid m_1 = combine_monoid(param_16, param_17);
_374.Store((ix + i_2) * 4 + 0, m_1.element);
_372.Store((ix + i_2) * 4 + 0, m_1.element);
}
}


@ -87,7 +87,7 @@ Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b)
return Monoid{ a.element + b.element };
}
kernel void main0(volatile device StateBuf& _43 [[buffer(0)]], const device InBuf& _67 [[buffer(1)]], device OutBuf& _374 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint sh_part_ix;
threadgroup Monoid sh_scratch[512];
@ -115,9 +115,9 @@ kernel void main0(volatile device StateBuf& _43 [[buffer(0)]], const device InBu
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.x >= uint(1 << int(i_1)))
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);
@ -256,7 +256,7 @@ kernel void main0(volatile device StateBuf& _43 [[buffer(0)]], const device InBu
Monoid param_16 = row;
Monoid param_17 = local[i_2];
Monoid m_1 = combine_monoid(param_16, param_17);
_374.outbuf[ix + i_2].element = m_1.element;
_372.outbuf[ix + i_2].element = m_1.element;
}
}

Binary file not shown.

Binary file not shown.


@ -6,7 +6,7 @@ struct Monoid
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
ByteAddressBuffer _40 : register(t0);
RWByteAddressBuffer _129 : register(u1);
RWByteAddressBuffer _127 : register(u1);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
@ -46,9 +46,9 @@ void comp_main()
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if ((gl_LocalInvocationID.x + uint(1 << int(i_1))) < 512u)
if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
{
Monoid other = sh_scratch[gl_LocalInvocationID.x + uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
Monoid param_2 = agg;
Monoid param_3 = other;
agg = combine_monoid(param_2, param_3);
@ -58,7 +58,7 @@ void comp_main()
}
if (gl_LocalInvocationID.x == 0u)
{
_129.Store(gl_WorkGroupID.x * 4 + 0, agg.element);
_127.Store(gl_WorkGroupID.x * 4 + 0, agg.element);
}
}


@ -33,7 +33,7 @@ Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b)
return Monoid{ a.element + b.element };
}
kernel void main0(const device InBuf& _40 [[buffer(0)]], device OutBuf& _129 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
kernel void main0(const device InBuf& _40 [[buffer(0)]], device OutBuf& _127 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
{
threadgroup Monoid sh_scratch[512];
uint ix = gl_GlobalInvocationID.x * 8u;
@ -50,9 +50,9 @@ kernel void main0(const device InBuf& _40 [[buffer(0)]], device OutBuf& _129 [[b
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if ((gl_LocalInvocationID.x + uint(1 << int(i_1))) < 512u)
if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
{
Monoid other = sh_scratch[gl_LocalInvocationID.x + uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
Monoid param_2 = agg;
Monoid param_3 = other;
agg = combine_monoid(param_2, param_3);
@ -62,7 +62,7 @@ kernel void main0(const device InBuf& _40 [[buffer(0)]], device OutBuf& _129 [[b
}
if (gl_LocalInvocationID.x == 0u)
{
_129.outbuf[gl_WorkGroupID.x].element = agg.element;
_127.outbuf[gl_WorkGroupID.x].element = agg.element;
}
}

Binary file not shown.

Binary file not shown.


@ -5,7 +5,7 @@ struct Monoid
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Monoid _133 = { 0u };
static const Monoid _131 = { 0u };
RWByteAddressBuffer _42 : register(u0);
@ -46,9 +46,9 @@ void comp_main()
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.x >= uint(1 << int(i_1)))
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);
@ -57,7 +57,7 @@ void comp_main()
sh_scratch[gl_LocalInvocationID.x] = agg;
}
GroupMemoryBarrierWithGroupSync();
Monoid row = _133;
Monoid row = _131;
if (gl_LocalInvocationID.x > 0u)
{
row = sh_scratch[gl_LocalInvocationID.x - 1u];


@ -85,9 +85,9 @@ kernel void main0(device DataBuf& _42 [[buffer(0)]], uint3 gl_GlobalInvocationID
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.x >= uint(1 << int(i_1)))
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);

Binary file not shown.

Binary file not shown.


@ -5,10 +5,10 @@ struct Monoid
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Monoid _133 = { 0u };
static const Monoid _131 = { 0u };
RWByteAddressBuffer _42 : register(u0);
RWByteAddressBuffer _143 : register(u1);
ByteAddressBuffer _141 : register(t1);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
@ -49,9 +49,9 @@ void comp_main()
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.x >= uint(1 << int(i_1)))
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);
@ -60,12 +60,12 @@ void comp_main()
sh_scratch[gl_LocalInvocationID.x] = agg;
}
GroupMemoryBarrierWithGroupSync();
Monoid row = _133;
Monoid row = _131;
if (gl_WorkGroupID.x > 0u)
{
Monoid _148;
_148.element = _143.Load((gl_WorkGroupID.x - 1u) * 4 + 0);
row.element = _148.element;
Monoid _146;
_146.element = _141.Load((gl_WorkGroupID.x - 1u) * 4 + 0);
row.element = _146.element;
}
if (gl_LocalInvocationID.x > 0u)
{


@ -72,7 +72,7 @@ Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b)
return Monoid{ a.element + b.element };
}
kernel void main0(device DataBuf& _42 [[buffer(0)]], device ParentBuf& _143 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
kernel void main0(device DataBuf& _42 [[buffer(0)]], const device ParentBuf& _141 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
{
threadgroup Monoid sh_scratch[512];
uint ix = gl_GlobalInvocationID.x * 8u;
@ -90,9 +90,9 @@ kernel void main0(device DataBuf& _42 [[buffer(0)]], device ParentBuf& _143 [[bu
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.x >= uint(1 << int(i_1)))
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - uint(1 << int(i_1))];
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);
@ -104,7 +104,7 @@ kernel void main0(device DataBuf& _42 [[buffer(0)]], device ParentBuf& _143 [[bu
Monoid row = Monoid{ 0u };
if (gl_WorkGroupID.x > 0u)
{
row.element = _143.parent[gl_WorkGroupID.x - 1u].element;
row.element = _141.parent[gl_WorkGroupID.x - 1u].element;
}
if (gl_LocalInvocationID.x > 0u)
{

Binary file not shown.


@ -71,8 +71,8 @@ void main() {
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1 << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1 << i)];
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_monoid(other, agg);
}
barrier();


@ -40,8 +40,8 @@ void main() {
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
// We could make this predicate tighter, but would it help?
if (gl_LocalInvocationID.x + (1 << i) < WG_SIZE) {
Monoid other = sh_scratch[gl_LocalInvocationID.x + (1 << i)];
if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
agg = combine_monoid(agg, other);
}
barrier();
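The same unsigned-stride fix applies to this reduction kernel, where each round folds the partner at +stride downward so that after LG_WG_SIZE rounds thread 0 holds the workgroup total. A CPU model (illustrative only):

// Strides of 1, 2, 4, ... fold partial sums toward index 0.
fn workgroup_reduce(data: &mut Vec<u32>) -> u32 {
    let n = data.len();
    assert!(n.is_power_of_two());
    for i in 0..n.trailing_zeros() {
        let stride = 1usize << i;
        let prev = data.clone();
        for x in 0..n {
            if x + stride < n {
                data[x] = prev[x].wrapping_add(prev[x + stride]);
            }
        }
    }
    data[0]
}

fn main() {
    let mut v: Vec<u32> = (1..=8).collect();
    assert_eq!(workgroup_reduce(&mut v), 36);
}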

View file

@ -20,7 +20,7 @@ layout(set = 0, binding = 0) buffer DataBuf {
};
#ifndef ROOT
layout(set = 0, binding = 1) buffer ParentBuf {
layout(set = 0, binding = 1) readonly buffer ParentBuf {
Monoid[] parent;
};
#endif
@ -45,8 +45,8 @@ void main() {
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1 << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1 << i)];
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_monoid(other, agg);
}
barrier();
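The readonly qualifier added to ParentBuf has to agree with the BindType list passed when the pipeline is created; the scan pipeline in prefix_tree.rs later in this diff lists binding 0 (DataBuf) as read-write and binding 1 (ParentBuf) as read-only. A sketch of that call, assuming the session and shader path conventions of the test crate and that Error and Pipeline are exported from the hal crate root:

use piet_gpu_hal::{include_shader, BindType, Error, Pipeline, Session};

// Binding order mirrors the shader's set 0 declarations above:
// binding 0 = DataBuf (read-write), binding 1 = ParentBuf (readonly).
unsafe fn scan_pipeline(session: &Session) -> Result<Pipeline, Error> {
    let code = include_shader!(session, "../shader/gen/prefix_scan");
    session.create_compute_pipeline(code, &[BindType::Buffer, BindType::BufReadOnly])
}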

141
tests/src/clear.rs Normal file
View file

@ -0,0 +1,141 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::test_result::TestResult;
const WG_SIZE: u64 = 256;
/// The shader code for clearing buffers.
pub struct ClearCode {
pipeline: Pipeline,
}
/// The stage resources for clearing buffers.
pub struct ClearStage {
n_elements: u64,
config_buf: Buffer,
}
/// The binding for clearing buffers.
pub struct ClearBinding {
descriptor_set: DescriptorSet,
}
pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut result = TestResult::new("clear buffers");
let n_elements: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 24);
let out_buf = runner.buf_down(n_elements * 4);
let code = ClearCode::new(runner);
let stage = ClearStage::new_with_value(runner, n_elements, 0x42);
let binding = stage.bind(runner, &code, &out_buf.dev_buf);
let n_iter = config.n_iter;
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
}
total_elapsed += runner.submit(commands);
if i == 0 {
let mut dst: Vec<u32> = Default::default();
out_buf.read(&mut dst);
if let Some(failure) = verify(&dst) {
result.fail(format!("failure at {}", failure));
}
}
}
result.timing(total_elapsed, n_elements * n_iter);
result
}
impl ClearCode {
pub unsafe fn new(runner: &mut Runner) -> ClearCode {
let code = include_shader!(&runner.session, "../shader/gen/Clear");
let pipeline = runner
.session
.create_compute_pipeline(code, &[BindType::BufReadOnly, BindType::Buffer])
.unwrap();
ClearCode { pipeline }
}
}
impl ClearStage {
pub unsafe fn new(runner: &mut Runner, n_elements: u64) -> ClearStage {
Self::new_with_value(runner, n_elements, 0)
}
pub unsafe fn new_with_value(runner: &mut Runner, n_elements: u64, value: u32) -> ClearStage {
let config = [n_elements as u32, value];
let config_buf = runner
.session
.create_buffer_init(&config, BufferUsage::STORAGE)
.unwrap();
ClearStage {
n_elements,
config_buf,
}
}
pub unsafe fn bind(
&self,
runner: &mut Runner,
code: &ClearCode,
out_buf: &Buffer,
) -> ClearBinding {
let descriptor_set = runner
.session
.create_simple_descriptor_set(&code.pipeline, &[&self.config_buf, out_buf])
.unwrap();
ClearBinding { descriptor_set }
}
pub unsafe fn record(
&self,
commands: &mut Commands,
code: &ClearCode,
bindings: &ClearBinding,
) {
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
// An issue: for clearing large buffers (>16M), we need to check the
// number of workgroups against the (dynamically detected) limit, and
// potentially issue multiple dispatches.
commands.cmd_buf.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
// One thing that's missing here is registering the buffers so
// they can be safely dropped by Rust code before the execution
// of the command buffer completes.
}
}
// Verify that the buffer was cleared to the test value (0x42).
fn verify(data: &[u32]) -> Option<usize> {
data.iter().position(|val| *val != 0x42)
}
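The comment in record flags an open issue: for very large buffers, n_workgroups can exceed the per-dispatch limit. A hypothetical chunking scheme (not part of this PR; the limit value and the per-chunk offset plumbing are assumptions) could split the clear like this:

// Hypothetical: cover n_workgroups with dispatches of at most
// max_per_dispatch workgroups each; each chunk would also need a base
// offset, e.g. via the config buffer.
fn clear_chunks(n_workgroups: u64, max_per_dispatch: u64) -> Vec<(u64, u64)> {
    let mut chunks = Vec::new(); // (first_workgroup, count) pairs
    let mut start = 0;
    while start < n_workgroups {
        let count = (n_workgroups - start).min(max_per_dispatch);
        chunks.push((start, count));
        start += count;
    }
    chunks
}

fn main() {
    assert_eq!(clear_chunks(5, 2), vec![(0, 2), (2, 2), (4, 1)]);
}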

View file

@ -21,6 +21,7 @@ use clap::ArgMatches;
pub struct Config {
pub groups: Groups,
pub size: Size,
pub n_iter: u64,
}
pub struct Groups(String);
@ -35,8 +36,14 @@ impl Config {
pub fn from_matches(matches: &ArgMatches) -> Config {
let groups = Groups::from_str(matches.value_of("groups").unwrap_or("all"));
let size = Size::from_str(matches.value_of("size").unwrap_or("m"));
let n_iter = matches
.value_of("n_iter")
.and_then(|s| s.parse().ok())
.unwrap_or(1000);
Config {
groups, size
groups,
size,
n_iter,
}
}
}
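The n_iter flag falls back to 1000 whenever the value is absent or fails to parse; a standalone illustration of the and_then/unwrap_or idiom (the function name is assumed):

fn parse_n_iter(arg: Option<&str>) -> u64 {
    arg.and_then(|s| s.parse().ok()).unwrap_or(1000)
}

fn main() {
    assert_eq!(parse_n_iter(None), 1000); // flag absent
    assert_eq!(parse_n_iter(Some("250")), 250); // valid value
    assert_eq!(parse_n_iter(Some("abc")), 1000); // malformed input falls back silently
}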

View file

@ -16,6 +16,7 @@
//! Tests for piet-gpu shaders and GPU capabilities.
mod clear;
mod config;
mod prefix;
mod prefix_tree;
@ -23,6 +24,7 @@ mod runner;
mod test_result;
use clap::{App, Arg};
use piet_gpu_hal::InstanceFlags;
use crate::config::Config;
use crate::runner::Runner;
@ -41,21 +43,26 @@ fn main() {
.short("g")
.long("groups")
.help("Groups to run")
.takes_value(true)
.takes_value(true),
)
.arg(
Arg::with_name("size")
.short("s")
.long("size")
.help("Size of tests")
.takes_value(true)
.takes_value(true),
)
.arg(
Arg::with_name("n_iter")
.short("n")
.long("n_iter")
.help("Number of iterations")
.takes_value(true)
.takes_value(true),
)
.arg(
Arg::with_name("dx12")
.long("dx12")
.help("Prefer DX12 backend"),
)
.get_matches();
let style = if matches.is_present("verbose") {
@ -68,7 +75,16 @@ fn main() {
let report = |test_result: &TestResult| {
test_result.report(style);
};
let mut runner = Runner::new();
let mut flags = InstanceFlags::empty();
if matches.is_present("dx12") {
flags |= InstanceFlags::DX12;
}
let mut runner = Runner::new(flags);
if style == ReportStyle::Verbose {
// TODO: get adapter name in here too
println!("Backend: {:?}", runner.backend_type());
}
report(&clear::run_clear_test(&mut runner, &config));
if config.groups.matches("prefix") {
report(&prefix::run_prefix_test(&mut runner, &config));
report(&prefix_tree::run_prefix_test(&mut runner, &config));

View file

@ -14,9 +14,10 @@
//
// Also licensed under MIT license, at your choice.
use piet_gpu_hal::{include_shader, BufferUsage, DescriptorSet};
use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::clear::{ClearBinding, ClearCode, ClearStage};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::test_result::TestResult;
@ -30,6 +31,7 @@ const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
/// A code struct can be created once and reused any number of times.
struct PrefixCode {
pipeline: Pipeline,
clear_code: Option<ClearCode>,
}
/// The stage resources for the prefix sum example.
@ -41,6 +43,7 @@ struct PrefixStage {
// treat it as a capacity.
n_elements: u64,
state_buf: Buffer,
clear_stage: Option<(ClearStage, ClearBinding)>,
}
/// The binding for the prefix sum example.
@ -50,7 +53,13 @@ struct PrefixBinding {
pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut result = TestResult::new("prefix sum, decoupled look-back");
// This will be configurable.
/*
// We're good if we're using DXC.
if runner.backend_type() == BackendType::Dx12 {
result.skip("Shader won't compile on FXC");
return result;
}
*/
let n_elements: u64 = config.size.choose(1 << 12, 1 << 24, 1 << 25);
let data: Vec<u32> = (0..n_elements as u32).collect();
let data_buf = runner
@ -59,10 +68,9 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
.unwrap();
let out_buf = runner.buf_down(data_buf.size());
let code = PrefixCode::new(runner);
let stage = PrefixStage::new(runner, n_elements);
let stage = PrefixStage::new(runner, &code, n_elements);
let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
// Also will be configurable of course.
let n_iter = 1000;
let n_iter = config.n_iter;
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
@ -91,23 +99,44 @@ impl PrefixCode {
let code = include_shader!(&runner.session, "../shader/gen/prefix");
let pipeline = runner
.session
.create_simple_compute_pipeline(code, 3)
.create_compute_pipeline(
code,
&[BindType::BufReadOnly, BindType::Buffer, BindType::Buffer],
)
.unwrap();
PrefixCode { pipeline }
// Currently, DX12 and Metal backends don't support buffer clearing, so use a
// compute shader as a workaround.
let clear_code = if runner.backend_type() != BackendType::Vulkan {
Some(ClearCode::new(runner))
} else {
None
};
PrefixCode {
pipeline,
clear_code,
}
}
}
impl PrefixStage {
unsafe fn new(runner: &mut Runner, n_elements: u64) -> PrefixStage {
unsafe fn new(runner: &mut Runner, code: &PrefixCode, n_elements: u64) -> PrefixStage {
let n_workgroups = (n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
let state_buf_size = 4 + 12 * n_workgroups;
let state_buf = runner
.session
.create_buffer(state_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST)
.unwrap();
let clear_stage = if let Some(clear_code) = &code.clear_code {
let stage = ClearStage::new(runner, state_buf_size / 4);
let binding = stage.bind(runner, clear_code, &state_buf);
Some((stage, binding))
} else {
None
};
PrefixStage {
n_elements,
state_buf,
clear_stage,
}
}
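The sizing 4 + 12 * n_workgroups suggests a 4-byte global counter followed by a 12-byte record per partition, the usual state for a decoupled look-back scan; the prefix shader itself is not shown in this diff, so the field names below are assumptions:

// Assumed layout (illustrative): per-partition status plus aggregate and
// inclusive prefix, behind a 4-byte partition counter at the buffer head.
#[repr(C)]
struct PartitionState {
    flag: u32,      // e.g. NOT_READY / AGGREGATE_READY / PREFIX_READY
    aggregate: u32, // sum of this partition's elements
    prefix: u32,    // sum of all earlier partitions
}

fn state_buf_size(n_workgroups: u64) -> u64 {
    4 + std::mem::size_of::<PartitionState>() as u64 * n_workgroups
}

fn main() {
    assert_eq!(state_buf_size(8), 4 + 12 * 8); // matches the sizing above
}
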
@ -127,7 +156,11 @@ impl PrefixStage {
unsafe fn record(&self, commands: &mut Commands, code: &PrefixCode, bindings: &PrefixBinding) {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
if let Some((stage, binding)) = &self.clear_stage {
stage.record(commands, code.clear_code.as_ref().unwrap(), binding);
} else {
commands.cmd_buf.clear_buffer(&self.state_buf, None);
}
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
&code.pipeline,

View file

@ -14,7 +14,7 @@
//
// Also licensed under MIT license, at your choice.
use piet_gpu_hal::{include_shader, BufferUsage, DescriptorSet};
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
@ -57,7 +57,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
let stage = PrefixTreeStage::new(runner, n_elements);
let binding = stage.bind(runner, &code, &out_buf.dev_buf);
// Also will be configurable of course.
let n_iter = 1000;
let n_iter = config.n_iter;
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
@ -88,17 +88,17 @@ impl PrefixTreeCode {
let reduce_code = include_shader!(&runner.session, "../shader/gen/prefix_reduce");
let reduce_pipeline = runner
.session
.create_simple_compute_pipeline(reduce_code, 2)
.create_compute_pipeline(reduce_code, &[BindType::BufReadOnly, BindType::Buffer])
.unwrap();
let scan_code = include_shader!(&runner.session, "../shader/gen/prefix_scan");
let scan_pipeline = runner
.session
.create_simple_compute_pipeline(scan_code, 2)
.create_compute_pipeline(scan_code, &[BindType::Buffer, BindType::BufReadOnly])
.unwrap();
let root_code = include_shader!(&runner.session, "../shader/gen/prefix_root");
let root_pipeline = runner
.session
.create_simple_compute_pipeline(root_code, 1)
.create_compute_pipeline(root_code, &[BindType::Buffer])
.unwrap();
PrefixTreeCode {
reduce_pipeline,
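The three pipelines implement a tree-structured prefix sum: reduce produces one partial per workgroup, root scans the partials, and scan rescans each block while adding the parent prefix (the readonly ParentBuf in the GLSL earlier). A CPU model (illustrative only; the GPU version uses fixed levels rather than recursion):

fn tree_prefix_sum(data: &mut [u32], wg: usize) {
    if data.len() <= wg {
        inclusive_scan(data); // "root": scan a single block in place
        return;
    }
    // "reduce": one partial sum per block of wg elements
    let mut parents: Vec<u32> = data.chunks(wg).map(|c| c.iter().sum()).collect();
    tree_prefix_sum(&mut parents, wg);
    // "scan": rescan each block, offsetting by the previous parent,
    // like `parent[gl_WorkGroupID.x - 1u]` in the shader.
    for (i, chunk) in data.chunks_mut(wg).enumerate() {
        inclusive_scan(chunk);
        if i > 0 {
            for v in chunk.iter_mut() {
                *v = v.wrapping_add(parents[i - 1]);
            }
        }
    }
}

fn inclusive_scan(data: &mut [u32]) {
    let mut acc = 0u32;
    for v in data.iter_mut() {
        acc = acc.wrapping_add(*v);
        *v = acc;
    }
}

fn main() {
    let mut v: Vec<u32> = (1..=6).collect();
    tree_prefix_sum(&mut v, 2);
    assert_eq!(v, vec![1, 3, 6, 10, 15, 21]);
}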

View file

@ -16,7 +16,10 @@
//! Test runner intended to make it easy to write tests.
use piet_gpu_hal::{Buffer, BufferUsage, CmdBuf, Instance, PlainData, QueryPool, Session};
use piet_gpu_hal::{
BackendType, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, PlainData, QueryPool,
Session,
};
pub struct Runner {
#[allow(unused)]
@ -45,8 +48,8 @@ pub struct BufDown {
}
impl Runner {
pub unsafe fn new() -> Runner {
let (instance, _) = Instance::new(None).unwrap();
pub unsafe fn new(flags: InstanceFlags) -> Runner {
let (instance, _) = Instance::new(None, flags).unwrap();
let device = instance.device(None).unwrap();
let session = Session::new(device);
let cmd_buf_pool = Vec::new();
@ -82,7 +85,7 @@ impl Runner {
let submitted = self.session.run_cmd_buf(cmd_buf, &[], &[]).unwrap();
self.cmd_buf_pool.extend(submitted.wait().unwrap());
let timestamps = self.session.fetch_query_pool(&query_pool).unwrap();
timestamps[0]
timestamps.get(0).copied().unwrap_or_default()
}
#[allow(unused)]
@ -114,6 +117,10 @@ impl Runner {
.unwrap();
BufDown { stage_buf, dev_buf }
}
pub fn backend_type(&self) -> BackendType {
self.session.backend_type()
}
}
impl Commands {
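The timestamp change above makes submit robust when a backend returns no query results; indexing with get avoids a panic on an empty vector. A standalone illustration (the function name is assumed):

fn elapsed_or_zero(timestamps: &[f64]) -> f64 {
    // timestamps[0] would panic on an empty slice; default to 0.0 instead.
    timestamps.get(0).copied().unwrap_or_default()
}

fn main() {
    assert_eq!(elapsed_or_zero(&[]), 0.0);
    assert_eq!(elapsed_or_zero(&[1.5, 2.5]), 1.5);
}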

View file

@ -21,10 +21,17 @@ pub struct TestResult {
// TODO: statistics. We're lean and mean for now.
total_time: f64,
n_elements: u64,
failure: Option<String>,
status: Status,
}
#[derive(Clone, Copy)]
pub enum Status {
Pass,
Fail(String),
#[allow(unused)]
Skipped(String),
}
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum ReportStyle {
Short,
Verbose,
@ -36,14 +43,15 @@ impl TestResult {
name: name.to_string(),
total_time: 0.0,
n_elements: 0,
failure: None,
status: Status::Pass,
}
}
pub fn report(&self, style: ReportStyle) {
let fail_string = match &self.failure {
None => "pass".into(),
Some(s) => format!("fail ({})", s),
let fail_string = match &self.status {
Status::Pass => "pass".into(),
Status::Fail(s) => format!("fail ({})", s),
Status::Skipped(s) => format!("skipped ({})", s),
};
match style {
ReportStyle::Short => {
@ -73,8 +81,13 @@ impl TestResult {
}
}
pub fn fail(&mut self, explanation: String) {
self.failure = Some(explanation);
pub fn fail(&mut self, explanation: impl Into<String>) {
self.status = Status::Fail(explanation.into());
}
#[allow(unused)]
pub fn skip(&mut self, explanation: impl Into<String>) {
self.status = Status::Skipped(explanation.into());
}
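Switching fail and skip to impl Into<String> lets call sites pass either a borrowed or an owned string, which is how the call sites earlier in this diff use them (a format!(..) String in clear.rs, a string literal in prefix.rs). A standalone model of the pattern:

fn describe(explanation: impl Into<String>) -> String {
    explanation.into()
}

fn main() {
    // Both call styles compile against the one signature.
    assert_eq!(describe("Shader won't compile on FXC"), "Shader won't compile on FXC");
    assert_eq!(describe(format!("failure at {}", 3)), "failure at 3");
}
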
pub fn timing(&mut self, total_time: f64, n_elements: u64) {