diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index d31bb4c..e974cde 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -1,13 +1,11 @@
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::Instance;
-use piet_gpu_hal::BufferUsage;
 use piet_gpu_hal::include_shader;
+use piet_gpu_hal::{BufferUsage, Instance, Session};
 
 fn main() {
     let (instance, _) = Instance::new(None).unwrap();
     unsafe {
         let device = instance.device(None).unwrap();
-        let session = hub::Session::new(device);
+        let session = Session::new(device);
         let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE;
         let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
         let buffer = session.create_buffer_init(&src, usage).unwrap();
diff --git a/piet-gpu-hal/examples/dx12_toy.rs b/piet-gpu-hal/examples/dx12_toy.rs
deleted file mode 100644
index d8df466..0000000
--- a/piet-gpu-hal/examples/dx12_toy.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-//! An example to exercise the dx12 backend, while it's being developed.
-//! This will probably go away when it's fully implemented and we can
-//! just use the hub.
-
-use piet_gpu_hal::{dx12, BufferUsage, Error};
-use piet_gpu_hal::backend::{CmdBuf, Device};
-
-const SHADER_CODE: &str = r#"RWByteAddressBuffer _53 : register(u0, space0);
-
-RWTexture2D<float4> textureOut : register(u1);
-
-static uint3 gl_GlobalInvocationID;
-struct SPIRV_Cross_Input
-{
-    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
-};
-
-uint collatz_iterations(inout uint n)
-{
-    uint i = 0u;
-    while (n != 1u)
-    {
-        if ((n & 1u) == 0u)
-        {
-            n /= 2u;
-        }
-        else
-        {
-            n = (3u * n) + 1u;
-        }
-        i++;
-    }
-    return i;
-}
-
-void comp_main()
-{
-    uint index = gl_GlobalInvocationID.x;
-    uint param = _53.Load(index * 4 + 0);
-    uint _61 = collatz_iterations(param);
-    _53.Store(index * 4 + 0, _61);
-    textureOut[uint2(index, 0)] = float4(1.0, 0.0, 0.0, 1.0);
-}
-
-[numthreads(256, 1, 1)]
-void main(SPIRV_Cross_Input stage_input)
-{
-    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
-    comp_main();
-}
-"#;
-
-fn toy() -> Result<(), Error> {
-    let (instance, _surface) = dx12::Dx12Instance::new(None)?;
-    let device = instance.device(None)?;
-    let buf = device.create_buffer(
-        1024,
-        BufferUsage::MAP_READ
-            | BufferUsage::MAP_WRITE
-            | BufferUsage::COPY_SRC
-            | BufferUsage::COPY_DST,
-    )?;
-    let dev_buf = device.create_buffer(
-        1024,
-        BufferUsage::STORAGE | BufferUsage::COPY_SRC | BufferUsage::COPY_DST,
-    )?;
-    let img_readback_buf =
-        device.create_buffer(1024, BufferUsage::MAP_READ | BufferUsage::COPY_DST)?;
-    let data: Vec<u32> = (1..257).collect();
-    let query_pool = device.create_query_pool(2)?;
-    unsafe {
-        let img = device.create_image2d(256, 1)?;
-        device.write_buffer(&buf, data.as_ptr() as *const u8, 0, 1024)?;
-        let pipeline = device.create_simple_compute_pipeline(SHADER_CODE, 1, 1)?;
-        let ds = device.create_descriptor_set(&pipeline, &[&dev_buf], &[&img])?;
-        let mut cmd_buf = device.create_cmd_buf()?;
-        let mut fence = device.create_fence(false)?;
-        cmd_buf.begin();
-        cmd_buf.copy_buffer(&buf, &dev_buf);
-        cmd_buf.memory_barrier();
-        cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1), (256, 1, 1));
-        cmd_buf.write_timestamp(&query_pool, 1);
-        cmd_buf.memory_barrier();
-        cmd_buf.copy_buffer(&dev_buf, &buf);
-        cmd_buf.copy_image_to_buffer(&img, &img_readback_buf);
-        cmd_buf.finish_timestamps(&query_pool);
-        cmd_buf.host_barrier();
-        cmd_buf.finish();
-        device.run_cmd_bufs(&[&cmd_buf], &[], &[], Some(&mut fence))?;
-        device.wait_and_reset(vec![&mut fence])?;
-        let mut readback: Vec<u32> = vec![0u32; 256];
-        device.read_buffer(&buf, readback.as_mut_ptr() as *mut u8, 0, 1024)?;
-        println!("{:?}", readback);
-        println!("{:?}", device.fetch_query_pool(&query_pool));
-    }
-    Ok(())
-}
-
-fn main() {
-    toy().unwrap();
-}
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index abdb077..7b1d59f 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -16,7 +16,7 @@
 
 //! The generic trait for backends to implement.
 
-use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, mux::ShaderCode};
+use crate::{mux::ShaderCode, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
 pub trait Device: Sized {
     type Buffer: 'static;
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 8822b8a..b280881 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -13,10 +13,18 @@ use smallvec::SmallVec;
 
 use crate::mux;
 
-use crate::{BufferUsage, Error, GpuInfo, SamplerParams};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
 pub use crate::mux::{DescriptorSet, Fence, Pipeline, QueryPool, Sampler, Semaphore, ShaderCode};
 
+/// A session of GPU operations.
+///
+/// This abstraction is generally called a "device" in other APIs, but that
+/// term is very overloaded. It is the point to access resource creation,
+/// work submission, and related concerns.
+///
+/// Most of the methods are `&self`, indicating that they can be called from
+/// multiple threads.
 #[derive(Clone)]
 pub struct Session(Arc<SessionInner>);
 
@@ -30,6 +38,10 @@ struct SessionInner {
     gpu_info: GpuInfo,
 }
 
+/// A command buffer.
+///
+/// Actual work done by the GPU is encoded into a command buffer and then
+/// submitted to the session in a batch.
 pub struct CmdBuf {
     cmd_buf: mux::CmdBuf,
     fence: Fence,
@@ -37,7 +49,13 @@ pub struct CmdBuf {
     session: Weak<SessionInner>,
 }
 
-// Maybe "pending" is a better name?
+/// A command buffer in submitted state.
+///
+/// Submission of a command buffer is asynchronous, meaning that the submit
+/// method returns immediately. The work done in the command buffer cannot
+/// be accessed (for example, readback from buffers written) until the the
+/// submission is complete. The main purpose of this structure is to wait on
+/// that completion.
 pub struct SubmittedCmdBuf(Option<SubmittedCmdBufInner>, Weak<SessionInner>);
 
 struct SubmittedCmdBufInner {
@@ -49,6 +67,9 @@ struct SubmittedCmdBufInner {
     staging_cmd_buf: Option<CmdBuf>,
 }
 
+/// An image or texture.
+///
+/// At the moment, images are limited to 2D.
 #[derive(Clone)]
 pub struct Image(Arc<ImageInner>);
 
@@ -57,6 +78,11 @@ struct ImageInner {
     session: Weak<SessionInner>,
 }
 
+/// A buffer.
+///
+/// A buffer is a segment of memory that can be accessed by the GPU, and
+/// in some cases also by the host (if the appropriate [`BufferUsage`] flags
+/// are set).
 #[derive(Clone)]
 pub struct Buffer(Arc<BufferInner>);
 
@@ -65,8 +91,15 @@ struct BufferInner {
     session: Weak<SessionInner>,
 }
 
+/// A builder for creating pipelines.
+///
+/// Configure the signature (buffers and images accessed) for a pipeline,
+/// which is essentially compiled shader code, ready to be dispatched.
 pub struct PipelineBuilder(mux::PipelineBuilder);
 
+/// A builder for creating descriptor sets.
+///
+/// Add bindings to the descriptor set before dispatching a shader.
 pub struct DescriptorSetBuilder(mux::DescriptorSetBuilder);
 
 /// Data types that can be stored in a GPU buffer.
@@ -90,6 +123,7 @@ pub enum RetainResource {
 }
 
 impl Session {
+    /// Create a new session, choosing the best backend.
     pub fn new(device: mux::Device) -> Session {
         let gpu_info = device.query_gpu_info();
         Session(Arc::new(SessionInner {
@@ -101,6 +135,14 @@ impl Session {
         }))
     }
 
+    /// Create a new command buffer.
+    ///
+    /// The caller is responsible for inserting pipeline barriers and other
+    /// transitions. If one dispatch writes a buffer (or image), and another
+    /// reads it, a barrier must intervene. No such barrier is needed for
+    /// uploads by the host before command submission, but a host barrier is
+    /// needed if the host will do readback of any buffers written by the
+    /// command list.
     pub fn cmd_buf(&self) -> Result<CmdBuf, Error> {
         self.poll_cleanup();
         let (cmd_buf, fence) = if let Some(cf) = self.0.cmd_buf_pool.lock().unwrap().pop() {
@@ -141,6 +183,12 @@ impl Session {
         }
     }
 
+    /// Run a command buffer.
+    ///
+    /// The semaphores are for swapchain presentation and can be empty for
+    /// compute-only work. When provided, work is synchronized to start only
+    /// when the wait semaphores are signaled, and when work is complete, the
+    /// signal semaphores are signaled.
     pub unsafe fn run_cmd_buf(
         &self,
         mut cmd_buf: CmdBuf,
@@ -175,6 +223,13 @@ impl Session {
         ))
     }
 
+    /// Create a buffer.
+    ///
+    /// The `usage` flags must be specified to indicate what the buffer will
+    /// be used for. In general, when no `MAP_` flags are specified, the buffer
+    /// will be created in device memory, which means they are not host
+    /// accessible, but GPU access is much higher performance (at least on
+    /// discrete GPUs).
     pub fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Buffer, Error> {
         let buffer = self.0.device.create_buffer(size, usage)?;
         Ok(Buffer(Arc::new(BufferInner {
@@ -184,6 +239,10 @@ impl Session {
     }
 
     /// Create a buffer with initialized data.
+    ///
+    /// This method takes care of creating a staging buffer if needed, so
+    /// it is not necessary to specify `MAP_WRITE` usage, unless of course
+    /// the buffer will subsequently be written by the host.
     pub fn create_buffer_init(
         &self,
         contents: &[impl PlainData],
@@ -226,7 +285,7 @@ impl Session {
             }
             let staging_cmd_buf = staging_cmd_buf.as_mut().unwrap();
             // This will ensure the staging buffer is deallocated.
-            staging_cmd_buf.copy_buffer(create_buf.mux_buffer(), buf.mux_buffer());
+            staging_cmd_buf.copy_buffer(&create_buf, &buf);
             staging_cmd_buf.add_resource(create_buf);
             Ok(buf)
         } else {
@@ -234,6 +293,10 @@ impl Session {
         }
     }
 
+    /// Create an image.
+    ///
+    /// Currently this creates only a 2D image in RGBA8 format, with usage
+    /// so that it can be accessed by shaders and used for transfer.
     pub unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Image, Error> {
         let image = self.0.device.create_image2d(width, height)?;
         Ok(Image(Arc::new(ImageInner {
@@ -242,13 +305,18 @@ impl Session {
         })))
     }
 
+    /// Create a semaphore.
+    ///
+    /// These "semaphores" are only for swapchain integration and may be
+    /// stubs on back-ends that don't require semaphore synchronization.
     pub unsafe fn create_semaphore(&self) -> Result<Semaphore, Error> {
         self.0.device.create_semaphore()
     }
 
     /// This creates a pipeline that operates on some buffers and images.
     ///
-    /// The descriptor set layout is just some number of storage buffers and storage images (this might change).
+    /// The descriptor set layout is just some number of storage buffers
+    /// and storage images (this might change).
     pub unsafe fn create_simple_compute_pipeline<'a>(
         &self,
         code: ShaderCode<'a>,
@@ -259,6 +327,14 @@ impl Session {
             .create_compute_pipeline(self, code)
     }
 
+    /// Start building a pipeline.
+    ///
+    /// A pipeline is essentially a compiled shader, with more specific
+    /// details about what resources may be bound to it.
+    pub unsafe fn pipeline_builder(&self) -> PipelineBuilder {
+        PipelineBuilder(self.0.device.pipeline_builder())
+    }
+
     /// Create a descriptor set for a simple pipeline that just references buffers.
     pub unsafe fn create_simple_descriptor_set<'a>(
         &self,
@@ -270,28 +346,37 @@ impl Session {
             .build(self, pipeline)
     }
 
+    /// Start building a descriptor set.
+    ///
+    /// A descriptor set is a binding of actual resources (buffers and
+    /// images) to slots as specified in the pipeline.
+    pub unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
+        DescriptorSetBuilder(self.0.device.descriptor_set_builder())
+    }
+
     /// Create a query pool for timestamp queries.
     pub fn create_query_pool(&self, n_queries: u32) -> Result<QueryPool, Error> {
         self.0.device.create_query_pool(n_queries)
     }
 
+    /// Fetch the contents of the query pool.
+    ///
+    /// This should be called after waiting on the command buffer that wrote the
+    /// timer queries.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
         self.0.device.fetch_query_pool(pool)
     }
 
-    pub unsafe fn pipeline_builder(&self) -> PipelineBuilder {
-        PipelineBuilder(self.0.device.pipeline_builder())
-    }
-
-    pub unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
-        DescriptorSetBuilder(self.0.device.descriptor_set_builder())
-    }
-
+    #[doc(hidden)]
+    /// Create a sampler.
+    ///
+    /// Noy yet implemented.
     pub unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Sampler, Error> {
         todo!()
         //self.0.device.create_sampler(params)
     }
 
+    /// Query the GPU info.
     pub fn gpu_info(&self) -> &GpuInfo {
         &self.0.gpu_info
     }
@@ -303,6 +388,149 @@ impl Session {
 }
 
 impl CmdBuf {
+    /// Begin recording into a command buffer.
+    ///
+    /// Always call this before encoding any actual work.
+    ///
+    /// Discussion question: can this be subsumed?
+    pub unsafe fn begin(&mut self) {
+        self.cmd_buf.begin();
+    }
+
+    /// Finish recording into a command buffer.
+    ///
+    /// Always call this as the last method before submitting the command
+    /// buffer.
+    pub unsafe fn finish(&mut self) {
+        self.cmd_buf.finish();
+    }
+
+    /// Dispatch a compute shader.
+    ///
+    /// Request a compute shader to be run, using the pipeline to specify the
+    /// code, and the descriptor set to address the resources read and written.
+    ///
+    /// Both the workgroup count (number of workgroups) and the workgroup size
+    /// (number of threads in a workgroup) must be specified here, though not
+    /// all back-ends require the latter info.
+    pub unsafe fn dispatch(
+        &mut self,
+        pipeline: &Pipeline,
+        descriptor_set: &DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    ) {
+        self.cmd_buf
+            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    }
+
+    /// Insert an execution and memory barrier.
+    ///
+    /// Compute kernels (and other actions) after this barrier may read from buffers
+    /// that were written before this barrier.
+    pub unsafe fn memory_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
+    /// Insert a barrier for host access to buffers.
+    ///
+    /// The host may read buffers written before this barrier, after the fence for
+    /// the command buffer is signaled.
+    ///
+    /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
+    /// ("Host memory reads") for an explanation of this barrier.
+    pub unsafe fn host_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
+    /// Insert an image barrier, transitioning image layout.
+    ///
+    /// When an image is written by one command and then read by another, an image
+    /// barrier must separate the uses. Also, the image layout must match the use
+    /// of the image.
+    ///
+    /// Additionally, when writing to an image for the first time, it must be
+    /// transitioned from an unknown layout to specify the layout.
+    pub unsafe fn image_barrier(
+        &mut self,
+        image: &Image,
+        src_layout: ImageLayout,
+        dst_layout: ImageLayout,
+    ) {
+        self.cmd_buf
+            .image_barrier(image.mux_image(), src_layout, dst_layout);
+    }
+
+    /// Clear the buffer.
+    ///
+    /// When the size is not specified, it clears the whole buffer.
+    pub unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
+        self.cmd_buf.clear_buffer(buffer.mux_buffer(), size);
+    }
+
+    /// Copy one buffer to another.
+    ///
+    /// When the buffers differ in size, the minimum of the sizes is used.
+    pub unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
+        self.cmd_buf.copy_buffer(src.mux_buffer(), dst.mux_buffer());
+    }
+
+    /// Copy an image to a buffer.
+    ///
+    /// The size of the image and buffer must match.
+    pub unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
+        self.cmd_buf
+            .copy_image_to_buffer(src.mux_image(), dst.mux_buffer());
+        // TODO: change the backend signature to allow failure, as in "not
+        // implemented" or "unaligned", and fall back to compute shader
+        // submission.
+    }
+
+    /// Copy a buffer to an image.
+    ///
+    /// The size of the image and buffer must match.
+    pub unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
+        self.cmd_buf
+            .copy_buffer_to_image(src.mux_buffer(), dst.mux_image());
+        // See above.
+    }
+
+    /// Copy an image to another.
+    ///
+    /// This is especially useful for writing to the swapchain image, as in
+    /// general that can't be bound to a compute shader.
+    ///
+    /// Discussion question: we might have a specialized version of this
+    /// function for copying to the swapchain image, and a separate type.
+    pub unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
+        self.cmd_buf.blit_image(src.mux_image(), dst.mux_image());
+    }
+
+    /// Reset the query pool.
+    ///
+    /// The query pool must be reset before each use, to avoid validation errors.
+    /// This is annoying, and we could tweak the API to make it implicit, doing
+    /// the reset before the first timestamp write.
+    pub unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
+        self.cmd_buf.reset_query_pool(pool);
+    }
+
+    /// Write a timestamp.
+    ///
+    /// The query index must be less than the size of the query pool on creation.
+    pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
+        self.cmd_buf.write_timestamp(pool, query);
+    }
+
+    /// Prepare the timestamps for reading. This isn't required on Vulkan but
+    /// is required on (at least) DX12.
+    ///
+    /// It's possible we'll make this go away, by implicitly including it
+    /// on command buffer submission when a query pool has been written.
+    pub unsafe fn finish_timestamps(&mut self, pool: &QueryPool) {
+        self.cmd_buf.finish_timestamps(pool);
+    }
+
     /// Make sure the resource lives until the command buffer completes.
     ///
     /// The submitted command buffer will hold this reference until the corresponding
@@ -317,6 +545,17 @@ impl CmdBuf {
 }
 
 impl SubmittedCmdBuf {
+    /// Wait for the work to complete.
+    ///
+    /// After calling this function, buffers written by the command buffer
+    /// can be read (assuming they were created with `MAP_READ` usage and also
+    /// that a host barrier was placed in the command list).
+    ///
+    /// Further, resources referenced by the command list may be destroyed or
+    /// reused; it is a safety violation to do so beforehand.
+    ///
+    /// Resources for which destruction was deferred through
+    /// [`add_resource`][`CmdBuf::add_resource`] will actually be dropped here.
     pub fn wait(mut self) -> Result<(), Error> {
         let mut item = self.0.take().unwrap();
         if let Some(session) = Weak::upgrade(&self.1) {
@@ -365,31 +604,33 @@ impl Drop for ImageInner {
     }
 }
 
-// Probably migrate from deref here to wrapping all methods.
-impl std::ops::Deref for CmdBuf {
-    type Target = mux::CmdBuf;
-    fn deref(&self) -> &Self::Target {
-        &self.cmd_buf
-    }
-}
-
-impl std::ops::DerefMut for CmdBuf {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.cmd_buf
-    }
-}
-
 impl Image {
-    pub fn mux_image(&self) -> &mux::Image {
+    /// Get a lower level image handle.
+    pub(crate) fn mux_image(&self) -> &mux::Image {
         &self.0.image
     }
+
+    /// Wrap a swapchain image so it can be exported to the hub level.
+    /// Swapchain images don't need resource tracking (or at least we
+    /// don't do it), so no session ref is needed.
+    pub(crate) fn wrap_swapchain_image(image: mux::Image) -> Image {
+        Image(Arc::new(ImageInner {
+            image,
+            session: Weak::new(),
+        }))
+    }
 }
 
 impl Buffer {
-    pub fn mux_buffer(&self) -> &mux::Buffer {
+    /// Get a lower level buffer handle.
+    pub(crate) fn mux_buffer(&self) -> &mux::Buffer {
         &self.0.buffer
     }
 
+    /// Write the buffer contents.
+    ///
+    /// The buffer must have been created with `MAP_WRITE` usage, and with
+    /// a size large enough to accommodate the given slice.
     pub unsafe fn write<T: PlainData>(&mut self, contents: &[T]) -> Result<(), Error> {
         if let Some(session) = Weak::upgrade(&self.0.session) {
             session.device.write_buffer(
@@ -402,6 +643,12 @@ impl Buffer {
         // else session lost error?
         Ok(())
     }
+
+    /// Read the buffer contents.
+    ///
+    /// The buffer must have been created with `MAP_READ` usage. The caller
+    /// is also responsible for ensuring that this does not read uninitialized
+    /// memory.
     pub unsafe fn read<T: PlainData>(&self, result: &mut Vec<T>) -> Result<(), Error> {
         let size = self.mux_buffer().size();
         let len = size as usize / std::mem::size_of::<T>();
@@ -438,6 +685,10 @@ impl PipelineBuilder {
         self
     }
 
+    /// Create the compute pipeline.
+    ///
+    /// The shader code must be given in an appropriate format for
+    /// the back-end. See [`Session::choose_shader`] for a helper.
     pub unsafe fn create_compute_pipeline<'a>(
         self,
         session: &Session,
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index b5bc8b5..0739b13 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -4,43 +4,58 @@
 /// In time, it may go away and be replaced by either gfx-hal or wgpu.
 use bitflags::bitflags;
 
-pub mod backend;
-pub mod hub;
+mod backend;
+mod hub;
 
 #[macro_use]
 mod macros;
 
-// TODO: Don't make the module pub, but do figure out which types to
-// export at the root level.
-pub mod mux;
+mod mux;
+
+pub use crate::mux::{
+    DescriptorSet, Fence, Instance, Pipeline, QueryPool, Sampler, Semaphore, ShaderCode, Surface,
+    Swapchain,
+};
+pub use hub::{
+    Buffer, CmdBuf, DescriptorSetBuilder, Image, PipelineBuilder, PlainData, RetainResource,
+    Session, SubmittedCmdBuf,
+};
 
 // TODO: because these are conditionally included, "cargo fmt" does not
 // see them. Figure that out, possibly including running rustfmt manually.
 mux_cfg! {
     #[cfg(vk)]
-    pub mod vulkan;
+    mod vulkan;
 }
 mux_cfg! {
     #[cfg(dx12)]
-    pub mod dx12;
+    mod dx12;
 }
 #[cfg(target_os = "macos")]
-pub mod metal;
+mod metal;
 
 /// The common error type for the crate.
 ///
 /// This keeps things imple and can be expanded later.
 pub type Error = Box<dyn std::error::Error>;
 
-pub use crate::backend::CmdBuf;
-
+/// An image layout state.
+///
+/// An image must be in a particular layout state to be used for
+/// a purpose such as being bound to a shader.
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum ImageLayout {
+    /// The initial state for a newly created image.
     Undefined,
+    /// A swapchain ready to be presented.
     Present,
+    /// The source for a copy operation.
     BlitSrc,
+    /// The destination for a copy operation.
     BlitDst,
+    /// Read/write binding to a shader.
     General,
+    /// Able to be sampled from by shaders.
     ShaderRead,
 }
 
@@ -55,7 +70,7 @@ pub enum SamplerParams {
 }
 
 bitflags! {
-    /// The intended usage for this buffer.
+    /// The intended usage for a buffer, specified on creation.
     pub struct BufferUsage: u32 {
         /// The buffer can be mapped for reading CPU-side.
         const MAP_READ = 0x1;
@@ -92,6 +107,11 @@ pub struct GpuInfo {
     pub use_staging_buffers: bool,
 }
 
+/// The range of subgroup sizes supported by a back-end, when available.
+///
+/// The subgroup size is always a power of 2. The ability to specify
+/// subgroup size for a compute shader is a newer feature, not always
+/// available.
 #[derive(Clone, Debug)]
 pub struct SubgroupSize {
     min: u32,
diff --git a/piet-gpu-hal/src/macros.rs b/piet-gpu-hal/src/macros.rs
index 3456342..8131e50 100644
--- a/piet-gpu-hal/src/macros.rs
+++ b/piet-gpu-hal/src/macros.rs
@@ -16,6 +16,8 @@
 
 //! Macros, mostly to automate backend selection tedium.
 
+#[doc(hidden)]
+/// Configure an item to be included only for the given GPU.
 #[macro_export]
 macro_rules! mux_cfg {
     ( #[cfg(vk)] $($tokens:tt)* ) => {
@@ -31,6 +33,8 @@ macro_rules! mux_cfg {
     };
 }
 
+#[doc(hidden)]
+/// Define an enum with a variant per GPU.
 #[macro_export]
 macro_rules! mux_enum {
     ( $(#[$outer:meta])* $v:vis enum $name:ident {
@@ -112,6 +116,7 @@ macro_rules! mux_enum {
     };
 }
 
+/// Define an enum with a variant per GPU for a Device associated type.
 macro_rules! mux_device_enum {
     ( $(#[$outer:meta])* $assoc_type: ident) => {
         $crate::mux_enum! {
@@ -125,6 +130,8 @@ macro_rules! mux_device_enum {
     }
 }
 
+#[doc(hidden)]
+/// A match statement where match arms are conditionally configured per GPU.
 #[macro_export]
 macro_rules! mux_match {
     ( $e:expr ;
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index 7a83ae9..8f93eb6 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -74,14 +74,28 @@ mux_enum! {
 
 mux_device_enum! { Buffer }
 mux_device_enum! { Image }
-mux_device_enum! { Fence }
-mux_device_enum! { Semaphore }
+mux_device_enum! {
+/// An object for waiting on command buffer completion.
+Fence }
+mux_device_enum! {
+/// A semaphore for swapchain presentation.
+///
+/// Depending on what kind of synchronization is needed for swapchain
+/// presentation by the back-end, this may or may not be a "real"
+/// semaphore.
+Semaphore }
 mux_device_enum! { PipelineBuilder }
-mux_device_enum! { Pipeline }
+mux_device_enum! {
+/// A pipeline object; basically a compiled shader.
+Pipeline }
 mux_device_enum! { DescriptorSetBuilder }
-mux_device_enum! { DescriptorSet }
+mux_device_enum! {
+/// A descriptor set; a binding of resources for access by a shader.
+DescriptorSet }
 mux_device_enum! { CmdBuf }
-mux_device_enum! { QueryPool }
+mux_device_enum! {
+/// An object for recording timer queries.
+QueryPool }
 mux_device_enum! { Sampler }
 
 /// The code for a shader, either as source or intermediate representation.
@@ -402,7 +416,12 @@ impl Device {
     }
 
     /// Choose shader code from the available choices.
-    pub fn choose_shader<'a>(&self, _spv: &'a [u8], _hlsl: &'a str, _msl: &'a str) -> ShaderCode<'a> {
+    pub fn choose_shader<'a>(
+        &self,
+        _spv: &'a [u8],
+        _hlsl: &'a str,
+        _msl: &'a str,
+    ) -> ShaderCode<'a> {
         mux_match! { self;
             Device::Vk(_d) => ShaderCode::Spv(_spv),
             Device::Dx12(_d) => ShaderCode::Hlsl(_hlsl),
@@ -731,7 +750,11 @@ impl Swapchain {
         }
     }
 
-    pub unsafe fn image(&self, idx: usize) -> Image {
+    pub unsafe fn image(&self, idx: usize) -> crate::Image {
+        crate::Image::wrap_swapchain_image(self.image_raw(idx))
+    }
+
+    pub unsafe fn image_raw(&self, idx: usize) -> Image {
         mux_match! { self;
             Swapchain::Vk(s) => Image::Vk(s.image(idx)),
             Swapchain::Dx12(s) => Image::Dx12(s.image(idx)),
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 1084505..3eee69a 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -822,7 +822,7 @@ impl crate::backend::Device for VkDevice {
     }
 }
 
-impl crate::CmdBuf<VkDevice> for CmdBuf {
+impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
     unsafe fn begin(&mut self) {
         self.device
             .device
diff --git a/piet-gpu/bin/android.rs b/piet-gpu/bin/android.rs
index ce37503..d3a2aa3 100644
--- a/piet-gpu/bin/android.rs
+++ b/piet-gpu/bin/android.rs
@@ -11,9 +11,10 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 use ndk::native_window::NativeWindow;
 use ndk_glue::Event;
 
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::{Instance, QueryPool, Surface, Swapchain};
-use piet_gpu_hal::{CmdBuf, Error, ImageLayout};
+use piet_gpu_hal::{
+    Error, ImageLayout, Instance, QueryPool, Semaphore, Session, SubmittedCmdBuf, Surface,
+    Swapchain,
+};
 
 use piet_gpu::{render_scene, PietGpuRenderContext, Renderer};
 
@@ -28,14 +29,14 @@ struct MyHandle {
 
 // State required to render and present the contents
 struct GfxState {
-    session: hub::Session,
+    session: Session,
     renderer: Renderer,
     swapchain: Swapchain,
     current_frame: usize,
     last_frame_idx: usize,
-    submitted: Option<hub::SubmittedCmdBuf>,
+    submitted: Option<SubmittedCmdBuf>,
     query_pools: Vec<QueryPool>,
-    present_semaphores: Vec<hub::Semaphore>,
+    present_semaphores: Vec<Semaphore>,
 }
 
 const WIDTH: usize = 1080;
@@ -95,7 +96,7 @@ impl GfxState {
             let device = instance.device(surface)?;
             let mut swapchain =
                 instance.swapchain(WIDTH / 2, HEIGHT / 2, &device, surface.unwrap())?;
-            let session = hub::Session::new(device);
+            let session = Session::new(device);
             let mut current_frame = 0;
             let present_semaphores = (0..NUM_FRAMES)
                 .map(|_| session.create_semaphore())
@@ -113,7 +114,7 @@ impl GfxState {
 
             let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
 
-            let submitted: Option<hub::SubmittedCmdBuf> = None;
+            let submitted: Option<SubmittedCmdBuf> = None;
             let current_frame = 0;
             let last_frame_idx = 0;
             Ok(GfxState {
@@ -151,7 +152,7 @@ impl GfxState {
 
             // Image -> Swapchain
             cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst);
-            cmd_buf.blit_image(self.renderer.image_dev.mux_image(), &swap_image);
+            cmd_buf.blit_image(&self.renderer.image_dev, &swap_image);
             cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
             cmd_buf.finish();
 
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index a1d766d..92950b5 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -4,9 +4,7 @@ use std::path::Path;
 
 use clap::{App, Arg};
 
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::Instance;
-use piet_gpu_hal::{BufferUsage, Error};
+use piet_gpu_hal::{BufferUsage, Error, Instance, Session};
 
 use piet_gpu::{render_scene, render_svg, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
 
@@ -228,7 +226,7 @@ fn main() -> Result<(), Error> {
     let (instance, _) = Instance::new(None)?;
     unsafe {
         let device = instance.device(None)?;
-        let session = hub::Session::new(device);
+        let session = Session::new(device);
 
         let mut cmd_buf = session.cmd_buf()?;
         let query_pool = session.create_query_pool(8)?;
@@ -258,7 +256,7 @@ fn main() -> Result<(), Error> {
 
         cmd_buf.begin();
         renderer.record(&mut cmd_buf, &query_pool);
-        cmd_buf.copy_image_to_buffer(renderer.image_dev.mux_image(), image_buf.mux_buffer());
+        cmd_buf.copy_image_to_buffer(&renderer.image_dev, &image_buf);
         cmd_buf.host_barrier();
         cmd_buf.finish();
         let start = std::time::Instant::now();
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index af60a3a..b7b838b 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -1,6 +1,4 @@
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::Instance;
-use piet_gpu_hal::{Error, ImageLayout};
+use piet_gpu_hal::{Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
 
 use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
 
@@ -27,7 +25,7 @@ fn main() -> Result<(), Error> {
         let device = instance.device(surface.as_ref())?;
         let mut swapchain =
             instance.swapchain(WIDTH / 2, HEIGHT / 2, &device, surface.as_ref().unwrap())?;
-        let session = hub::Session::new(device);
+        let session = Session::new(device);
 
         let mut current_frame = 0;
         let present_semaphores = (0..NUM_FRAMES)
@@ -46,7 +44,7 @@ fn main() -> Result<(), Error> {
 
         let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
 
-        let mut submitted: Option<hub::SubmittedCmdBuf> = None;
+        let mut submitted: Option<SubmittedCmdBuf> = None;
         let mut last_frame_idx = 0;
 
         event_loop.run(move |event, _, control_flow| {
@@ -89,7 +87,6 @@ fn main() -> Result<(), Error> {
                         ));
                     }
 
-
                     let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
                     let swap_image = swapchain.image(image_idx);
                     let query_pool = &query_pools[frame_idx];
@@ -103,7 +100,7 @@ fn main() -> Result<(), Error> {
                         ImageLayout::Undefined,
                         ImageLayout::BlitDst,
                     );
-                    cmd_buf.blit_image(renderer.image_dev.mux_image(), &swap_image);
+                    cmd_buf.blit_image(&renderer.image_dev, &swap_image);
                     cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
                     cmd_buf.finish();
 
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index b815655..ef70c9c 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -12,9 +12,10 @@ use piet::{Color, ImageFormat, RenderContext};
 
 use piet_gpu_types::encoder::Encode;
 
-use piet_gpu_hal::hub;
-use piet_gpu_hal::hub::ShaderCode;
-use piet_gpu_hal::{BufferUsage, Error, ImageLayout};
+use piet_gpu_hal::{
+    Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline, QueryPool,
+    Session, ShaderCode,
+};
 
 use pico_svg::PicoSvg;
 
@@ -188,53 +189,53 @@ pub fn dump_k1_data(k1_buf: &[u32]) {
 }
 
 pub struct Renderer {
-    pub image_dev: hub::Image, // resulting image
+    pub image_dev: Image, // resulting image
 
     // The reference is held by the pipelines. We will be changing
     // this to make the scene upload dynamic.
     #[allow(dead_code)]
-    scene_buf: hub::Buffer,
+    scene_buf: Buffer,
 
-    memory_buf_host: hub::Buffer,
-    memory_buf_dev: hub::Buffer,
+    memory_buf_host: Buffer,
+    memory_buf_dev: Buffer,
 
-    state_buf: hub::Buffer,
+    state_buf: Buffer,
 
     #[allow(dead_code)]
-    config_buf: hub::Buffer,
+    config_buf: Buffer,
 
-    el_pipeline: hub::Pipeline,
-    el_ds: hub::DescriptorSet,
+    el_pipeline: Pipeline,
+    el_ds: DescriptorSet,
 
-    tile_pipeline: hub::Pipeline,
-    tile_ds: hub::DescriptorSet,
+    tile_pipeline: Pipeline,
+    tile_ds: DescriptorSet,
 
-    path_pipeline: hub::Pipeline,
-    path_ds: hub::DescriptorSet,
+    path_pipeline: Pipeline,
+    path_ds: DescriptorSet,
 
-    backdrop_pipeline: hub::Pipeline,
-    backdrop_ds: hub::DescriptorSet,
+    backdrop_pipeline: Pipeline,
+    backdrop_ds: DescriptorSet,
 
-    bin_pipeline: hub::Pipeline,
-    bin_ds: hub::DescriptorSet,
+    bin_pipeline: Pipeline,
+    bin_ds: DescriptorSet,
 
-    coarse_pipeline: hub::Pipeline,
-    coarse_ds: hub::DescriptorSet,
+    coarse_pipeline: Pipeline,
+    coarse_ds: DescriptorSet,
 
-    k4_pipeline: hub::Pipeline,
-    k4_ds: hub::DescriptorSet,
+    k4_pipeline: Pipeline,
+    k4_ds: DescriptorSet,
 
     n_elements: usize,
     n_paths: usize,
     n_pathseg: usize,
 
     // Keep a reference to the image so that it is not destroyed.
-    _bg_image: hub::Image,
+    _bg_image: Image,
 }
 
 impl Renderer {
     pub unsafe fn new(
-        session: &hub::Session,
+        session: &Session,
         scene: &[u8],
         n_paths: usize,
         n_pathseg: usize,
@@ -385,15 +386,12 @@ impl Renderer {
         })
     }
 
-    pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) {
-        cmd_buf.copy_buffer(
-            self.memory_buf_host.mux_buffer(),
-            self.memory_buf_dev.mux_buffer(),
-        );
-        cmd_buf.clear_buffer(self.state_buf.mux_buffer(), None);
+    pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool) {
+        cmd_buf.copy_buffer(&self.memory_buf_host, &self.memory_buf_dev);
+        cmd_buf.clear_buffer(&self.state_buf, None);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(
-            self.image_dev.mux_image(),
+            &self.image_dev,
             ImageLayout::Undefined,
             ImageLayout::General,
         );
@@ -458,20 +456,16 @@ impl Renderer {
         );
         cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
-        cmd_buf.image_barrier(
-            self.image_dev.mux_image(),
-            ImageLayout::General,
-            ImageLayout::BlitSrc,
-        );
+        cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
 
     pub fn make_image(
-        session: &hub::Session,
+        session: &Session,
         width: usize,
         height: usize,
         buf: &[u8],
         format: ImageFormat,
-    ) -> Result<hub::Image, Error> {
+    ) -> Result<Image, Error> {
         unsafe {
             if format != ImageFormat::RgbaPremul {
                 return Err("unsupported image format".into());
@@ -482,17 +476,9 @@ impl Renderer {
             let image = session.create_image2d(width.try_into()?, height.try_into()?)?;
             let mut cmd_buf = session.cmd_buf()?;
             cmd_buf.begin();
-            cmd_buf.image_barrier(
-                image.mux_image(),
-                ImageLayout::Undefined,
-                ImageLayout::BlitDst,
-            );
-            cmd_buf.copy_buffer_to_image(buffer.mux_buffer(), image.mux_image());
-            cmd_buf.image_barrier(
-                image.mux_image(),
-                ImageLayout::BlitDst,
-                ImageLayout::General,
-            );
+            cmd_buf.image_barrier(&image, ImageLayout::Undefined, ImageLayout::BlitDst);
+            cmd_buf.copy_buffer_to_image(&buffer, &image);
+            cmd_buf.image_barrier(&image, ImageLayout::BlitDst, ImageLayout::General);
             cmd_buf.finish();
             // Make sure not to drop the buffer and image until the command buffer completes.
             cmd_buf.add_resource(&buffer);
@@ -504,7 +490,7 @@ impl Renderer {
     }
 
     /// Make a test image.
-    fn make_test_bg_image(session: &hub::Session) -> hub::Image {
+    fn make_test_bg_image(session: &Session) -> Image {
         const WIDTH: usize = 256;
         const HEIGHT: usize = 256;
         let mut buf = vec![255u8; WIDTH * HEIGHT * 4];