From bae185efbdc74441ca6d87f7440cfa00d160c02b Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Sat, 29 May 2021 16:33:52 -0700
Subject: [PATCH] API reorg

Move types into the toplevel and hide implementation details. Remove
deref of hub CmdBuf to mux. Restrict public visibility of internals.

Most items have some docs, though improvements are still possible. In
particular, there should be detailed safety info.
---
 piet-gpu-hal/examples/collatz.rs  |   6 +-
 piet-gpu-hal/examples/dx12_toy.rs | 102 ----------
 piet-gpu-hal/src/backend.rs       |   2 +-
 piet-gpu-hal/src/hub.rs           | 307 +++++++++++++++++++++++++++---
 piet-gpu-hal/src/lib.rs           |  42 ++--
 piet-gpu-hal/src/macros.rs        |   7 +
 piet-gpu-hal/src/mux.rs           |  37 +++-
 piet-gpu-hal/src/vulkan.rs        |   2 +-
 piet-gpu/bin/android.rs           |  19 +-
 piet-gpu/bin/cli.rs               |   8 +-
 piet-gpu/bin/winit.rs             |  11 +-
 piet-gpu/src/lib.rs               |  88 ++++-----
 12 files changed, 405 insertions(+), 226 deletions(-)
 delete mode 100644 piet-gpu-hal/examples/dx12_toy.rs
diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs
index d31bb4c..e974cde 100644
--- a/piet-gpu-hal/examples/collatz.rs
+++ b/piet-gpu-hal/examples/collatz.rs
@@ -1,13 +1,11 @@
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::Instance;
-use piet_gpu_hal::BufferUsage;
 use piet_gpu_hal::include_shader;
+use piet_gpu_hal::{BufferUsage, Instance, Session};
 
 fn main() {
     let (instance, _) = Instance::new(None).unwrap();
     unsafe {
         let device = instance.device(None).unwrap();
-        let session = hub::Session::new(device);
+        let session = Session::new(device);
         let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE;
         let src = (0..256).map(|x| x + 1).collect::<Vec<u32>>();
         let buffer = session.create_buffer_init(&src, usage).unwrap();
diff --git a/piet-gpu-hal/examples/dx12_toy.rs b/piet-gpu-hal/examples/dx12_toy.rs
deleted file mode 100644
index d8df466..0000000
--- a/piet-gpu-hal/examples/dx12_toy.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-//! An example to exercise the dx12 backend, while it's being developed.
-//! This will probably go away when it's fully implemented and we can
-//! just use the hub.
-
-use piet_gpu_hal::{dx12, BufferUsage, Error};
-use piet_gpu_hal::backend::{CmdBuf, Device};
-
-const SHADER_CODE: &str = r#"RWByteAddressBuffer _53 : register(u0, space0);
-
-RWTexture2D<float4> textureOut : register(u1);
-
-static uint3 gl_GlobalInvocationID;
-struct SPIRV_Cross_Input
-{
-    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
-};
-
-uint collatz_iterations(inout uint n)
-{
-    uint i = 0u;
-    while (n != 1u)
-    {
-        if ((n & 1u) == 0u)
-        {
-            n /= 2u;
-        }
-        else
-        {
-            n = (3u * n) + 1u;
-        }
-        i++;
-    }
-    return i;
-}
-
-void comp_main()
-{
-    uint index = gl_GlobalInvocationID.x;
-    uint param = _53.Load(index * 4 + 0);
-    uint _61 = collatz_iterations(param);
-    _53.Store(index * 4 + 0, _61);
-    textureOut[uint2(index, 0)] = float4(1.0, 0.0, 0.0, 1.0);
-}
-
-[numthreads(256, 1, 1)]
-void main(SPIRV_Cross_Input stage_input)
-{
-    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
-    comp_main();
-}
-"#;
-
-fn toy() -> Result<(), Error> {
-    let (instance, _surface) = dx12::Dx12Instance::new(None)?;
-    let device = instance.device(None)?;
-    let buf = device.create_buffer(
-        1024,
-        BufferUsage::MAP_READ
-            | BufferUsage::MAP_WRITE
-            | BufferUsage::COPY_SRC
-            | BufferUsage::COPY_DST,
-    )?;
-    let dev_buf = device.create_buffer(
-        1024,
-        BufferUsage::STORAGE | BufferUsage::COPY_SRC | BufferUsage::COPY_DST,
-    )?;
-    let img_readback_buf =
-        device.create_buffer(1024, BufferUsage::MAP_READ | BufferUsage::COPY_DST)?;
-    let data: Vec<u32> = (1..257).collect();
-    let query_pool = device.create_query_pool(2)?;
-    unsafe {
-        let img = device.create_image2d(256, 1)?;
-        device.write_buffer(&buf, data.as_ptr() as *const u8, 0, 1024)?;
-        let pipeline = device.create_simple_compute_pipeline(SHADER_CODE, 1, 1)?;
-        let ds = device.create_descriptor_set(&pipeline, &[&dev_buf], &[&img])?;
-        let mut cmd_buf = device.create_cmd_buf()?;
-        let mut fence = device.create_fence(false)?;
-        cmd_buf.begin();
-        cmd_buf.copy_buffer(&buf, &dev_buf);
-        cmd_buf.memory_barrier();
-        cmd_buf.write_timestamp(&query_pool, 0);
-        cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1), (256, 1, 1));
-        cmd_buf.write_timestamp(&query_pool, 1);
-        cmd_buf.memory_barrier();
-        cmd_buf.copy_buffer(&dev_buf, &buf);
-        cmd_buf.copy_image_to_buffer(&img, &img_readback_buf);
-        cmd_buf.finish_timestamps(&query_pool);
-        cmd_buf.host_barrier();
-        cmd_buf.finish();
-        device.run_cmd_bufs(&[&cmd_buf], &[], &[], Some(&mut fence))?;
-        device.wait_and_reset(vec![&mut fence])?;
-        let mut readback: Vec<u32> = vec![0u32; 256];
-        device.read_buffer(&buf, readback.as_mut_ptr() as *mut u8, 0, 1024)?;
-        println!("{:?}", readback);
-        println!("{:?}", device.fetch_query_pool(&query_pool));
-    }
-    Ok(())
-}
-
-fn main() {
-    toy().unwrap();
-}
diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs
index abdb077..7b1d59f 100644
--- a/piet-gpu-hal/src/backend.rs
+++ b/piet-gpu-hal/src/backend.rs
@@ -16,7 +16,7 @@
 
 //! The generic trait for backends to implement.
 
-use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, mux::ShaderCode};
+use crate::{mux::ShaderCode, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
 pub trait Device: Sized {
     type Buffer: 'static;
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
index 8822b8a..b280881 100644
--- a/piet-gpu-hal/src/hub.rs
+++ b/piet-gpu-hal/src/hub.rs
@@ -13,10 +13,18 @@ use smallvec::SmallVec;
 
 use crate::mux;
 
-use crate::{BufferUsage, Error, GpuInfo, SamplerParams};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
 
 pub use crate::mux::{DescriptorSet, Fence, Pipeline, QueryPool, Sampler, Semaphore, ShaderCode};
 
+/// A session of GPU operations.
+///
+/// This abstraction is generally called a "device" in other APIs, but that
+/// term is very overloaded. It is the point to access resource creation,
+/// work submission, and related concerns.
+///
+/// Most of the methods are `&self`, indicating that they can be called from
+/// multiple threads.
 #[derive(Clone)]
 pub struct Session(Arc<SessionInner>);
 
@@ -30,6 +38,10 @@ struct SessionInner {
     gpu_info: GpuInfo,
 }
 
+/// A command buffer.
+///
+/// Actual work done by the GPU is encoded into a command buffer and then
+/// submitted to the session in a batch.
 pub struct CmdBuf {
     cmd_buf: mux::CmdBuf,
     fence: Fence,
@@ -37,7 +49,13 @@ pub struct CmdBuf {
     session: Weak<SessionInner>,
 }
 
-// Maybe "pending" is a better name?
+/// A command buffer in submitted state.
+///
+/// Submission of a command buffer is asynchronous, meaning that the submit
+/// method returns immediately. The work done in the command buffer cannot
+/// be accessed (for example, readback from buffers written) until the the
+/// submission is complete. The main purpose of this structure is to wait on
+/// that completion.
 pub struct SubmittedCmdBuf(Option<SubmittedCmdBufInner>, Weak<SessionInner>);
 
 struct SubmittedCmdBufInner {
@@ -49,6 +67,9 @@ struct SubmittedCmdBufInner {
     staging_cmd_buf: Option<CmdBuf>,
 }
 
+/// An image or texture.
+///
+/// At the moment, images are limited to 2D.
 #[derive(Clone)]
 pub struct Image(Arc<ImageInner>);
 
@@ -57,6 +78,11 @@ struct ImageInner {
     session: Weak<SessionInner>,
 }
 
+/// A buffer.
+///
+/// A buffer is a segment of memory that can be accessed by the GPU, and
+/// in some cases also by the host (if the appropriate [`BufferUsage`] flags
+/// are set).
 #[derive(Clone)]
 pub struct Buffer(Arc<BufferInner>);
 
@@ -65,8 +91,15 @@ struct BufferInner {
     session: Weak<SessionInner>,
 }
 
+/// A builder for creating pipelines.
+///
+/// Configure the signature (buffers and images accessed) for a pipeline,
+/// which is essentially compiled shader code, ready to be dispatched.
 pub struct PipelineBuilder(mux::PipelineBuilder);
 
+/// A builder for creating descriptor sets.
+///
+/// Add bindings to the descriptor set before dispatching a shader.
 pub struct DescriptorSetBuilder(mux::DescriptorSetBuilder);
 
 /// Data types that can be stored in a GPU buffer.
@@ -90,6 +123,7 @@ pub enum RetainResource {
 }
 
 impl Session {
+    /// Create a new session, choosing the best backend.
     pub fn new(device: mux::Device) -> Session {
         let gpu_info = device.query_gpu_info();
         Session(Arc::new(SessionInner {
@@ -101,6 +135,14 @@ impl Session {
         }))
     }
 
+    /// Create a new command buffer.
+    ///
+    /// The caller is responsible for inserting pipeline barriers and other
+    /// transitions. If one dispatch writes a buffer (or image), and another
+    /// reads it, a barrier must intervene. No such barrier is needed for
+    /// uploads by the host before command submission, but a host barrier is
+    /// needed if the host will do readback of any buffers written by the
+    /// command list.
     pub fn cmd_buf(&self) -> Result<CmdBuf, Error> {
         self.poll_cleanup();
         let (cmd_buf, fence) = if let Some(cf) = self.0.cmd_buf_pool.lock().unwrap().pop() {
@@ -141,6 +183,12 @@ impl Session {
         }
     }
 
+    /// Run a command buffer.
+    ///
+    /// The semaphores are for swapchain presentation and can be empty for
+    /// compute-only work. When provided, work is synchronized to start only
+    /// when the wait semaphores are signaled, and when work is complete, the
+    /// signal semaphores are signaled.
     pub unsafe fn run_cmd_buf(
         &self,
         mut cmd_buf: CmdBuf,
@@ -175,6 +223,13 @@ impl Session {
         ))
     }
 
+    /// Create a buffer.
+    ///
+    /// The `usage` flags must be specified to indicate what the buffer will
+    /// be used for. In general, when no `MAP_` flags are specified, the buffer
+    /// will be created in device memory, which means they are not host
+    /// accessible, but GPU access is much higher performance (at least on
+    /// discrete GPUs).
     pub fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result<Buffer, Error> {
         let buffer = self.0.device.create_buffer(size, usage)?;
         Ok(Buffer(Arc::new(BufferInner {
@@ -184,6 +239,10 @@ impl Session {
     }
 
     /// Create a buffer with initialized data.
+    ///
+    /// This method takes care of creating a staging buffer if needed, so
+    /// it is not necessary to specify `MAP_WRITE` usage, unless of course
+    /// the buffer will subsequently be written by the host.
     pub fn create_buffer_init(
         &self,
         contents: &[impl PlainData],
@@ -226,7 +285,7 @@ impl Session {
             }
             let staging_cmd_buf = staging_cmd_buf.as_mut().unwrap();
             // This will ensure the staging buffer is deallocated.
-            staging_cmd_buf.copy_buffer(create_buf.mux_buffer(), buf.mux_buffer());
+            staging_cmd_buf.copy_buffer(&create_buf, &buf);
             staging_cmd_buf.add_resource(create_buf);
             Ok(buf)
         } else {
@@ -234,6 +293,10 @@ impl Session {
         }
     }
 
+    /// Create an image.
+    ///
+    /// Currently this creates only a 2D image in RGBA8 format, with usage
+    /// so that it can be accessed by shaders and used for transfer.
     pub unsafe fn create_image2d(&self, width: u32, height: u32) -> Result<Image, Error> {
         let image = self.0.device.create_image2d(width, height)?;
         Ok(Image(Arc::new(ImageInner {
@@ -242,13 +305,18 @@ impl Session {
         })))
     }
 
+    /// Create a semaphore.
+    ///
+    /// These "semaphores" are only for swapchain integration and may be
+    /// stubs on back-ends that don't require semaphore synchronization.
     pub unsafe fn create_semaphore(&self) -> Result<Semaphore, Error> {
         self.0.device.create_semaphore()
     }
 
     /// This creates a pipeline that operates on some buffers and images.
     ///
-    /// The descriptor set layout is just some number of storage buffers and storage images (this might change).
+    /// The descriptor set layout is just some number of storage buffers
+    /// and storage images (this might change).
     pub unsafe fn create_simple_compute_pipeline<'a>(
         &self,
         code: ShaderCode<'a>,
@@ -259,6 +327,14 @@ impl Session {
             .create_compute_pipeline(self, code)
     }
 
+    /// Start building a pipeline.
+    ///
+    /// A pipeline is essentially a compiled shader, with more specific
+    /// details about what resources may be bound to it.
+    pub unsafe fn pipeline_builder(&self) -> PipelineBuilder {
+        PipelineBuilder(self.0.device.pipeline_builder())
+    }
+
     /// Create a descriptor set for a simple pipeline that just references buffers.
     pub unsafe fn create_simple_descriptor_set<'a>(
         &self,
@@ -270,28 +346,37 @@ impl Session {
             .build(self, pipeline)
     }
 
+    /// Start building a descriptor set.
+    ///
+    /// A descriptor set is a binding of actual resources (buffers and
+    /// images) to slots as specified in the pipeline.
+    pub unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
+        DescriptorSetBuilder(self.0.device.descriptor_set_builder())
+    }
+
     /// Create a query pool for timestamp queries.
     pub fn create_query_pool(&self, n_queries: u32) -> Result<QueryPool, Error> {
         self.0.device.create_query_pool(n_queries)
     }
 
+    /// Fetch the contents of the query pool.
+    ///
+    /// This should be called after waiting on the command buffer that wrote the
+    /// timer queries.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
         self.0.device.fetch_query_pool(pool)
     }
 
-    pub unsafe fn pipeline_builder(&self) -> PipelineBuilder {
-        PipelineBuilder(self.0.device.pipeline_builder())
-    }
-
-    pub unsafe fn descriptor_set_builder(&self) -> DescriptorSetBuilder {
-        DescriptorSetBuilder(self.0.device.descriptor_set_builder())
-    }
-
+    #[doc(hidden)]
+    /// Create a sampler.
+    ///
+    /// Noy yet implemented.
     pub unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Sampler, Error> {
         todo!()
         //self.0.device.create_sampler(params)
     }
 
+    /// Query the GPU info.
     pub fn gpu_info(&self) -> &GpuInfo {
         &self.0.gpu_info
     }
@@ -303,6 +388,149 @@ impl Session {
 }
 
 impl CmdBuf {
+    /// Begin recording into a command buffer.
+    ///
+    /// Always call this before encoding any actual work.
+    ///
+    /// Discussion question: can this be subsumed?
+    pub unsafe fn begin(&mut self) {
+        self.cmd_buf.begin();
+    }
+
+    /// Finish recording into a command buffer.
+    ///
+    /// Always call this as the last method before submitting the command
+    /// buffer.
+    pub unsafe fn finish(&mut self) {
+        self.cmd_buf.finish();
+    }
+
+    /// Dispatch a compute shader.
+    ///
+    /// Request a compute shader to be run, using the pipeline to specify the
+    /// code, and the descriptor set to address the resources read and written.
+    ///
+    /// Both the workgroup count (number of workgroups) and the workgroup size
+    /// (number of threads in a workgroup) must be specified here, though not
+    /// all back-ends require the latter info.
+    pub unsafe fn dispatch(
+        &mut self,
+        pipeline: &Pipeline,
+        descriptor_set: &DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    ) {
+        self.cmd_buf
+            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    }
+
+    /// Insert an execution and memory barrier.
+    ///
+    /// Compute kernels (and other actions) after this barrier may read from buffers
+    /// that were written before this barrier.
+    pub unsafe fn memory_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
+    /// Insert a barrier for host access to buffers.
+    ///
+    /// The host may read buffers written before this barrier, after the fence for
+    /// the command buffer is signaled.
+    ///
+    /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
+    /// ("Host memory reads") for an explanation of this barrier.
+    pub unsafe fn host_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
+    /// Insert an image barrier, transitioning image layout.
+    ///
+    /// When an image is written by one command and then read by another, an image
+    /// barrier must separate the uses. Also, the image layout must match the use
+    /// of the image.
+    ///
+    /// Additionally, when writing to an image for the first time, it must be
+    /// transitioned from an unknown layout to specify the layout.
+    pub unsafe fn image_barrier(
+        &mut self,
+        image: &Image,
+        src_layout: ImageLayout,
+        dst_layout: ImageLayout,
+    ) {
+        self.cmd_buf
+            .image_barrier(image.mux_image(), src_layout, dst_layout);
+    }
+
+    /// Clear the buffer.
+    ///
+    /// When the size is not specified, it clears the whole buffer.
+    pub unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
+        self.cmd_buf.clear_buffer(buffer.mux_buffer(), size);
+    }
+
+    /// Copy one buffer to another.
+    ///
+    /// When the buffers differ in size, the minimum of the sizes is used.
+    pub unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
+        self.cmd_buf.copy_buffer(src.mux_buffer(), dst.mux_buffer());
+    }
+
+    /// Copy an image to a buffer.
+    ///
+    /// The size of the image and buffer must match.
+    pub unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
+        self.cmd_buf
+            .copy_image_to_buffer(src.mux_image(), dst.mux_buffer());
+        // TODO: change the backend signature to allow failure, as in "not
+        // implemented" or "unaligned", and fall back to compute shader
+        // submission.
+    }
+
+    /// Copy a buffer to an image.
+    ///
+    /// The size of the image and buffer must match.
+    pub unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
+        self.cmd_buf
+            .copy_buffer_to_image(src.mux_buffer(), dst.mux_image());
+        // See above.
+    }
+
+    /// Copy an image to another.
+    ///
+    /// This is especially useful for writing to the swapchain image, as in
+    /// general that can't be bound to a compute shader.
+    ///
+    /// Discussion question: we might have a specialized version of this
+    /// function for copying to the swapchain image, and a separate type.
+    pub unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
+        self.cmd_buf.blit_image(src.mux_image(), dst.mux_image());
+    }
+
+    /// Reset the query pool.
+    ///
+    /// The query pool must be reset before each use, to avoid validation errors.
+    /// This is annoying, and we could tweak the API to make it implicit, doing
+    /// the reset before the first timestamp write.
+    pub unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
+        self.cmd_buf.reset_query_pool(pool);
+    }
+
+    /// Write a timestamp.
+    ///
+    /// The query index must be less than the size of the query pool on creation.
+    pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
+        self.cmd_buf.write_timestamp(pool, query);
+    }
+
+    /// Prepare the timestamps for reading. This isn't required on Vulkan but
+    /// is required on (at least) DX12.
+    ///
+    /// It's possible we'll make this go away, by implicitly including it
+    /// on command buffer submission when a query pool has been written.
+    pub unsafe fn finish_timestamps(&mut self, pool: &QueryPool) {
+        self.cmd_buf.finish_timestamps(pool);
+    }
+
     /// Make sure the resource lives until the command buffer completes.
     ///
     /// The submitted command buffer will hold this reference until the corresponding
@@ -317,6 +545,17 @@ impl CmdBuf {
 }
 
 impl SubmittedCmdBuf {
+    /// Wait for the work to complete.
+    ///
+    /// After calling this function, buffers written by the command buffer
+    /// can be read (assuming they were created with `MAP_READ` usage and also
+    /// that a host barrier was placed in the command list).
+    ///
+    /// Further, resources referenced by the command list may be destroyed or
+    /// reused; it is a safety violation to do so beforehand.
+    ///
+    /// Resources for which destruction was deferred through
+    /// [`add_resource`][`CmdBuf::add_resource`] will actually be dropped here.
     pub fn wait(mut self) -> Result<(), Error> {
         let mut item = self.0.take().unwrap();
         if let Some(session) = Weak::upgrade(&self.1) {
@@ -365,31 +604,33 @@ impl Drop for ImageInner {
     }
 }
 
-// Probably migrate from deref here to wrapping all methods.
-impl std::ops::Deref for CmdBuf {
-    type Target = mux::CmdBuf;
-    fn deref(&self) -> &Self::Target {
-        &self.cmd_buf
-    }
-}
-
-impl std::ops::DerefMut for CmdBuf {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.cmd_buf
-    }
-}
-
 impl Image {
-    pub fn mux_image(&self) -> &mux::Image {
+    /// Get a lower level image handle.
+    pub(crate) fn mux_image(&self) -> &mux::Image {
         &self.0.image
     }
+
+    /// Wrap a swapchain image so it can be exported to the hub level.
+    /// Swapchain images don't need resource tracking (or at least we
+    /// don't do it), so no session ref is needed.
+    pub(crate) fn wrap_swapchain_image(image: mux::Image) -> Image {
+        Image(Arc::new(ImageInner {
+            image,
+            session: Weak::new(),
+        }))
+    }
 }
 
 impl Buffer {
-    pub fn mux_buffer(&self) -> &mux::Buffer {
+    /// Get a lower level buffer handle.
+    pub(crate) fn mux_buffer(&self) -> &mux::Buffer {
         &self.0.buffer
     }
 
+    /// Write the buffer contents.
+    ///
+    /// The buffer must have been created with `MAP_WRITE` usage, and with
+    /// a size large enough to accommodate the given slice.
     pub unsafe fn write<T: PlainData>(&mut self, contents: &[T]) -> Result<(), Error> {
         if let Some(session) = Weak::upgrade(&self.0.session) {
             session.device.write_buffer(
@@ -402,6 +643,12 @@ impl Buffer {
         // else session lost error?
         Ok(())
     }
+
+    /// Read the buffer contents.
+    ///
+    /// The buffer must have been created with `MAP_READ` usage. The caller
+    /// is also responsible for ensuring that this does not read uninitialized
+    /// memory.
     pub unsafe fn read<T: PlainData>(&self, result: &mut Vec<T>) -> Result<(), Error> {
         let size = self.mux_buffer().size();
         let len = size as usize / std::mem::size_of::<T>();
@@ -438,6 +685,10 @@ impl PipelineBuilder {
         self
     }
 
+    /// Create the compute pipeline.
+    ///
+    /// The shader code must be given in an appropriate format for
+    /// the back-end. See [`Session::choose_shader`] for a helper.
     pub unsafe fn create_compute_pipeline<'a>(
         self,
         session: &Session,
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index b5bc8b5..0739b13 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -4,43 +4,58 @@
 /// In time, it may go away and be replaced by either gfx-hal or wgpu.
 use bitflags::bitflags;
 
-pub mod backend;
-pub mod hub;
+mod backend;
+mod hub;
 
 #[macro_use]
 mod macros;
 
-// TODO: Don't make the module pub, but do figure out which types to
-// export at the root level.
-pub mod mux;
+mod mux;
+
+pub use crate::mux::{
+    DescriptorSet, Fence, Instance, Pipeline, QueryPool, Sampler, Semaphore, ShaderCode, Surface,
+    Swapchain,
+};
+pub use hub::{
+    Buffer, CmdBuf, DescriptorSetBuilder, Image, PipelineBuilder, PlainData, RetainResource,
+    Session, SubmittedCmdBuf,
+};
 
 // TODO: because these are conditionally included, "cargo fmt" does not
 // see them. Figure that out, possibly including running rustfmt manually.
 mux_cfg! {
     #[cfg(vk)]
-    pub mod vulkan;
+    mod vulkan;
 }
 mux_cfg! {
     #[cfg(dx12)]
-    pub mod dx12;
+    mod dx12;
 }
 #[cfg(target_os = "macos")]
-pub mod metal;
+mod metal;
 
 /// The common error type for the crate.
 ///
 /// This keeps things imple and can be expanded later.
 pub type Error = Box<dyn std::error::Error>;
 
-pub use crate::backend::CmdBuf;
-
+/// An image layout state.
+///
+/// An image must be in a particular layout state to be used for
+/// a purpose such as being bound to a shader.
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum ImageLayout {
+    /// The initial state for a newly created image.
     Undefined,
+    /// A swapchain ready to be presented.
     Present,
+    /// The source for a copy operation.
     BlitSrc,
+    /// The destination for a copy operation.
     BlitDst,
+    /// Read/write binding to a shader.
     General,
+    /// Able to be sampled from by shaders.
     ShaderRead,
 }
 
@@ -55,7 +70,7 @@ pub enum SamplerParams {
 }
 
 bitflags! {
-    /// The intended usage for this buffer.
+    /// The intended usage for a buffer, specified on creation.
     pub struct BufferUsage: u32 {
         /// The buffer can be mapped for reading CPU-side.
         const MAP_READ = 0x1;
@@ -92,6 +107,11 @@ pub struct GpuInfo {
     pub use_staging_buffers: bool,
 }
 
+/// The range of subgroup sizes supported by a back-end, when available.
+///
+/// The subgroup size is always a power of 2. The ability to specify
+/// subgroup size for a compute shader is a newer feature, not always
+/// available.
 #[derive(Clone, Debug)]
 pub struct SubgroupSize {
     min: u32,
diff --git a/piet-gpu-hal/src/macros.rs b/piet-gpu-hal/src/macros.rs
index 3456342..8131e50 100644
--- a/piet-gpu-hal/src/macros.rs
+++ b/piet-gpu-hal/src/macros.rs
@@ -16,6 +16,8 @@
 
 //! Macros, mostly to automate backend selection tedium.
 
+#[doc(hidden)]
+/// Configure an item to be included only for the given GPU.
 #[macro_export]
 macro_rules! mux_cfg {
     ( #[cfg(vk)] $($tokens:tt)* ) => {
@@ -31,6 +33,8 @@ macro_rules! mux_cfg {
     };
 }
 
+#[doc(hidden)]
+/// Define an enum with a variant per GPU.
 #[macro_export]
 macro_rules! mux_enum {
     ( $(#[$outer:meta])* $v:vis enum $name:ident {
@@ -112,6 +116,7 @@ macro_rules! mux_enum {
     };
 }
 
+/// Define an enum with a variant per GPU for a Device associated type.
 macro_rules! mux_device_enum {
     ( $(#[$outer:meta])* $assoc_type: ident) => {
         $crate::mux_enum! {
@@ -125,6 +130,8 @@ macro_rules! mux_device_enum {
     }
 }
 
+#[doc(hidden)]
+/// A match statement where match arms are conditionally configured per GPU.
 #[macro_export]
 macro_rules! mux_match {
     ( $e:expr ;
diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs
index 7a83ae9..8f93eb6 100644
--- a/piet-gpu-hal/src/mux.rs
+++ b/piet-gpu-hal/src/mux.rs
@@ -74,14 +74,28 @@ mux_enum! {
 
 mux_device_enum! { Buffer }
 mux_device_enum! { Image }
-mux_device_enum! { Fence }
-mux_device_enum! { Semaphore }
+mux_device_enum! {
+/// An object for waiting on command buffer completion.
+Fence }
+mux_device_enum! {
+/// A semaphore for swapchain presentation.
+///
+/// Depending on what kind of synchronization is needed for swapchain
+/// presentation by the back-end, this may or may not be a "real"
+/// semaphore.
+Semaphore }
 mux_device_enum! { PipelineBuilder }
-mux_device_enum! { Pipeline }
+mux_device_enum! {
+/// A pipeline object; basically a compiled shader.
+Pipeline }
 mux_device_enum! { DescriptorSetBuilder }
-mux_device_enum! { DescriptorSet }
+mux_device_enum! {
+/// A descriptor set; a binding of resources for access by a shader.
+DescriptorSet }
 mux_device_enum! { CmdBuf }
-mux_device_enum! { QueryPool }
+mux_device_enum! {
+/// An object for recording timer queries.
+QueryPool }
 mux_device_enum! { Sampler }
 
 /// The code for a shader, either as source or intermediate representation.
@@ -402,7 +416,12 @@ impl Device {
     }
 
     /// Choose shader code from the available choices.
-    pub fn choose_shader<'a>(&self, _spv: &'a [u8], _hlsl: &'a str, _msl: &'a str) -> ShaderCode<'a> {
+    pub fn choose_shader<'a>(
+        &self,
+        _spv: &'a [u8],
+        _hlsl: &'a str,
+        _msl: &'a str,
+    ) -> ShaderCode<'a> {
         mux_match! { self;
             Device::Vk(_d) => ShaderCode::Spv(_spv),
             Device::Dx12(_d) => ShaderCode::Hlsl(_hlsl),
@@ -731,7 +750,11 @@ impl Swapchain {
         }
     }
 
-    pub unsafe fn image(&self, idx: usize) -> Image {
+    pub unsafe fn image(&self, idx: usize) -> crate::Image {
+        crate::Image::wrap_swapchain_image(self.image_raw(idx))
+    }
+
+    pub unsafe fn image_raw(&self, idx: usize) -> Image {
         mux_match! { self;
             Swapchain::Vk(s) => Image::Vk(s.image(idx)),
             Swapchain::Dx12(s) => Image::Dx12(s.image(idx)),
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 1084505..3eee69a 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -822,7 +822,7 @@ impl crate::backend::Device for VkDevice {
     }
 }
 
-impl crate::CmdBuf<VkDevice> for CmdBuf {
+impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
     unsafe fn begin(&mut self) {
         self.device
             .device
diff --git a/piet-gpu/bin/android.rs b/piet-gpu/bin/android.rs
index ce37503..d3a2aa3 100644
--- a/piet-gpu/bin/android.rs
+++ b/piet-gpu/bin/android.rs
@@ -11,9 +11,10 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 use ndk::native_window::NativeWindow;
 use ndk_glue::Event;
 
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::{Instance, QueryPool, Surface, Swapchain};
-use piet_gpu_hal::{CmdBuf, Error, ImageLayout};
+use piet_gpu_hal::{
+    Error, ImageLayout, Instance, QueryPool, Semaphore, Session, SubmittedCmdBuf, Surface,
+    Swapchain,
+};
 
 use piet_gpu::{render_scene, PietGpuRenderContext, Renderer};
 
@@ -28,14 +29,14 @@ struct MyHandle {
 
 // State required to render and present the contents
 struct GfxState {
-    session: hub::Session,
+    session: Session,
     renderer: Renderer,
     swapchain: Swapchain,
     current_frame: usize,
     last_frame_idx: usize,
-    submitted: Option<hub::SubmittedCmdBuf>,
+    submitted: Option<SubmittedCmdBuf>,
     query_pools: Vec<QueryPool>,
-    present_semaphores: Vec<hub::Semaphore>,
+    present_semaphores: Vec<Semaphore>,
 }
 
 const WIDTH: usize = 1080;
@@ -95,7 +96,7 @@ impl GfxState {
             let device = instance.device(surface)?;
             let mut swapchain =
                 instance.swapchain(WIDTH / 2, HEIGHT / 2, &device, surface.unwrap())?;
-            let session = hub::Session::new(device);
+            let session = Session::new(device);
             let mut current_frame = 0;
             let present_semaphores = (0..NUM_FRAMES)
                 .map(|_| session.create_semaphore())
@@ -113,7 +114,7 @@ impl GfxState {
 
             let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
 
-            let submitted: Option<hub::SubmittedCmdBuf> = None;
+            let submitted: Option<SubmittedCmdBuf> = None;
             let current_frame = 0;
             let last_frame_idx = 0;
             Ok(GfxState {
@@ -151,7 +152,7 @@ impl GfxState {
 
             // Image -> Swapchain
             cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst);
-            cmd_buf.blit_image(self.renderer.image_dev.mux_image(), &swap_image);
+            cmd_buf.blit_image(&self.renderer.image_dev, &swap_image);
             cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
             cmd_buf.finish();
 
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index a1d766d..92950b5 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -4,9 +4,7 @@ use std::path::Path;
 
 use clap::{App, Arg};
 
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::Instance;
-use piet_gpu_hal::{BufferUsage, Error};
+use piet_gpu_hal::{BufferUsage, Error, Instance, Session};
 
 use piet_gpu::{render_scene, render_svg, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
 
@@ -228,7 +226,7 @@ fn main() -> Result<(), Error> {
     let (instance, _) = Instance::new(None)?;
     unsafe {
         let device = instance.device(None)?;
-        let session = hub::Session::new(device);
+        let session = Session::new(device);
 
         let mut cmd_buf = session.cmd_buf()?;
         let query_pool = session.create_query_pool(8)?;
@@ -258,7 +256,7 @@ fn main() -> Result<(), Error> {
 
         cmd_buf.begin();
         renderer.record(&mut cmd_buf, &query_pool);
-        cmd_buf.copy_image_to_buffer(renderer.image_dev.mux_image(), image_buf.mux_buffer());
+        cmd_buf.copy_image_to_buffer(&renderer.image_dev, &image_buf);
         cmd_buf.host_barrier();
         cmd_buf.finish();
         let start = std::time::Instant::now();
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index af60a3a..b7b838b 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -1,6 +1,4 @@
-use piet_gpu_hal::hub;
-use piet_gpu_hal::mux::Instance;
-use piet_gpu_hal::{Error, ImageLayout};
+use piet_gpu_hal::{Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
 
 use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
 
@@ -27,7 +25,7 @@ fn main() -> Result<(), Error> {
         let device = instance.device(surface.as_ref())?;
         let mut swapchain =
             instance.swapchain(WIDTH / 2, HEIGHT / 2, &device, surface.as_ref().unwrap())?;
-        let session = hub::Session::new(device);
+        let session = Session::new(device);
 
         let mut current_frame = 0;
         let present_semaphores = (0..NUM_FRAMES)
@@ -46,7 +44,7 @@ fn main() -> Result<(), Error> {
 
         let renderer = Renderer::new(&session, scene, n_paths, n_pathseg, n_trans)?;
 
-        let mut submitted: Option<hub::SubmittedCmdBuf> = None;
+        let mut submitted: Option<SubmittedCmdBuf> = None;
         let mut last_frame_idx = 0;
 
         event_loop.run(move |event, _, control_flow| {
@@ -89,7 +87,6 @@ fn main() -> Result<(), Error> {
                         ));
                     }
 
-
                     let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
                     let swap_image = swapchain.image(image_idx);
                     let query_pool = &query_pools[frame_idx];
@@ -103,7 +100,7 @@ fn main() -> Result<(), Error> {
                         ImageLayout::Undefined,
                         ImageLayout::BlitDst,
                     );
-                    cmd_buf.blit_image(renderer.image_dev.mux_image(), &swap_image);
+                    cmd_buf.blit_image(&renderer.image_dev, &swap_image);
                     cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
                     cmd_buf.finish();
 
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index b815655..ef70c9c 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -12,9 +12,10 @@ use piet::{Color, ImageFormat, RenderContext};
 
 use piet_gpu_types::encoder::Encode;
 
-use piet_gpu_hal::hub;
-use piet_gpu_hal::hub::ShaderCode;
-use piet_gpu_hal::{BufferUsage, Error, ImageLayout};
+use piet_gpu_hal::{
+    Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline, QueryPool,
+    Session, ShaderCode,
+};
 
 use pico_svg::PicoSvg;
 
@@ -188,53 +189,53 @@ pub fn dump_k1_data(k1_buf: &[u32]) {
 }
 
 pub struct Renderer {
-    pub image_dev: hub::Image, // resulting image
+    pub image_dev: Image, // resulting image
 
     // The reference is held by the pipelines. We will be changing
     // this to make the scene upload dynamic.
     #[allow(dead_code)]
-    scene_buf: hub::Buffer,
+    scene_buf: Buffer,
 
-    memory_buf_host: hub::Buffer,
-    memory_buf_dev: hub::Buffer,
+    memory_buf_host: Buffer,
+    memory_buf_dev: Buffer,
 
-    state_buf: hub::Buffer,
+    state_buf: Buffer,
 
     #[allow(dead_code)]
-    config_buf: hub::Buffer,
+    config_buf: Buffer,
 
-    el_pipeline: hub::Pipeline,
-    el_ds: hub::DescriptorSet,
+    el_pipeline: Pipeline,
+    el_ds: DescriptorSet,
 
-    tile_pipeline: hub::Pipeline,
-    tile_ds: hub::DescriptorSet,
+    tile_pipeline: Pipeline,
+    tile_ds: DescriptorSet,
 
-    path_pipeline: hub::Pipeline,
-    path_ds: hub::DescriptorSet,
+    path_pipeline: Pipeline,
+    path_ds: DescriptorSet,
 
-    backdrop_pipeline: hub::Pipeline,
-    backdrop_ds: hub::DescriptorSet,
+    backdrop_pipeline: Pipeline,
+    backdrop_ds: DescriptorSet,
 
-    bin_pipeline: hub::Pipeline,
-    bin_ds: hub::DescriptorSet,
+    bin_pipeline: Pipeline,
+    bin_ds: DescriptorSet,
 
-    coarse_pipeline: hub::Pipeline,
-    coarse_ds: hub::DescriptorSet,
+    coarse_pipeline: Pipeline,
+    coarse_ds: DescriptorSet,
 
-    k4_pipeline: hub::Pipeline,
-    k4_ds: hub::DescriptorSet,
+    k4_pipeline: Pipeline,
+    k4_ds: DescriptorSet,
 
     n_elements: usize,
     n_paths: usize,
     n_pathseg: usize,
 
     // Keep a reference to the image so that it is not destroyed.
-    _bg_image: hub::Image,
+    _bg_image: Image,
 }
 
 impl Renderer {
     pub unsafe fn new(
-        session: &hub::Session,
+        session: &Session,
         scene: &[u8],
         n_paths: usize,
         n_pathseg: usize,
@@ -385,15 +386,12 @@ impl Renderer {
         })
     }
 
-    pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) {
-        cmd_buf.copy_buffer(
-            self.memory_buf_host.mux_buffer(),
-            self.memory_buf_dev.mux_buffer(),
-        );
-        cmd_buf.clear_buffer(self.state_buf.mux_buffer(), None);
+    pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool) {
+        cmd_buf.copy_buffer(&self.memory_buf_host, &self.memory_buf_dev);
+        cmd_buf.clear_buffer(&self.state_buf, None);
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(
-            self.image_dev.mux_image(),
+            &self.image_dev,
             ImageLayout::Undefined,
             ImageLayout::General,
         );
@@ -458,20 +456,16 @@ impl Renderer {
         );
         cmd_buf.write_timestamp(&query_pool, 7);
         cmd_buf.memory_barrier();
-        cmd_buf.image_barrier(
-            self.image_dev.mux_image(),
-            ImageLayout::General,
-            ImageLayout::BlitSrc,
-        );
+        cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
     }
 
     pub fn make_image(
-        session: &hub::Session,
+        session: &Session,
         width: usize,
         height: usize,
         buf: &[u8],
         format: ImageFormat,
-    ) -> Result<hub::Image, Error> {
+    ) -> Result<Image, Error> {
         unsafe {
             if format != ImageFormat::RgbaPremul {
                 return Err("unsupported image format".into());
@@ -482,17 +476,9 @@ impl Renderer {
             let image = session.create_image2d(width.try_into()?, height.try_into()?)?;
             let mut cmd_buf = session.cmd_buf()?;
             cmd_buf.begin();
-            cmd_buf.image_barrier(
-                image.mux_image(),
-                ImageLayout::Undefined,
-                ImageLayout::BlitDst,
-            );
-            cmd_buf.copy_buffer_to_image(buffer.mux_buffer(), image.mux_image());
-            cmd_buf.image_barrier(
-                image.mux_image(),
-                ImageLayout::BlitDst,
-                ImageLayout::General,
-            );
+            cmd_buf.image_barrier(&image, ImageLayout::Undefined, ImageLayout::BlitDst);
+            cmd_buf.copy_buffer_to_image(&buffer, &image);
+            cmd_buf.image_barrier(&image, ImageLayout::BlitDst, ImageLayout::General);
             cmd_buf.finish();
             // Make sure not to drop the buffer and image until the command buffer completes.
             cmd_buf.add_resource(&buffer);
@@ -504,7 +490,7 @@ impl Renderer {
     }
 
     /// Make a test image.
-    fn make_test_bg_image(session: &hub::Session) -> hub::Image {
+    fn make_test_bg_image(session: &Session) -> Image {
         const WIDTH: usize = 256;
         const HEIGHT: usize = 256;
         let mut buf = vec![255u8; WIDTH * HEIGHT * 4];