Merge pull request #264 from linebender/async

Experimental async wiring
Raph Levien 2023-01-27 12:43:48 -08:00 committed by GitHub
commit 6a184244e6
5 changed files with 650 additions and 364 deletions


@ -22,6 +22,7 @@ use std::{borrow::Cow, path::PathBuf, time::Instant};
use clap::Parser;
use vello::{
block_on_wgpu,
kurbo::{Affine, Vec2},
util::RenderContext,
Renderer, Scene, SceneBuilder,
@ -187,6 +188,24 @@ async fn run(event_loop: EventLoop<UserEvent>, window: Window, args: Args) {
.surface
.get_current_texture()
.expect("failed to get surface texture");
#[cfg(not(target_arch = "wasm32"))]
{
block_on_wgpu(
&device_handle.device,
renderer.render_to_surface_async(
&device_handle.device,
&device_handle.queue,
&scene,
&surface_texture,
width,
height,
),
)
.expect("failed to render to surface");
}
// Note: in the wasm case, we're currently not running the robust
// pipeline, as it requires more async wiring for the readback.
#[cfg(target_arch = "wasm32")]
renderer
.render_to_surface(
&device_handle.device,
@ -198,7 +217,7 @@ async fn run(event_loop: EventLoop<UserEvent>, window: Window, args: Args) {
)
.expect("failed to render to surface");
surface_texture.present();
device_handle.device.poll(wgpu::Maintain::Wait);
device_handle.device.poll(wgpu::Maintain::Poll);
}
Event::UserEvent(event) => match event {
#[cfg(not(target_arch = "wasm32"))]


@ -16,17 +16,14 @@
use std::{
borrow::Cow,
collections::{hash_map::Entry, HashMap},
collections::{hash_map::Entry, HashMap, HashSet},
num::{NonZeroU32, NonZeroU64},
sync::atomic::{AtomicU64, Ordering},
};
use futures_intrusive::channel::shared::GenericOneshotReceiver;
use parking_lot::RawMutex;
use wgpu::{
util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferAsyncError, BufferSlice,
BufferUsages, BufferView, ComputePipeline, Device, Queue, Texture, TextureAspect,
TextureFormat, TextureUsages, TextureView, TextureViewDimension,
util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferUsages, ComputePipeline, Device,
Queue, Texture, TextureAspect, TextureFormat, TextureUsages, TextureView, TextureViewDimension,
};
pub type Error = Box<dyn std::error::Error>;
@ -42,6 +39,8 @@ static ID_COUNTER: AtomicU64 = AtomicU64::new(0);
pub struct Engine {
shaders: Vec<Shader>,
pool: ResourcePool,
bind_map: BindMap,
downloads: HashMap<Id, Buffer>,
}
struct Shader {
@ -96,11 +95,8 @@ pub enum Command {
Dispatch(ShaderId, (u32, u32, u32), Vec<ResourceProxy>),
Download(BufProxy),
Clear(BufProxy, u64, Option<NonZeroU64>),
}
#[derive(Default)]
pub struct Downloads {
buf_map: HashMap<Id, Buffer>,
FreeBuf(BufProxy),
FreeImage(ImageProxy),
}
/// The type of resource that will be bound to a slot in a shader.
@ -149,6 +145,8 @@ impl Engine {
Engine {
shaders: vec![],
pool: Default::default(),
bind_map: Default::default(),
downloads: Default::default(),
}
}
@ -249,9 +247,9 @@ impl Engine {
queue: &Queue,
recording: &Recording,
external_resources: &[ExternalResource],
) -> Result<Downloads, Error> {
let mut bind_map = BindMap::default();
let mut downloads = Downloads::default();
) -> Result<(), Error> {
let mut free_bufs: HashSet<Id> = Default::default();
let mut free_images: HashSet<Id> = Default::default();
let mut encoder = device.create_command_encoder(&Default::default());
for command in &recording.commands {
@ -259,18 +257,22 @@ impl Engine {
Command::Upload(buf_proxy, bytes) => {
let usage =
BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
let buf = self.pool.get_buf(buf_proxy, usage, device);
let buf = self
.pool
.get_buf(buf_proxy.size, buf_proxy.name, usage, device);
// TODO: if buffer is newly created, might be better to make it mapped at creation
// and copy. However, we expect reuse will be most common.
queue.write_buffer(&buf, 0, bytes);
bind_map.insert_buf(buf_proxy, buf);
self.bind_map.insert_buf(buf_proxy, buf);
}
Command::UploadUniform(buf_proxy, bytes) => {
let usage = BufferUsages::UNIFORM | BufferUsages::COPY_DST;
// Same consideration as above
let buf = self.pool.get_buf(buf_proxy, usage, device);
let buf = self
.pool
.get_buf(buf_proxy.size, buf_proxy.name, usage, device);
queue.write_buffer(&buf, 0, bytes);
bind_map.insert_buf(buf_proxy, buf);
self.bind_map.insert_buf(buf_proxy, buf);
}
Command::UploadImage(image_proxy, bytes) => {
let buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
@ -322,12 +324,13 @@ impl Engine {
depth_or_array_layers: 1,
},
);
bind_map.insert_image(image_proxy.id, texture, texture_view)
self.bind_map
.insert_image(image_proxy.id, texture, texture_view)
}
Command::Dispatch(shader_id, wg_size, bindings) => {
// println!("dispatching {:?} with {} bindings", wg_size, bindings.len());
let shader = &self.shaders[shader_id.0];
let bind_group = bind_map.create_bind_group(
let bind_group = self.bind_map.create_bind_group(
device,
&shader.bind_group_layout,
bindings,
@ -340,18 +343,20 @@ impl Engine {
cpass.dispatch_workgroups(wg_size.0, wg_size.1, wg_size.2);
}
Command::Download(proxy) => {
let src_buf = bind_map.buf_map.get(&proxy.id).ok_or("buffer not in map")?;
let buf = device.create_buffer(&wgpu::BufferDescriptor {
label: Some(proxy.name),
size: proxy.size,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
});
let src_buf = self
.bind_map
.buf_map
.get(&proxy.id)
.ok_or("buffer not in map")?;
let usage = BufferUsages::MAP_READ | BufferUsages::COPY_DST;
let buf = self.pool.get_buf(proxy.size, "download", usage, device);
encoder.copy_buffer_to_buffer(&src_buf.buffer, 0, &buf, 0, proxy.size);
downloads.buf_map.insert(proxy.id, buf);
self.downloads.insert(proxy.id, buf);
}
Command::Clear(proxy, offset, size) => {
let buffer = bind_map.get_or_create(*proxy, device, &mut self.pool)?;
let buffer = self
.bind_map
.get_or_create(*proxy, device, &mut self.pool)?;
#[cfg(not(target_arch = "wasm32"))]
encoder.clear_buffer(buffer, *offset, *size);
#[cfg(target_arch = "wasm32")]
@ -366,11 +371,42 @@ impl Engine {
queue.write_buffer(buffer, *offset, &zeros);
}
}
Command::FreeBuf(proxy) => {
free_bufs.insert(proxy.id);
}
Command::FreeImage(proxy) => {
free_images.insert(proxy.id);
}
}
}
queue.submit(Some(encoder.finish()));
self.pool.reap_bindmap(bind_map);
Ok(downloads)
for id in free_bufs {
if let Some(buf) = self.bind_map.buf_map.remove(&id) {
let props = BufferProperties {
size: buf.buffer.size(),
usages: buf.buffer.usage(),
#[cfg(feature = "buffer_labels")]
name: buf.label,
};
self.pool.bufs.entry(props).or_default().push(buf.buffer);
}
}
for id in free_images {
if let Some((texture, view)) = self.bind_map.image_map.remove(&id) {
// TODO: have a pool to avoid needless re-allocation
drop(texture);
drop(view);
}
}
Ok(())
}
pub fn get_download(&self, buf: BufProxy) -> Option<&Buffer> {
self.downloads.get(&buf.id)
}
pub fn free_download(&mut self, buf: BufProxy) {
self.downloads.remove(&buf.id);
}
}
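
Buffers scheduled with Recording::download end up in the engine-owned downloads map once run_recording completes; the caller retrieves them with get_download, maps them asynchronously, and releases them with free_download. A minimal sketch of that flow, assuming this module's imports are in scope; it mirrors the readback done in render_to_texture_async below:

async fn read_back(
    engine: &mut Engine,
    device: &Device,
    queue: &Queue,
    recording: &Recording,
    proxy: BufProxy,
) -> Result<(), Error> {
    engine.run_recording(device, queue, recording, &[])?;
    if let Some(buf) = engine.get_download(proxy) {
        let slice = buf.slice(..);
        let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
        slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
        // The map completes once the GPU work finishes; on native this future
        // can be driven by block_on_wgpu, which polls the device while pending.
        if let Some(recv_result) = receiver.receive().await {
            recv_result?;
        } else {
            return Err("channel was closed".into());
        }
        let _mapped = slice.get_mapped_range();
        // Inspect the mapped bytes here (e.g. via bytemuck::cast_slice::<_, u32>).
    }
    engine.free_download(proxy);
    Ok(())
}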
@ -418,6 +454,10 @@ impl Recording {
));
}
/// Prepare a buffer for downloading.
///
/// Currently this copies to a download buffer. The original buffer can be freed
/// immediately after.
pub fn download(&mut self, buf: BufProxy) {
self.push(Command::Download(buf));
}
@ -425,6 +465,21 @@ impl Recording {
pub fn clear_all(&mut self, buf: BufProxy) {
self.push(Command::Clear(buf, 0, None));
}
pub fn free_buf(&mut self, buf: BufProxy) {
self.push(Command::FreeBuf(buf));
}
pub fn free_image(&mut self, image: ImageProxy) {
self.push(Command::FreeImage(image));
}
pub fn free_resource(&mut self, resource: ResourceProxy) {
match resource {
ResourceProxy::Buf(buf) => self.free_buf(buf),
ResourceProxy::Image(image) => self.free_image(image),
}
}
}
impl BufProxy {
@ -565,7 +620,7 @@ impl BindMap {
if let Entry::Vacant(v) = self.buf_map.entry(proxy.id) {
let usage =
BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
let buf = pool.get_buf(&proxy, usage, device);
let buf = pool.get_buf(proxy.size, proxy.name, usage, device);
v.insert(BindMapBuffer {
buffer: buf,
label: proxy.name,
@ -647,7 +702,7 @@ impl BindMap {
Entry::Occupied(occupied) => Ok(&occupied.into_mut().buffer),
Entry::Vacant(vacant) => {
let usage = BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE;
let buf = pool.get_buf(&proxy, usage, device);
let buf = pool.get_buf(proxy.size, proxy.name, usage, device);
Ok(&vacant
.insert(BindMapBuffer {
buffer: buf,
@ -659,53 +714,23 @@ impl BindMap {
}
}
pub struct DownloadsMapped<'a>(
HashMap<
Id,
(
BufferSlice<'a>,
GenericOneshotReceiver<RawMutex, Result<(), BufferAsyncError>>,
),
>,
);
impl Downloads {
// Discussion: should API change so we get one buffer, rather than mapping all?
pub fn map(&self) -> DownloadsMapped {
let mut map = HashMap::new();
for (id, buf) in &self.buf_map {
let buf_slice = buf.slice(..);
let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
buf_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
map.insert(*id, (buf_slice, receiver));
}
DownloadsMapped(map)
}
}
impl<'a> DownloadsMapped<'a> {
pub async fn get_mapped(&self, proxy: BufProxy) -> Result<BufferView, Error> {
let (slice, recv) = self.0.get(&proxy.id).ok_or("buffer not in map")?;
if let Some(recv_result) = recv.receive().await {
recv_result?;
} else {
return Err("channel was closed".into());
}
Ok(slice.get_mapped_range())
}
}
const SIZE_CLASS_BITS: u32 = 1;
impl ResourcePool {
/// Get a buffer from the pool or create one.
fn get_buf(&mut self, proxy: &BufProxy, usage: BufferUsages, device: &Device) -> Buffer {
let rounded_size = Self::size_class(proxy.size, SIZE_CLASS_BITS);
fn get_buf(
&mut self,
size: u64,
name: &'static str,
usage: BufferUsages,
device: &Device,
) -> Buffer {
let rounded_size = Self::size_class(size, SIZE_CLASS_BITS);
let props = BufferProperties {
size: rounded_size,
usages: usage,
#[cfg(feature = "buffer_labels")]
name: proxy.name,
name: name,
};
if let Some(buf_vec) = self.bufs.get_mut(&props) {
if let Some(buf) = buf_vec.pop() {
@ -714,7 +739,7 @@ impl ResourcePool {
}
device.create_buffer(&wgpu::BufferDescriptor {
#[cfg(feature = "buffer_labels")]
label: Some(proxy.name),
label: Some(name),
#[cfg(not(feature = "buffer_labels"))]
label: None,
size: rounded_size,
@ -723,19 +748,6 @@ impl ResourcePool {
})
}
fn reap_bindmap(&mut self, bind_map: BindMap) {
for (_id, buf) in bind_map.buf_map {
let size = buf.buffer.size();
let props = BufferProperties {
size,
usages: buf.buffer.usage(),
#[cfg(feature = "buffer_labels")]
name: buf.label,
};
self.bufs.entry(props).or_default().push(buf.buffer);
}
}
/// Quantize a size up to the nearest size class.
fn size_class(x: u64, bits: u32) -> u64 {
if x > 1 << bits {


@ -29,9 +29,11 @@ pub mod encoding;
pub mod glyph;
pub mod util;
use render::Render;
pub use scene::{Scene, SceneBuilder, SceneFragment};
pub use util::block_on_wgpu;
use engine::{Engine, ExternalResource};
use engine::{Engine, ExternalResource, Recording};
use shaders::FullShaders;
use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView};
@ -83,8 +85,7 @@ impl Renderer {
*target.as_image().unwrap(),
texture,
)];
let _ = self
.engine
self.engine
.run_recording(device, queue, &recording, &external_resources)?;
Ok(())
}
@ -164,6 +165,105 @@ impl Renderer {
self.shaders = shaders;
Ok(())
}
/// Renders a scene to the target texture.
///
/// The texture is assumed to be of the specified dimensions and have been created with
/// the [wgpu::TextureFormat::Rgba8Unorm] format and the [wgpu::TextureUsages::STORAGE_BINDING]
/// flag set.
pub async fn render_to_texture_async(
&mut self,
device: &Device,
queue: &Queue,
scene: &Scene,
texture: &TextureView,
width: u32,
height: u32,
) -> Result<()> {
let mut render = Render::new();
let encoding = scene.data();
let recording = render.render_encoding_coarse(encoding, &self.shaders, width, height, true);
let target = render.out_image();
let bump_buf = render.bump_buf();
self.engine.run_recording(device, queue, &recording, &[])?;
if let Some(bump_buf) = self.engine.get_download(bump_buf) {
let buf_slice = bump_buf.slice(..);
let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
buf_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
if let Some(recv_result) = receiver.receive().await {
recv_result?;
} else {
return Err("channel was closed".into());
}
let mapped = buf_slice.get_mapped_range();
println!("{:?}", bytemuck::cast_slice::<_, u32>(&mapped));
}
// TODO: apply logic to determine whether we need to rerun coarse, and also
// allocate the blend stack as needed.
self.engine.free_download(bump_buf);
// Maybe clear to reuse allocation?
let mut recording = Recording::default();
render.record_fine(&self.shaders, &mut recording);
let external_resources = [ExternalResource::Image(target, texture)];
self.engine
.run_recording(device, queue, &recording, &external_resources)?;
Ok(())
}
pub async fn render_to_surface_async(
&mut self,
device: &Device,
queue: &Queue,
scene: &Scene,
surface: &SurfaceTexture,
width: u32,
height: u32,
) -> Result<()> {
let mut target = self
.target
.take()
.unwrap_or_else(|| TargetTexture::new(device, width, height));
// TODO: implement clever resizing semantics here to avoid thrashing the memory allocator
// during resize, specifically on metal.
if target.width != width || target.height != height {
target = TargetTexture::new(device, width, height);
}
self.render_to_texture_async(device, queue, scene, &target.view, width, height)
.await?;
let mut encoder =
device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
{
let surface_view = surface
.texture
.create_view(&wgpu::TextureViewDescriptor::default());
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &self.blit.bind_layout,
entries: &[wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureView(&target.view),
}],
});
let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
label: None,
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
view: &surface_view,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Clear(wgpu::Color::default()),
store: true,
},
})],
depth_stencil_attachment: None,
});
render_pass.set_pipeline(&self.blit.pipeline);
render_pass.set_bind_group(0, &bind_group, &[]);
render_pass.draw(0..6, 0..1);
}
queue.submit(Some(encoder.finish()));
self.target = Some(target);
Ok(())
}
}
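
render_to_texture_async documents its target requirements above: Rgba8Unorm format with the STORAGE_BINDING usage flag. A sketch of creating a compatible texture view (hypothetical variable names; the descriptor fields follow the wgpu version in use here, while newer wgpu releases also require a view_formats field):

let texture = device.create_texture(&wgpu::TextureDescriptor {
    label: None,
    size: wgpu::Extent3d {
        width,
        height,
        depth_or_array_layers: 1,
    },
    mip_level_count: 1,
    sample_count: 1,
    dimension: wgpu::TextureDimension::D2,
    format: wgpu::TextureFormat::Rgba8Unorm,
    // STORAGE_BINDING lets fine rasterization write to the image;
    // TEXTURE_BINDING additionally allows sampling it, e.g. for a blit to a surface.
    usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::TEXTURE_BINDING,
});
let view = texture.create_view(&wgpu::TextureViewDescriptor::default());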
struct TargetTexture {


@ -9,6 +9,34 @@ use crate::{
Scene,
};
/// State for a render in progress.
pub struct Render {
/// Size of binning and info combined buffer in u32 units
binning_info_size: u32,
/// Size of tiles buf in tiles
tiles_size: u32,
/// Size of segments buf in segments
segments_size: u32,
/// Size of per-tile command list in u32 units
ptcl_size: u32,
width_in_tiles: u32,
height_in_tiles: u32,
fine: Option<FineResources>,
}
/// Resources produced by pipeline, needed for fine rasterization.
struct FineResources {
config_buf: ResourceProxy,
bump_buf: ResourceProxy,
tile_buf: ResourceProxy,
segments_buf: ResourceProxy,
ptcl_buf: ResourceProxy,
gradient_image: ResourceProxy,
info_bin_data_buf: ResourceProxy,
out_image: ImageProxy,
}
const TAG_MONOID_SIZE: u64 = 12;
const TAG_MONOID_FULL_SIZE: u64 = 20;
const PATH_BBOX_SIZE: u64 = 24;
@ -157,288 +185,389 @@ pub fn render_full(
render_encoding_full(scene.data(), shaders, width, height)
}
/// Create a single recording with both coarse and fine render stages.
///
/// This function is not recommended for complex scenes, as it does not
/// implement robust dynamic memory.
pub fn render_encoding_full(
encoding: &Encoding,
shaders: &FullShaders,
width: u32,
height: u32,
) -> (Recording, ResourceProxy) {
use crate::encoding::{resource::ResourceCache, PackedEncoding};
let mut recording = Recording::default();
let mut resources = ResourceCache::new();
let mut packed = PackedEncoding::default();
packed.pack(encoding, &mut resources);
let (ramp_data, ramps_width, ramps_height) = resources.ramps(packed.resources).unwrap();
let gradient_image = if encoding.patches.is_empty() {
ResourceProxy::new_image(1, 1, ImageFormat::Rgba8)
} else {
let data: &[u8] = bytemuck::cast_slice(ramp_data);
ResourceProxy::Image(recording.upload_image(
ramps_width,
ramps_height,
ImageFormat::Rgba8,
data,
))
};
// TODO: calculate for real when we do rectangles
let n_pathtag = encoding.path_tags.len();
let pathtag_padded = align_up(encoding.path_tags.len(), 4 * shaders::PATHTAG_REDUCE_WG);
let n_paths = encoding.n_paths;
let n_drawobj = n_paths;
let n_clip = encoding.n_clips;
let new_width = next_multiple_of(width, 16);
let new_height = next_multiple_of(height, 16);
let info_size = packed.layout.bin_data_start;
let config = crate::encoding::Config {
width_in_tiles: new_width / 16,
height_in_tiles: new_height / 16,
target_width: width,
target_height: height,
binning_size: ((1 << 20) / 4) - info_size,
tiles_size: (1 << 24) / TILE_SIZE as u32,
segments_size: (1 << 26) / SEGMENT_SIZE as u32,
ptcl_size: (1 << 25) / 4,
layout: packed.layout,
};
// println!("{:?}", config);
let scene_buf = ResourceProxy::Buf(recording.upload("scene", packed.data));
let config_buf =
ResourceProxy::Buf(recording.upload_uniform("config", bytemuck::bytes_of(&config)));
let info_bin_data_buf = ResourceProxy::new_buf(
(info_size + config.binning_size) as u64 * 4,
"info_bin_data_buf",
);
let tile_buf = ResourceProxy::new_buf(config.tiles_size as u64 * TILE_SIZE, "tile_buf");
let segments_buf =
ResourceProxy::new_buf(config.segments_size as u64 * SEGMENT_SIZE, "segments_buf");
let ptcl_buf = ResourceProxy::new_buf(config.ptcl_size as u64 * 4, "ptcl_buf");
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize;
let reduced_size = if pathtag_large {
align_up(pathtag_wgs, shaders::PATHTAG_REDUCE_WG)
} else {
pathtag_wgs
};
let reduced_buf =
ResourceProxy::new_buf(reduced_size as u64 * TAG_MONOID_FULL_SIZE, "reduced_buf");
// TODO: really only need pathtag_wgs - 1
recording.dispatch(
shaders.pathtag_reduce,
(pathtag_wgs as u32, 1, 1),
[config_buf, scene_buf, reduced_buf],
);
let mut pathtag_parent = reduced_buf;
if pathtag_large {
let reduced2_size = shaders::PATHTAG_REDUCE_WG as usize;
let reduced2_buf =
ResourceProxy::new_buf(reduced2_size as u64 * TAG_MONOID_FULL_SIZE, "reduced2_buf");
recording.dispatch(
shaders.pathtag_reduce2,
(reduced2_size as u32, 1, 1),
[reduced_buf, reduced2_buf],
);
let reduced_scan_buf = ResourceProxy::new_buf(
pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE,
"reduced_scan_buf",
);
recording.dispatch(
shaders.pathtag_scan1,
(reduced_size as u32 / shaders::PATHTAG_REDUCE_WG, 1, 1),
[reduced_buf, reduced2_buf, reduced_scan_buf],
);
pathtag_parent = reduced_scan_buf;
}
let tagmonoid_buf = ResourceProxy::new_buf(
pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE,
"tagmonoid_buf",
);
let pathtag_scan = if pathtag_large {
shaders.pathtag_scan_large
} else {
shaders.pathtag_scan
};
recording.dispatch(
pathtag_scan,
(pathtag_wgs as u32, 1, 1),
[config_buf, scene_buf, pathtag_parent, tagmonoid_buf],
);
let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
let path_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE, "path_bbox_buf");
recording.dispatch(
shaders.bbox_clear,
(drawobj_wgs, 1, 1),
[config_buf, path_bbox_buf],
);
let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE, "cubic_buf");
let path_coarse_wgs =
(n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
recording.dispatch(
shaders.pathseg,
(path_coarse_wgs, 1, 1),
[
config_buf,
scene_buf,
tagmonoid_buf,
path_bbox_buf,
cubic_buf,
],
);
let draw_reduced_buf =
ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE, "draw_reduced_buf");
recording.dispatch(
shaders.draw_reduce,
(drawobj_wgs, 1, 1),
[config_buf, scene_buf, draw_reduced_buf],
);
let draw_monoid_buf =
ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE, "draw_monoid_buf");
let clip_inp_buf =
ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_INP_SIZE, "clip_inp_buf");
recording.dispatch(
shaders.draw_leaf,
(drawobj_wgs, 1, 1),
[
config_buf,
scene_buf,
draw_reduced_buf,
path_bbox_buf,
draw_monoid_buf,
info_bin_data_buf,
clip_inp_buf,
],
);
let clip_el_buf = ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_EL_SIZE, "clip_el_buf");
let clip_bic_buf = ResourceProxy::new_buf(
(n_clip / shaders::CLIP_REDUCE_WG) as u64 * CLIP_BIC_SIZE,
"clip_bic_buf",
);
let clip_wg_reduce = n_clip.saturating_sub(1) / shaders::CLIP_REDUCE_WG;
if clip_wg_reduce > 0 {
recording.dispatch(
shaders.clip_reduce,
(clip_wg_reduce, 1, 1),
[
config_buf,
clip_inp_buf,
path_bbox_buf,
clip_bic_buf,
clip_el_buf,
],
);
}
let clip_wg = (n_clip + shaders::CLIP_REDUCE_WG - 1) / shaders::CLIP_REDUCE_WG;
let clip_bbox_buf = ResourceProxy::new_buf(n_clip as u64 * CLIP_BBOX_SIZE, "clip_bbox_buf");
if clip_wg > 0 {
recording.dispatch(
shaders.clip_leaf,
(clip_wg, 1, 1),
[
config_buf,
clip_inp_buf,
path_bbox_buf,
clip_bic_buf,
clip_el_buf,
draw_monoid_buf,
clip_bbox_buf,
],
);
}
let draw_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * DRAW_BBOX_SIZE, "draw_bbox_buf");
let bump_buf = BufProxy::new(BUMP_SIZE, "bump_buf");
let width_in_bins = (config.width_in_tiles + 15) / 16;
let height_in_bins = (config.height_in_tiles + 15) / 16;
let bin_header_buf = ResourceProxy::new_buf(
(256 * drawobj_wgs) as u64 * BIN_HEADER_SIZE,
"bin_header_buf",
);
recording.clear_all(bump_buf);
let bump_buf = ResourceProxy::Buf(bump_buf);
recording.dispatch(
shaders.binning,
(drawobj_wgs, 1, 1),
[
config_buf,
draw_monoid_buf,
path_bbox_buf,
clip_bbox_buf,
draw_bbox_buf,
bump_buf,
info_bin_data_buf,
bin_header_buf,
],
);
// Note: this only needs to be rounded up because of the workaround to store the tile_offset
// in storage rather than workgroup memory.
let n_path_aligned = align_up(n_paths as usize, 256);
let path_buf = ResourceProxy::new_buf(n_path_aligned as u64 * PATH_SIZE, "path_buf");
let path_wgs = (n_paths + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
recording.dispatch(
shaders.tile_alloc,
(path_wgs, 1, 1),
[
config_buf,
scene_buf,
draw_bbox_buf,
bump_buf,
path_buf,
tile_buf,
],
);
recording.dispatch(
shaders.path_coarse,
(path_coarse_wgs, 1, 1),
[
config_buf,
scene_buf,
tagmonoid_buf,
cubic_buf,
path_buf,
bump_buf,
tile_buf,
segments_buf,
],
);
recording.dispatch(
shaders.backdrop,
(path_wgs, 1, 1),
[config_buf, path_buf, tile_buf],
);
recording.dispatch(
shaders.coarse,
(width_in_bins, height_in_bins, 1),
[
config_buf,
scene_buf,
draw_monoid_buf,
bin_header_buf,
info_bin_data_buf,
path_buf,
tile_buf,
bump_buf,
ptcl_buf,
],
);
let out_image = ImageProxy::new(width, height, ImageFormat::Rgba8);
recording.dispatch(
shaders.fine,
(config.width_in_tiles, config.height_in_tiles, 1),
[
config_buf,
tile_buf,
segments_buf,
ResourceProxy::Image(out_image),
ptcl_buf,
gradient_image,
info_bin_data_buf,
],
);
(recording, ResourceProxy::Image(out_image))
let mut render = Render::new();
let mut recording = render.render_encoding_coarse(encoding, shaders, width, height, false);
let out_image = render.out_image();
render.record_fine(shaders, &mut recording);
(recording, out_image.into())
}
pub fn align_up(len: usize, alignment: u32) -> usize {
len + (len.wrapping_neg() & (alignment as usize - 1))
}
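// Worked example (illustrative only): for a power-of-two alignment,
// len.wrapping_neg() & (alignment - 1) is the padding to the next multiple, so
//     align_up(10, 4) == 12   // 10 + ((-10) & 3) = 10 + 2
//     align_up(16, 4) == 16   // already aligned, padding is 0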
impl Render {
pub fn new() -> Self {
// These sizes are adequate for paris-30k but should probably be dialed down.
Render {
binning_info_size: (1 << 20) / 4,
tiles_size: (1 << 24) / TILE_SIZE as u32,
segments_size: (1 << 26) / SEGMENT_SIZE as u32,
ptcl_size: (1 << 25) / 4 as u32,
width_in_tiles: 0,
height_in_tiles: 0,
fine: None,
}
}
/// Prepare a recording for the coarse rasterization phase.
///
/// The `robust` parameter controls whether the recording prepares for readback
/// of the atomic bump buffer, which is needed for robust dynamic memory.
pub fn render_encoding_coarse(
&mut self,
encoding: &Encoding,
shaders: &FullShaders,
width: u32,
height: u32,
robust: bool,
) -> Recording {
use crate::encoding::{resource::ResourceCache, PackedEncoding};
let mut recording = Recording::default();
let mut resources = ResourceCache::new();
let mut packed = PackedEncoding::default();
packed.pack(encoding, &mut resources);
let (ramp_data, ramps_width, ramps_height) = resources.ramps(packed.resources).unwrap();
let gradient_image = if encoding.patches.is_empty() {
ResourceProxy::new_image(1, 1, ImageFormat::Rgba8)
} else {
let data: &[u8] = bytemuck::cast_slice(ramp_data);
ResourceProxy::Image(recording.upload_image(
ramps_width,
ramps_height,
ImageFormat::Rgba8,
data,
))
};
// TODO: calculate for real when we do rectangles
let n_pathtag = encoding.path_tags.len();
let pathtag_padded = align_up(encoding.path_tags.len(), 4 * shaders::PATHTAG_REDUCE_WG);
let n_paths = encoding.n_paths;
let n_drawobj = n_paths;
let n_clip = encoding.n_clips;
let new_width = next_multiple_of(width, 16);
let new_height = next_multiple_of(height, 16);
let info_size = packed.layout.bin_data_start;
let config = crate::encoding::Config {
width_in_tiles: new_width / 16,
height_in_tiles: new_height / 16,
target_width: width,
target_height: height,
binning_size: self.binning_info_size - info_size,
tiles_size: self.tiles_size,
segments_size: self.segments_size,
ptcl_size: self.ptcl_size,
layout: packed.layout,
};
// println!("{:?}", config);
let scene_buf = ResourceProxy::Buf(recording.upload("scene", packed.data));
let config_buf =
ResourceProxy::Buf(recording.upload_uniform("config", bytemuck::bytes_of(&config)));
let info_bin_data_buf = ResourceProxy::new_buf(
(info_size + config.binning_size) as u64 * 4,
"info_bin_data_buf",
);
let tile_buf = ResourceProxy::new_buf(config.tiles_size as u64 * TILE_SIZE, "tile_buf");
let segments_buf =
ResourceProxy::new_buf(config.segments_size as u64 * SEGMENT_SIZE, "segments_buf");
let ptcl_buf = ResourceProxy::new_buf(config.ptcl_size as u64 * 4, "ptcl_buf");
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize;
let reduced_size = if pathtag_large {
align_up(pathtag_wgs, shaders::PATHTAG_REDUCE_WG)
} else {
pathtag_wgs
};
let reduced_buf =
ResourceProxy::new_buf(reduced_size as u64 * TAG_MONOID_FULL_SIZE, "reduced_buf");
// TODO: really only need pathtag_wgs - 1
recording.dispatch(
shaders.pathtag_reduce,
(pathtag_wgs as u32, 1, 1),
[config_buf, scene_buf, reduced_buf],
);
let mut pathtag_parent = reduced_buf;
let mut large_pathtag_bufs = None;
if pathtag_large {
let reduced2_size = shaders::PATHTAG_REDUCE_WG as usize;
let reduced2_buf =
ResourceProxy::new_buf(reduced2_size as u64 * TAG_MONOID_FULL_SIZE, "reduced2_buf");
recording.dispatch(
shaders.pathtag_reduce2,
(reduced2_size as u32, 1, 1),
[reduced_buf, reduced2_buf],
);
let reduced_scan_buf = ResourceProxy::new_buf(
pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE,
"reduced_scan_buf",
);
recording.dispatch(
shaders.pathtag_scan1,
(reduced_size as u32 / shaders::PATHTAG_REDUCE_WG, 1, 1),
[reduced_buf, reduced2_buf, reduced_scan_buf],
);
pathtag_parent = reduced_scan_buf;
large_pathtag_bufs = Some((reduced2_buf, reduced_scan_buf));
}
let tagmonoid_buf = ResourceProxy::new_buf(
pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE,
"tagmonoid_buf",
);
let pathtag_scan = if pathtag_large {
shaders.pathtag_scan_large
} else {
shaders.pathtag_scan
};
recording.dispatch(
pathtag_scan,
(pathtag_wgs as u32, 1, 1),
[config_buf, scene_buf, pathtag_parent, tagmonoid_buf],
);
recording.free_resource(reduced_buf);
if let Some((reduced2, reduced_scan)) = large_pathtag_bufs {
recording.free_resource(reduced2);
recording.free_resource(reduced_scan);
}
let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
let path_bbox_buf =
ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE, "path_bbox_buf");
recording.dispatch(
shaders.bbox_clear,
(drawobj_wgs, 1, 1),
[config_buf, path_bbox_buf],
);
let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE, "cubic_buf");
let path_coarse_wgs =
(n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
recording.dispatch(
shaders.pathseg,
(path_coarse_wgs, 1, 1),
[
config_buf,
scene_buf,
tagmonoid_buf,
path_bbox_buf,
cubic_buf,
],
);
let draw_reduced_buf =
ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE, "draw_reduced_buf");
recording.dispatch(
shaders.draw_reduce,
(drawobj_wgs, 1, 1),
[config_buf, scene_buf, draw_reduced_buf],
);
let draw_monoid_buf =
ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE, "draw_monoid_buf");
let clip_inp_buf =
ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_INP_SIZE, "clip_inp_buf");
recording.dispatch(
shaders.draw_leaf,
(drawobj_wgs, 1, 1),
[
config_buf,
scene_buf,
draw_reduced_buf,
path_bbox_buf,
draw_monoid_buf,
info_bin_data_buf,
clip_inp_buf,
],
);
recording.free_resource(draw_reduced_buf);
let clip_el_buf =
ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_EL_SIZE, "clip_el_buf");
let clip_bic_buf = ResourceProxy::new_buf(
(n_clip / shaders::CLIP_REDUCE_WG) as u64 * CLIP_BIC_SIZE,
"clip_bic_buf",
);
let clip_wg_reduce = n_clip.saturating_sub(1) / shaders::CLIP_REDUCE_WG;
if clip_wg_reduce > 0 {
recording.dispatch(
shaders.clip_reduce,
(clip_wg_reduce, 1, 1),
[
config_buf,
clip_inp_buf,
path_bbox_buf,
clip_bic_buf,
clip_el_buf,
],
);
}
let clip_wg = (n_clip + shaders::CLIP_REDUCE_WG - 1) / shaders::CLIP_REDUCE_WG;
let clip_bbox_buf = ResourceProxy::new_buf(n_clip as u64 * CLIP_BBOX_SIZE, "clip_bbox_buf");
if clip_wg > 0 {
recording.dispatch(
shaders.clip_leaf,
(clip_wg, 1, 1),
[
config_buf,
clip_inp_buf,
path_bbox_buf,
clip_bic_buf,
clip_el_buf,
draw_monoid_buf,
clip_bbox_buf,
],
);
}
recording.free_resource(clip_inp_buf);
recording.free_resource(clip_bic_buf);
recording.free_resource(clip_el_buf);
let draw_bbox_buf =
ResourceProxy::new_buf(n_paths as u64 * DRAW_BBOX_SIZE, "draw_bbox_buf");
let bump_buf = BufProxy::new(BUMP_SIZE, "bump_buf");
let width_in_bins = (config.width_in_tiles + 15) / 16;
let height_in_bins = (config.height_in_tiles + 15) / 16;
let bin_header_buf = ResourceProxy::new_buf(
(256 * drawobj_wgs) as u64 * BIN_HEADER_SIZE,
"bin_header_buf",
);
recording.clear_all(bump_buf);
let bump_buf = ResourceProxy::Buf(bump_buf);
recording.dispatch(
shaders.binning,
(drawobj_wgs, 1, 1),
[
config_buf,
draw_monoid_buf,
path_bbox_buf,
clip_bbox_buf,
draw_bbox_buf,
bump_buf,
info_bin_data_buf,
bin_header_buf,
],
);
recording.free_resource(draw_monoid_buf);
recording.free_resource(path_bbox_buf);
recording.free_resource(clip_bbox_buf);
// Note: this only needs to be rounded up because of the workaround to store the tile_offset
// in storage rather than workgroup memory.
let n_path_aligned = align_up(n_paths as usize, 256);
let path_buf = ResourceProxy::new_buf(n_path_aligned as u64 * PATH_SIZE, "path_buf");
let path_wgs = (n_paths + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
recording.dispatch(
shaders.tile_alloc,
(path_wgs, 1, 1),
[
config_buf,
scene_buf,
draw_bbox_buf,
bump_buf,
path_buf,
tile_buf,
],
);
recording.free_resource(draw_bbox_buf);
recording.dispatch(
shaders.path_coarse,
(path_coarse_wgs, 1, 1),
[
config_buf,
scene_buf,
tagmonoid_buf,
cubic_buf,
path_buf,
bump_buf,
tile_buf,
segments_buf,
],
);
recording.free_resource(tagmonoid_buf);
recording.free_resource(cubic_buf);
recording.dispatch(
shaders.backdrop,
(path_wgs, 1, 1),
[config_buf, path_buf, tile_buf],
);
recording.dispatch(
shaders.coarse,
(width_in_bins, height_in_bins, 1),
[
config_buf,
scene_buf,
draw_monoid_buf,
bin_header_buf,
info_bin_data_buf,
path_buf,
tile_buf,
bump_buf,
ptcl_buf,
],
);
recording.free_resource(scene_buf);
recording.free_resource(draw_monoid_buf);
recording.free_resource(bin_header_buf);
recording.free_resource(path_buf);
let out_image = ImageProxy::new(width, height, ImageFormat::Rgba8);
self.width_in_tiles = config.width_in_tiles;
self.height_in_tiles = config.height_in_tiles;
self.fine = Some(FineResources {
config_buf,
bump_buf,
tile_buf,
segments_buf,
ptcl_buf,
gradient_image,
info_bin_data_buf,
out_image,
});
if robust {
recording.download(*bump_buf.as_buf().unwrap());
}
recording.free_resource(bump_buf);
recording
}
/// Run fine rasterization assuming the coarse phase succeeded.
pub fn record_fine(&mut self, shaders: &FullShaders, recording: &mut Recording) {
let fine = self.fine.take().unwrap();
recording.dispatch(
shaders.fine,
(self.width_in_tiles, self.height_in_tiles, 1),
[
fine.config_buf,
fine.tile_buf,
fine.segments_buf,
ResourceProxy::Image(fine.out_image),
fine.ptcl_buf,
fine.gradient_image,
fine.info_bin_data_buf,
],
);
recording.free_resource(fine.config_buf);
recording.free_resource(fine.tile_buf);
recording.free_resource(fine.segments_buf);
recording.free_resource(fine.ptcl_buf);
recording.free_resource(fine.gradient_image);
recording.free_resource(fine.info_bin_data_buf);
}
/// Get the output image.
///
/// This is going away, as the caller will add the output image to the bind
/// map.
pub fn out_image(&self) -> ImageProxy {
self.fine.as_ref().unwrap().out_image
}
pub fn bump_buf(&self) -> BufProxy {
*self.fine.as_ref().unwrap().bump_buf.as_buf().unwrap()
}
}
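
Splitting the pipeline into render_encoding_coarse and record_fine lets a caller inspect the bump buffer between the two phases. A minimal sketch of driving the split directly, mirroring render_to_texture_async; encoding, shaders, width, and height are assumed to be in scope:

let mut render = Render::new();
// Coarse phase; robust = true also schedules a download of the bump buffer so
// the caller can check for memory overflow before committing to fine.
let coarse = render.render_encoding_coarse(encoding, shaders, width, height, true);
let bump_buf = render.bump_buf();
let out_image = render.out_image();
// ... run the coarse recording on the engine, read back bump_buf as in
// render_to_texture_async, and decide whether buffers need to be reallocated ...
// Fine phase, writing into out_image.
let mut fine = Recording::default();
render.record_fine(shaders, &mut fine);
// ... run the fine recording with out_image bound as an external resource ...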


@ -16,6 +16,8 @@
//! Simple helpers for managing wgpu state and surfaces.
use std::future::Future;
use super::Result;
use raw_window_handle::{HasRawDisplayHandle, HasRawWindowHandle};
@ -132,3 +134,27 @@ pub struct RenderSurface {
pub config: SurfaceConfiguration,
pub dev_id: usize,
}
struct NullWake;
impl std::task::Wake for NullWake {
fn wake(self: std::sync::Arc<Self>) {}
}
/// Block on a future, polling the device as needed.
///
/// This will deadlock if the future is awaiting anything other than GPU progress.
pub fn block_on_wgpu<F: Future>(device: &Device, mut fut: F) -> F::Output {
let waker = std::task::Waker::from(std::sync::Arc::new(NullWake));
let mut context = std::task::Context::from_waker(&waker);
// Same logic as `pin_mut!` macro from `pin_utils`.
let mut fut = unsafe { std::pin::Pin::new_unchecked(&mut fut) };
loop {
match fut.as_mut().poll(&mut context) {
std::task::Poll::Pending => {
device.poll(wgpu::Maintain::Wait);
}
std::task::Poll::Ready(item) => break item,
}
}
}
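
A usage sketch under assumed names (renderer, scene, texture_view): on native targets the async renderer entry points can be driven to completion without an async executor, since the only awaits are on GPU readbacks.

// Headless render without an async runtime; block_on_wgpu keeps polling the
// device with Maintain::Wait whenever the future reports Pending.
block_on_wgpu(
    &device,
    renderer.render_to_texture_async(&device, &queue, &scene, &texture_view, width, height),
)
.expect("failed to render to texture");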