Merge pull request #181 from linebender/mem2

Implement robust dynamic memory
Raph Levien 2022-07-14 07:27:38 -07:00 committed by GitHub
commit bfa4abf642
22 changed files with 994 additions and 634 deletions

View file

@ -20,7 +20,7 @@ jobs:
git switch main
git config user.name "Commit by GitHub Action"
git config user.email "nobody@example.com"
git merge dev -m "merge from dev branch"
git merge dev -m "merge from dev branch - ${{ github.ref_name }}"
sed -i '' '/shader\/gen/d' .gitignore
git add .gitignore
git rm -r --ignore-unmatch piet-gpu/shader/gen

View file

@ -20,7 +20,7 @@ use piet_gpu_hal::{
use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
use piet_gpu::{test_scenes, PietGpuRenderContext, RenderDriver, Renderer};
#[cfg_attr(target_os = "android", ndk_glue::main(backtrace = "on"))]
fn main() {
@ -34,12 +34,9 @@ struct MyHandle {
// State required to render and present the contents
struct GfxState {
session: Session,
renderer: Renderer,
render_driver: RenderDriver,
swapchain: Swapchain,
current_frame: usize,
submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES],
cmd_bufs: [Option<CmdBuf>; NUM_FRAMES],
query_pools: Vec<QueryPool>,
present_semaphores: Vec<Semaphore>,
}
@ -110,22 +107,15 @@ impl GfxState {
let present_semaphores = (0..NUM_FRAMES)
.map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(Renderer::QUERY_POOL_SIZE))
.collect::<Result<Vec<_>, Error>>()?;
let submitted = Default::default();
let cmd_bufs = Default::default();
let renderer = Renderer::new(&session, width, height, NUM_FRAMES)?;
let render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);
Ok(GfxState {
session,
renderer,
render_driver,
swapchain,
current_frame,
submitted,
cmd_bufs,
query_pools,
present_semaphores,
})
}
@ -137,51 +127,47 @@ impl GfxState {
let frame_idx = self.current_frame % NUM_FRAMES;
let mut info_string = String::new();
if let Some(submitted) = self.submitted[frame_idx].take() {
self.cmd_bufs[frame_idx] = submitted.wait().unwrap();
let ts = self
.session
.fetch_query_pool(&self.query_pools[frame_idx])
.unwrap();
info_string = format!("{:.1}ms", ts.last().unwrap() * 1e3);
println!("render time: {:?}", ts);
if self.current_frame >= NUM_FRAMES {
let stats = self
.render_driver
.get_timing_stats(&self.session, frame_idx);
info_string = stats.short_summary();
println!("{}", info_string);
}
let mut ctx = PietGpuRenderContext::new();
test_scenes::render_anim_frame(&mut ctx, self.current_frame);
//test_scenes::render_tiger(&mut ctx);
render_info_string(&mut ctx, &info_string);
if let Err(e) = self.renderer.upload_render_ctx(&mut ctx, frame_idx) {
if let Err(e) = self
.render_driver
.upload_render_ctx(&self.session, &mut ctx)
{
println!("error in uploading: {}", e);
}
let (image_idx, acquisition_semaphore) = self.swapchain.next().unwrap();
let swap_image = self.swapchain.image(image_idx);
let query_pool = &self.query_pools[frame_idx];
let mut cmd_buf = self.cmd_bufs[frame_idx]
.take()
.unwrap_or_else(|| self.session.cmd_buf().unwrap());
cmd_buf.begin();
self.renderer.record(&mut cmd_buf, &query_pool, frame_idx);
self.render_driver.run_coarse(&self.session).unwrap();
let target = self.render_driver.record_fine(&self.session).unwrap();
let cmd_buf = target.cmd_buf;
// Image -> Swapchain
cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst);
cmd_buf.blit_image(&self.renderer.image_dev, &swap_image);
cmd_buf.blit_image(target.image, &swap_image);
cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish();
self.submitted[frame_idx] = Some(
self.session
.run_cmd_buf(
cmd_buf,
&[&acquisition_semaphore],
&[&self.present_semaphores[frame_idx]],
)
.unwrap(),
);
self.render_driver
.submit(
&self.session,
&[&acquisition_semaphore],
&[&self.present_semaphores[frame_idx]],
)
.unwrap();
self.swapchain
.present(image_idx, &[&self.present_semaphores[frame_idx]])
.unwrap();
self.render_driver.next_buffer();
self.current_frame += 1;
}
}
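
For orientation, a condensed sketch (in Rust) of the per-frame protocol this example now follows. Only methods visible in this diff are used; swapchain acquisition, semaphores, and the blit are elided, and the retry-on-overflow behavior mentioned in the comments lives in render_driver.rs, which this page does not show.

use piet_gpu::{PietGpuRenderContext, RenderDriver};
use piet_gpu_hal::{Error, Session};

unsafe fn frame(
    driver: &mut RenderDriver,
    session: &Session,
    ctx: &mut PietGpuRenderContext,
) -> Result<(), Error> {
    driver.upload_render_ctx(session, ctx)?; // stage scene, config, memory header
    driver.run_coarse(session)?; // coarse pipeline; reruns if an allocation failed
    let target = driver.record_fine(session)?; // fine raster into target.cmd_buf
    // ... blit target.image to the swapchain image via target.cmd_buf ...
    driver.submit(session, &[], &[])?; // wait/signal semaphores elided here
    driver.next_buffer(); // advance to the next in-flight buffer slot
    Ok(())
}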

View file

@ -6,7 +6,7 @@ use clap::{App, Arg};
use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, RenderDriver, Renderer};
const WIDTH: usize = 2048;
const HEIGHT: usize = 1536;
@ -231,9 +231,6 @@ fn main() -> Result<(), Error> {
let device = instance.device(None)?;
let session = Session::new(device);
let mut cmd_buf = session.cmd_buf()?;
let query_pool = session.create_query_pool(Renderer::QUERY_POOL_SIZE)?;
let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") {
let mut scale = matches
@ -253,40 +250,22 @@ fn main() -> Result<(), Error> {
test_scenes::render_blend_grid(&mut ctx);
}
let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
renderer.upload_render_ctx(&mut ctx, 0)?;
let renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
let mut render_driver = RenderDriver::new(&session, 1, renderer);
let start = std::time::Instant::now();
render_driver.upload_render_ctx(&session, &mut ctx)?;
let image_usage = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
let image_buf = session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?;
cmd_buf.begin();
renderer.record(&mut cmd_buf, &query_pool, 0);
cmd_buf.copy_image_to_buffer(&renderer.image_dev, &image_buf);
cmd_buf.finish_timestamps(&query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();
let start = std::time::Instant::now();
let submitted = session.run_cmd_buf(cmd_buf, &[], &[])?;
submitted.wait()?;
render_driver.run_coarse(&session)?;
let target = render_driver.record_fine(&session)?;
target
.cmd_buf
.copy_image_to_buffer(target.image, &image_buf);
render_driver.submit(&session, &[], &[])?;
render_driver.wait(&session);
println!("elapsed = {:?}", start.elapsed());
let ts = session.fetch_query_pool(&query_pool).unwrap();
if !ts.is_empty() {
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!(
"Tile allocation kernel time: {:.3}ms",
(ts[1] - ts[0]) * 1e3
);
println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
println!("Backdrop kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
println!("Binning kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
}
/*
let mut data: Vec<u32> = Default::default();
renderer.memory_buf_dev.read(&mut data).unwrap();
piet_gpu::dump_k1_data(&data[2..]);
*/
render_driver.get_timing_stats(&session, 0).print_summary();
let mut img_data: Vec<u8> = Default::default();
// Note: because png can use a `&[u8]` slice, we could avoid an extra copy

View file

@ -1,8 +1,8 @@
use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
use piet_gpu_hal::{Error, ImageLayout, Instance, Session};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, RenderDriver, Renderer};
use clap::{App, Arg};
@ -69,13 +69,9 @@ fn main() -> Result<(), Error> {
let present_semaphores = (0..NUM_FRAMES)
.map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(Renderer::QUERY_POOL_SIZE))
.collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
let renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
let mut render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);
let mut mode = 0usize;
event_loop.run(move |event, _, control_flow| {
@ -106,26 +102,13 @@ fn main() -> Result<(), Error> {
Event::RedrawRequested(window_id) if window_id == window.id() => {
let frame_idx = current_frame % NUM_FRAMES;
if let Some(submitted) = submitted[frame_idx].take() {
cmd_bufs[frame_idx] = submitted.wait().unwrap();
let ts = session.fetch_query_pool(&query_pools[frame_idx]).unwrap();
if !ts.is_empty() {
info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[10] * 1e3,
ts[0] * 1e3,
(ts[1] - ts[0]) * 1e3,
(ts[2] - ts[1]) * 1e3,
(ts[4] - ts[3]) * 1e3,
(ts[6] - ts[5]) * 1e3,
(ts[8] - ts[7]) * 1e3,
(ts[10] - ts[9]) * 1e3,
);
}
if current_frame >= NUM_FRAMES {
let stats = render_driver.get_timing_stats(&session, frame_idx);
info_string = stats.short_summary();
}
let mut ctx = PietGpuRenderContext::new();
let test_blend = true;
let test_blend = false;
if let Some(svg) = &svg {
test_scenes::render_svg(&mut ctx, svg);
} else if test_blend {
@ -168,16 +151,15 @@ fn main() -> Result<(), Error> {
test_scenes::render_anim_frame(&mut ctx, current_frame);
}
render_info_string(&mut ctx, &info_string);
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
if let Err(e) = render_driver.upload_render_ctx(&session, &mut ctx) {
println!("error in uploading: {}", e);
}
let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
let swap_image = swapchain.image(image_idx);
let query_pool = &query_pools[frame_idx];
let mut cmd_buf = cmd_bufs[frame_idx].take().unwrap_or_else(|| session.cmd_buf().unwrap());
cmd_buf.begin();
renderer.record(&mut cmd_buf, &query_pool, frame_idx);
render_driver.run_coarse(&session).unwrap();
let target = render_driver.record_fine(&session).unwrap();
let cmd_buf = target.cmd_buf;
// Image -> Swapchain
cmd_buf.image_barrier(
@ -185,32 +167,25 @@ fn main() -> Result<(), Error> {
ImageLayout::Undefined,
ImageLayout::BlitDst,
);
cmd_buf.blit_image(&renderer.image_dev, &swap_image);
cmd_buf.blit_image(target.image, &swap_image);
cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish();
submitted[frame_idx] = Some(session
.run_cmd_buf(
cmd_buf,
render_driver
.submit(
&session,
&[&acquisition_semaphore],
&[&present_semaphores[frame_idx]],
)
.unwrap());
.unwrap();
swapchain
.present(image_idx, &[&present_semaphores[frame_idx]])
.unwrap();
render_driver.next_buffer();
current_frame += 1;
}
Event::LoopDestroyed => {
for cmd_buf in &mut submitted {
// Wait for command list submission, otherwise dropping of renderer may
// cause validation errors (and possibly crashes).
if let Some(cmd_buf) = cmd_buf.take() {
cmd_buf.wait().unwrap();
}
}
render_driver.wait_all(&session);
}
_ => (),
}

View file

@ -45,12 +45,15 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG];
void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x;
// Work assignment: 1 thread : 1 path element
uint row_count = 0;
bool mem_ok = mem_error == NO_ERROR;
if (gl_LocalInvocationID.y == 0) {
if (element_ix < conf.n_elements) {
// Possible TODO: it's not necessary to process backdrops of stroked paths.
@ -68,7 +71,7 @@ void main() {
row_count = 0;
}
Alloc path_alloc = new_alloc(
path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
sh_row_alloc[th_ix] = path_alloc;
}
sh_row_count[th_ix] = row_count;
@ -98,7 +101,7 @@ void main() {
}
}
uint width = sh_row_width[el_ix];
if (width > 0 && mem_ok) {
if (width > 0) {
// Process one row sequentially
// Read backdrop value per tile and prefix sum it
Alloc tiles_alloc = sh_row_alloc[el_ix];

View file

@ -32,8 +32,7 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;
shared uint sh_chunk_offset[N_TILE];
DrawMonoid load_draw_monoid(uint element_ix) {
uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
@ -84,10 +83,6 @@ void main() {
for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0;
}
if (gl_LocalInvocationID.x == 0) {
sh_alloc_failed = false;
}
barrier();
// Read inputs and determine coverage of bins
uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
@ -148,26 +143,18 @@ void main() {
count[i][gl_LocalInvocationID.x] = element_count;
}
// element_count is number of elements covering bin for this invocation.
Alloc chunk_alloc = new_alloc(0, 0, true);
uint chunk_offset = 0;
if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest)
MallocResult chunk = malloc(element_count * BinInstance_size);
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed) {
sh_alloc_failed = true;
}
chunk_offset = malloc_stage(element_count * BinInstance_size, conf.mem_size, STAGE_BINNING);
sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
}
// Note: it might be more efficient for reading to do this in the
// other order (each bin is a contiguous sequence of partitions)
uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
write_mem(conf.bin_alloc, out_ix, element_count);
write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
write_mem(conf.bin_alloc, out_ix + 1, chunk_offset);
barrier();
if (sh_alloc_failed || mem_error != NO_ERROR) {
return;
}
// Use similar strategy as Laine & Karras paper; loop over bbox of bins
// touched by this element
@ -181,9 +168,10 @@ void main() {
if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix];
}
Alloc out_alloc = sh_chunk_alloc[bin_ix];
uint out_offset = out_alloc.offset + idx * BinInstance_size;
BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
uint chunk_offset = sh_chunk_offset[bin_ix];
if (chunk_offset != MALLOC_FAILED) {
memory[(chunk_offset >> 2) + idx] = element_ix;
}
}
x++;
if (x == x1) {
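
Binning now records a raw chunk offset per bin instead of an Alloc, with failure signaled in-band: offset 0 (MALLOC_FAILED) can never be a real allocation because the memory header lives there, so writers just test the offset before storing. A minimal CPU-side model of that sentinel pattern in Rust (names mirror the shader, but this is a sketch, not the shader itself):

// BinInstance is a single u32 element index, so each write is one word.
const MALLOC_FAILED: u32 = 0; // offset 0 holds the header, never an allocation

fn write_bin_instance(memory: &mut [u32], chunk_offset: u32, idx: u32, element_ix: u32) {
    if chunk_offset != MALLOC_FAILED {
        memory[(chunk_offset >> 2) as usize + idx as usize] = element_ix;
    }
}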

View file

@ -72,49 +72,62 @@ void write_tile_alloc(uint el_ix, Alloc a) {
Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
// All memory.
return new_alloc(0, memory.length() * 4, mem_ok);
return new_alloc(0, conf.mem_size, mem_ok);
}
#endif
// The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2
// Perhaps cmd_alloc should be a global? This is a style question.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
// All writes to the output must be gated by mem_ok.
bool mem_ok = true;
// Perhaps the cmd allocation state should be global? This is a style question.
void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset < cmd_limit) {
return true;
return;
}
MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
if (new_cmd.failed) {
return false;
uint new_cmd = malloc_stage(PTCL_INITIAL_ALLOC, conf.mem_size, STAGE_COARSE);
if (new_cmd == MALLOC_FAILED) {
mem_ok = false;
}
CmdJump jump = CmdJump(new_cmd.alloc.offset);
Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
cmd_alloc = new_cmd.alloc;
cmd_ref = CmdRef(cmd_alloc.offset);
if (mem_ok) {
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
}
cmd_alloc = new_alloc(new_cmd, PTCL_INITIAL_ALLOC, true);
cmd_ref = CmdRef(new_cmd);
// Reserve space for the maximum number of commands and a potential jump.
cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
return true;
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
}
void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
if (linewidth < 0.0) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
if (mem_ok) {
Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
}
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(alloc, cmd_ref);
if (mem_ok) {
Cmd_Solid_write(alloc, cmd_ref);
}
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
if (mem_ok) {
Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
}
cmd_ref.offset += 4 + CmdStroke_size;
}
}
void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
@ -161,7 +174,6 @@ void main() {
uint drawtag_start = conf.drawtag_offset >> 2;
uint drawdata_start = conf.drawdata_offset >> 2;
uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
bool mem_ok = mem_error == NO_ERROR;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
@ -176,7 +188,7 @@ void main() {
uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = read_mem(conf.bin_alloc, in_ix);
uint offset = read_mem(conf.bin_alloc, in_ix + 1);
sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, mem_ok);
sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, true);
}
// prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) {
@ -200,7 +212,7 @@ void main() {
}
// use binary search to find element to read
uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix && mem_ok) {
if (ix >= wr_ix && ix < ready_ix) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
@ -257,7 +269,7 @@ void main() {
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base;
Alloc path_alloc = new_alloc(path.tiles.offset,
(path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
(path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
write_tile_alloc(th_ix, path_alloc);
break;
default:
@ -293,27 +305,25 @@ void main() {
uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width;
bool include_tile = false;
if (mem_ok) {
Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
bool is_clip = (tag & 1) != 0;
// Always include the tile if it contains a path segment.
// For draws, include the tile if it is solid.
// For clips, include the tile if it is empty - this way, logic
// below will suppress the drawing of inner elements.
// For blends, include the tile if
// (blend_mode, composition_mode) != (Normal, SrcOver)
bool is_blend = false;
if (is_clip) {
uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
uint scene_offset = memory[drawmonoid_base + 2];
uint dd = drawdata_start + (scene_offset >> 2);
uint blend = scene[dd];
is_blend = (blend != BlendComp_clip);
}
include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
|| is_blend;
Tile tile = Tile_read(read_tile_alloc(el_ix, true),
TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
bool is_clip = (tag & 1) != 0;
// Always include the tile if it contains a path segment.
// For draws, include the tile if it is solid.
// For clips, include the tile if it is empty - this way, logic
// below will suppress the drawing of inner elements.
// For blends, include the tile if
// (blend_mode, composition_mode) != (Normal, SrcOver)
bool is_blend = false;
if (is_clip) {
uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
uint scene_offset = memory[drawmonoid_base + 2];
uint dd = drawdata_start + (scene_offset >> 2);
uint blend = scene[dd];
is_blend = (blend != BlendComp_clip);
}
include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
|| is_blend;
if (include_tile) {
uint el_slice = el_ix / 32;
uint el_mask = 1u << (el_ix & 31);
@ -327,7 +337,7 @@ void main() {
// through the draw objects.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
while (mem_ok) {
while (true) {
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
@ -347,7 +357,7 @@ void main() {
uint drawtag = scene[drawtag_start + element_ix];
if (clip_zero_depth == 0) {
Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
Tile tile = Tile_read(read_tile_alloc(element_ref_ix, true),
TileRef(sh_tile_base[element_ref_ix] +
(sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@ -358,18 +368,16 @@ void main() {
switch (drawtag) {
case Drawtag_FillColor:
float linewidth = uintBitsToFloat(memory[di]);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint rgba = scene[dd];
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
if (mem_ok) {
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
}
cmd_ref.offset += 4 + CmdColor_size;
break;
case Drawtag_FillLinGradient:
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdLinGrad cmd_lin;
@ -377,13 +385,13 @@ void main() {
cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
if (mem_ok) {
Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
}
cmd_ref.offset += 4 + CmdLinGrad_size;
break;
case Drawtag_FillRadGradient:
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdRadGrad cmd_rad;
@ -396,29 +404,31 @@ void main() {
cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
if (mem_ok) {
Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
}
cmd_ref.offset += 4 + CmdRadGrad_size;
break;
case Drawtag_FillImage:
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint index = scene[dd];
uint raw1 = scene[dd + 1];
ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
if (mem_ok) {
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
}
cmd_ref.offset += 4 + CmdImage_size;
break;
case Drawtag_BeginClip:
if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1;
} else {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
if (mem_ok) {
Cmd_BeginClip_write(cmd_alloc, cmd_ref);
}
Cmd_BeginClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
render_blend_depth++;
max_blend_depth = max(max_blend_depth, render_blend_depth);
@ -427,12 +437,11 @@ void main() {
break;
case Drawtag_EndClip:
clip_depth--;
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tile, -1.0);
uint blend = scene[dd];
Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
if (mem_ok) {
Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
}
cmd_ref.offset += 4 + CmdEndClip_size;
render_blend_depth--;
break;
@ -459,11 +468,13 @@ void main() {
break;
}
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
Cmd_End_write(cmd_alloc, cmd_ref);
if (mem_ok) {
Cmd_End_write(cmd_alloc, cmd_ref);
}
if (max_blend_depth > BLEND_STACK_SPLIT) {
uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
MallocResult scratch = malloc(scratch_size);
alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
uint scratch = atomicAdd(blend_offset, scratch_size);
write_mem(scratch_alloc, scratch_alloc.offset >> 2, scratch);
}
}
}
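
Note the shape of the change in this file: coarse rasterization no longer early-outs when a ptcl chunk fails to allocate. A single mem_ok flag gates every store while the command cursor keeps advancing, so control flow stays uniform across the workgroup and the stage runs to completion either way. A minimal CPU model of that gate-writes, keep-advancing pattern, in Rust:

struct CmdWriter {
    buf: Vec<u32>,
    offset: usize,
    mem_ok: bool, // cleared when malloc_stage returns MALLOC_FAILED
}

impl CmdWriter {
    fn write(&mut self, words: &[u32]) {
        if self.mem_ok {
            // Stores happen only while the backing allocation is valid.
            self.buf[self.offset..self.offset + words.len()].copy_from_slice(words);
        }
        // The cursor advances unconditionally, failure or not.
        self.offset += words.len();
    }
}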

piet-gpu/shader/image.png: new binary file, 337 KiB (not shown)

View file

@ -14,6 +14,7 @@
// higher quality antialiasing among other things).
#define DO_SRGB_CONVERSION 0
// TODO: the binding of the main buffer can be readonly
#include "mem.h"
#include "setup.h"
@ -24,19 +25,23 @@
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
layout(set = 0, binding = 1) restrict readonly buffer ConfigBuf {
layout(binding = 1) restrict readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) buffer BlendBuf {
uint blend_mem[];
};
#ifdef GRAY
layout(r8, set = 0, binding = 2) uniform restrict writeonly image2D image;
layout(r8, binding = 3) uniform restrict writeonly image2D image;
#else
layout(rgba8, set = 0, binding = 2) uniform restrict writeonly image2D image;
layout(rgba8, binding = 3) uniform restrict writeonly image2D image;
#endif
layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D image_atlas;
layout(rgba8, binding = 4) uniform restrict readonly image2D image_atlas;
layout(rgba8, set = 0, binding = 4) uniform restrict readonly image2D gradients;
layout(rgba8, binding = 5) uniform restrict readonly image2D gradients;
#include "ptcl.h"
#include "tile.h"
@ -114,8 +119,9 @@ void main() {
mediump float area[CHUNK];
uint clip_depth = 0;
bool mem_ok = mem_error == NO_ERROR;
while (mem_ok) {
// Previously we would early-out if there was a memory failure, so we wouldn't try to read corrupt
// tiles. But now we assume this is checked CPU-side before launching fine rasterization.
while (true) {
uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
if (tag == Cmd_End) {
break;
@ -129,7 +135,7 @@ void main() {
df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
vec2 line_vec = seg.vector;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
@ -151,7 +157,7 @@ void main() {
tile_seg_ref = TileSegRef(fill.tile_ref);
// Calculate coverage based on backdrop + coverage of each line segment
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy;
@ -248,7 +254,7 @@ void main() {
uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
for (uint k = 0; k < CHUNK; k++) {
memory[base_ix + k] = packsRGB(vec4(rgba[k]));
blend_mem[base_ix + k] = packsRGB(vec4(rgba[k]));
rgba[k] = vec4(0.0);
}
}
@ -268,7 +274,7 @@ void main() {
if (clip_depth < BLEND_STACK_SPLIT) {
bg_rgba = blend_stack[clip_depth][k];
} else {
bg_rgba = memory[base_ix + k];
bg_rgba = blend_mem[base_ix + k];
}
mediump vec4 bg = unpacksRGB(bg_rgba);
mediump vec4 fg = rgba[k] * area[k];
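
The blend stack spill now lives in its own BlendBuf binding rather than in the general memory buffer, sized by the scratch formula coarse.comp uses above. A hedged Rust sketch of that accounting; BLEND_STACK_SPLIT and the 16-pixel tile size appear elsewhere in this diff, while CLIP_STATE_SIZE is an assumption of the sketch:

const TILE_WIDTH_PX: u64 = 16;
const TILE_HEIGHT_PX: u64 = 16;
const CLIP_STATE_SIZE: u64 = 2; // words of blend state per pixel (assumed)
const BLEND_STACK_SPLIT: u64 = 4; // depths up to this stay in blend_stack

fn blend_scratch_bytes(max_blend_depth: u64) -> u64 {
    if max_blend_depth > BLEND_STACK_SPLIT {
        // Bump-allocated via atomicAdd on blend_offset in the memory header.
        max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4
    } else {
        0 // shallow stacks never spill
    }
}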

View file

@ -3,27 +3,23 @@
layout(set = 0, binding = 0) buffer Memory {
// offset into memory of the next allocation, initialized by the user.
uint mem_offset;
// mem_error tracks the status of memory accesses, initialized to NO_ERROR
// by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
// If MEM_DEBUG is defined the following errors are reported:
// - ERR_OUT_OF_BOUNDS is reported for out of bounds writes.
// - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words.
// mem_error is a bitmask of stages that have failed allocation.
uint mem_error;
// offset into blend memory of allocations for blend stack.
uint blend_offset;
uint[] memory;
};
// Uncomment this line to add the size field to Alloc and enable memory checks.
// Note that the Config struct in setup.h grows size fields as well.
//#define MEM_DEBUG
#define NO_ERROR 0
#define ERR_MALLOC_FAILED 1
#define ERR_OUT_OF_BOUNDS 2
#define ERR_UNALIGNED_ACCESS 3
// This setting is not working and the mechanism will be removed.
//#define MEM_DEBUG
#ifdef MEM_DEBUG
#define Alloc_size 16
#else
// TODO: this seems wrong
#define Alloc_size 8
#endif
@ -37,12 +33,6 @@ struct Alloc {
#endif
};
struct MallocResult {
Alloc alloc;
// failed is true if the allocation overflowed memory.
bool failed;
};
// new_alloc synthesizes an Alloc from an offset and size.
Alloc new_alloc(uint offset, uint size, bool mem_ok) {
Alloc a;
@ -57,24 +47,32 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) {
return a;
}
// malloc allocates size bytes of memory.
MallocResult malloc(uint size) {
MallocResult r;
#define STAGE_BINNING (1u << 0)
#define STAGE_TILE_ALLOC (1u << 1)
#define STAGE_PATH_COARSE (1u << 2)
#define STAGE_COARSE (1u << 3)
// Allocations in main memory will never be 0, and this might be slightly
// faster to test against than some other value.
#define MALLOC_FAILED 0
// Check that previous dependent stages have succeeded.
bool check_deps(uint dep_stage) {
// TODO: this should be an atomic relaxed load, but that involves
// bringing in "memory scope semantics"
return (atomicOr(mem_error, 0) & dep_stage) == 0;
}
// Allocate size bytes of memory, offset in bytes.
// Note: with a bit of rearrangement of header files, we could make the
// mem_size argument go away (it comes from the config binding).
uint malloc_stage(uint size, uint mem_size, uint stage) {
uint offset = atomicAdd(mem_offset, size);
r.failed = offset + size > memory.length() * 4;
r.alloc = new_alloc(offset, size, !r.failed);
if (r.failed) {
atomicMax(mem_error, ERR_MALLOC_FAILED);
return r;
if (offset + size > mem_size) {
atomicOr(mem_error, stage);
offset = MALLOC_FAILED;
}
#ifdef MEM_DEBUG
if ((size & 3) != 0) {
r.failed = true;
atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
return r;
}
#endif
return r;
return offset;
}
// touch_mem checks whether access to the memory word at offset is valid.
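
This is the heart of the new scheme: malloc_stage replaces the old MallocResult-returning malloc, failure is recorded as a per-stage bit in mem_error, and each later stage early-outs through check_deps when anything it depends on overflowed. A CPU-side model of the same protocol in Rust (a sketch for illustration; the GLSL above is authoritative):

use std::sync::atomic::{AtomicU32, Ordering};

const STAGE_BINNING: u32 = 1 << 0;
const STAGE_TILE_ALLOC: u32 = 1 << 1;
const STAGE_PATH_COARSE: u32 = 1 << 2;
const STAGE_COARSE: u32 = 1 << 3;
const MALLOC_FAILED: u32 = 0; // offset 0 holds the header, never an allocation

struct GpuMemory {
    mem_offset: AtomicU32, // bump-allocator cursor
    mem_error: AtomicU32,  // bitmask of stages whose allocation overflowed
}

impl GpuMemory {
    fn malloc_stage(&self, size: u32, mem_size: u32, stage: u32) -> u32 {
        let offset = self.mem_offset.fetch_add(size, Ordering::Relaxed);
        if offset + size > mem_size {
            // Record which stage failed; the CPU reads this back, grows the
            // buffer, and reruns the pipeline.
            self.mem_error.fetch_or(stage, Ordering::Relaxed);
            return MALLOC_FAILED;
        }
        offset
    }

    fn check_deps(&self, dep_stage: u32) -> bool {
        self.mem_error.load(Ordering::Relaxed) & dep_stage == 0
    }
}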

View file

@ -87,7 +87,13 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
return SubdivResult(val, a0, a2);
}
// All writes to the output must be gated by mem_ok.
bool mem_ok = true;
void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
uint element_ix = gl_GlobalInvocationID.x;
PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
@ -95,24 +101,10 @@ void main() {
if (element_ix < conf.n_pathseg) {
tag = PathSeg_tag(conf.pathseg_alloc, ref);
}
bool mem_ok = mem_error == NO_ERROR;
switch (tag.tag) {
case PathSeg_Cubic:
PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
// Affine transform is now applied in pathseg
/*
uint trans_ix = cubic.trans_ix;
if (trans_ix > 0) {
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
}
*/
vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
float err = err_v.x * err_v.x + err_v.y * err_v.y;
// The number of quadratics.
@ -140,7 +132,7 @@ void main() {
uint path_ix = cubic.path_ix;
Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
Alloc path_alloc =
new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
ivec4 bbox = ivec4(path.bbox);
vec2 p0 = cubic.p0;
qp0 = cubic.p0;
@ -199,11 +191,12 @@ void main() {
// TODO: can be tighter, use c to bound width
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add.
MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
if (tile_alloc.failed || !mem_ok) {
return;
uint malloc_size = n_tile_alloc * TileSeg_size;
uint tile_offset = malloc_stage(malloc_size, conf.mem_size, STAGE_PATH_COARSE);
if (tile_offset == MALLOC_FAILED) {
mem_ok = false;
}
uint tile_offset = tile_alloc.alloc.offset;
Alloc tile_alloc = new_alloc(tile_offset, malloc_size, true);
TileSeg tile_seg;
@ -221,9 +214,7 @@ void main() {
int backdrop = p1.y < p0.y ? 1 : -1;
TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
uint tile_el = tile_ref.offset >> 2;
if (touch_mem(path_alloc, tile_el + 1)) {
atomicAdd(memory[tile_el + 1], backdrop);
}
atomicAdd(memory[tile_el + 1], backdrop);
}
// next_xray is the xray for the next scanline; the line segment intersects
@ -247,9 +238,7 @@ void main() {
TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
uint tile_el = tile_ref.offset >> 2;
uint old = 0;
if (touch_mem(path_alloc, tile_el)) {
old = atomicExchange(memory[tile_el], tile_offset);
}
old = atomicExchange(memory[tile_el], tile_offset);
tile_seg.origin = p0;
tile_seg.vector = p1 - p0;
float y_edge = 0.0;
@ -276,7 +265,9 @@ void main() {
}
tile_seg.y_edge = y_edge;
tile_seg.next.offset = old;
TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
if (mem_ok) {
TileSeg_write(tile_alloc, TileSegRef(tile_offset), tile_seg);
}
tile_offset += TileSeg_size;
}
xc += b;

View file

@ -31,8 +31,9 @@
// to memory for the overflow.
#define BLEND_STACK_SPLIT 4
#ifdef ERR_MALLOC_FAILED
#ifdef MALLOC_FAILED
struct Config {
uint mem_size; // in bytes
uint n_elements; // paths
uint n_pathseg;
uint width_in_tiles;

View file

@ -29,7 +29,7 @@ layout(binding = 2) readonly buffer SceneBuf {
#define SY (1.0 / float(TILE_HEIGHT_PX))
shared uint sh_tile_count[TILE_ALLOC_WG];
shared MallocResult sh_tile_alloc;
shared uint sh_tile_offset;
vec4 load_draw_bbox(uint draw_ix) {
uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
@ -42,6 +42,9 @@ vec4 load_draw_bbox(uint draw_ix) {
}
void main() {
if (!check_deps(STAGE_BINNING)) {
return;
}
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
// At the moment, element_ix == path_ix. The clip-intersected bounding boxes
@ -86,27 +89,24 @@ void main() {
sh_tile_count[th_ix] = total_tile_count;
}
if (th_ix == TILE_ALLOC_WG - 1) {
sh_tile_alloc = malloc(total_tile_count * Tile_size);
sh_tile_offset = malloc_stage(total_tile_count * Tile_size, conf.mem_size, STAGE_TILE_ALLOC);
}
barrier();
MallocResult alloc_start = sh_tile_alloc;
if (alloc_start.failed || mem_error != NO_ERROR) {
uint offset_start = sh_tile_offset;
if (offset_start == MALLOC_FAILED) {
return;
}
if (element_ix < conf.n_elements) {
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
path.tiles = TileRef(tiles_alloc.offset);
path.tiles = TileRef(offset_start + Tile_size * tile_subix);
Path_write(conf.tile_alloc, path_ref, path);
}
// Zero out allocated tiles efficiently
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
uint start_ix = alloc_start.alloc.offset >> 2;
uint start_ix = offset_start >> 2;
for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
// Note: this interleaving is faster than using Tile_write
// by a significant amount.
write_mem(alloc_start.alloc, start_ix + i, 0);
memory[start_ix + i] = 0;
}
}
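
One more pattern worth calling out: tile_alloc makes a single malloc_stage call per workgroup (by the last thread), broadcasts the offset through shared memory, and has every thread bail out together when it comes back as MALLOC_FAILED; each thread then slices its own tile range out of the group allocation using the prefix-summed counts. A sequential Rust model of that slicing (the Tile_size value here is an assumption of the sketch):

const MALLOC_FAILED: u32 = 0;
const TILE_SIZE: u32 = 8; // bytes per Tile (segment offset + backdrop), assumed

fn assign_tile_offsets(offset_start: u32, inclusive_counts: &[u32]) -> Option<Vec<u32>> {
    if offset_start == MALLOC_FAILED {
        // The whole group returns; the STAGE_TILE_ALLOC error bit is already set.
        return None;
    }
    // Thread i's tiles begin after those of threads 0..i (exclusive prefix).
    let mut offsets = Vec::with_capacity(inclusive_counts.len());
    let mut prev = 0u32;
    for &count in inclusive_counts {
        offsets.push(offset_start + TILE_SIZE * prev);
        prev = count;
    }
    Some(offsets)
}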

View file

@ -16,13 +16,12 @@
//! Low-level scene encoding.
use crate::Blend;
use crate::{Blend, SceneStats, DRAWTAG_SIZE, TRANSFORM_SIZE};
use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::BufWrite;
use crate::stages::{
self, Config, PathEncoder, Transform, CLIP_PART_SIZE, DRAW_PART_SIZE, PATHSEG_PART_SIZE,
TRANSFORM_PART_SIZE,
self, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE,
};
pub struct Encoder {
@ -52,86 +51,19 @@ pub struct EncodedSceneRef<'a, T: Copy + Pod> {
}
impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
/// Return a config for the element processing pipeline.
///
/// This does not include further pipeline processing. Also returns the
/// beginning of free memory.
pub fn stage_config(&self) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj();
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
pub(crate) fn stats(&self) -> SceneStats {
SceneStats {
n_drawobj: self.drawtag_stream.len(),
drawdata_len: self.drawdata_stream.len(),
n_transform: self.transform_stream.len(),
linewidth_len: std::mem::size_of_val(self.linewidth_stream),
pathseg_len: self.pathseg_stream.len(),
n_pathtag: self.tag_stream.len(),
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_pathseg: self.n_pathseg,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
}
}
pub fn write_scene(&self, buf: &mut BufWrite) {
@ -148,34 +80,6 @@ impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
buf.extend_slice(&self.pathseg_stream);
}
/// The number of draw objects in the draw object stream.
pub(crate) fn n_drawobj(&self) -> usize {
self.drawtag_stream.len()
}
/// The number of paths.
pub(crate) fn n_path(&self) -> u32 {
self.n_path
}
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
}
}
/// A scene fragment encoding a glyph.
@ -191,15 +95,6 @@ pub struct GlyphEncoder {
n_pathseg: u32,
}
const TRANSFORM_SIZE: usize = 24;
const LINEWIDTH_SIZE: usize = 4;
const PATHSEG_SIZE: usize = 52;
const PATH_BBOX_SIZE: usize = 24;
const DRAWMONOID_SIZE: usize = 16;
const DRAW_BBOX_SIZE: usize = 16;
const DRAWTAG_SIZE: usize = 4;
const ANNOTATED_SIZE: usize = 40;
// Tags for draw objects. See shader/drawtag.h for the authoritative source.
const DRAWTAG_FILLCOLOR: u32 = 0x44;
const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
@ -343,88 +238,6 @@ impl Encoder {
self.n_clip += 1;
}
/// Return a config for the element processing pipeline.
///
/// This does not include further pipeline processing. Also returns the
/// beginning of free memory.
pub fn stage_config(&self) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj();
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
}
pub fn write_scene(&self, buf: &mut BufWrite) {
buf.extend_slice(&self.drawtag_stream);
let n_drawobj = self.drawtag_stream.len();
@ -440,32 +253,19 @@ impl Encoder {
buf.extend_slice(&self.pathseg_stream);
}
/// The number of draw objects in the draw object stream.
pub(crate) fn n_drawobj(&self) -> usize {
self.drawtag_stream.len()
}
pub(crate) fn stats(&self) -> SceneStats {
SceneStats {
n_drawobj: self.drawtag_stream.len(),
drawdata_len: self.drawdata_stream.len(),
n_transform: self.transform_stream.len(),
linewidth_len: std::mem::size_of_val(&*self.linewidth_stream),
n_pathtag: self.tag_stream.len(),
pathseg_len: self.pathseg_stream.len(),
/// The number of paths.
pub(crate) fn n_path(&self) -> u32 {
self.n_path
}
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
n_path: self.n_path,
n_pathseg: self.n_pathseg,
n_clip: self.n_clip,
}
}
pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
@ -478,11 +278,6 @@ impl Encoder {
}
}
fn align_up(x: usize, align: usize) -> usize {
debug_assert!(align.is_power_of_two());
(x + align - 1) & !(align - 1)
}
fn padding(x: usize, align: usize) -> usize {
x.wrapping_neg() & (align - 1)
}
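
With stage_config and the per-stream size constants gone from the encoder, sizing is centralized: the encoder reports SceneStats, and the renderer derives both the scene-buffer size and the memory layout from it, which is what lets the driver grow buffers before upload. A hedged Rust sketch of the scene-size computation, mirroring the stream order of write_scene above; the partition-size values are assumptions of the sketch, and the authoritative version is SceneStats::scene_size in lib.rs (its opening appears at the end of this page):

const DRAWTAG_SIZE: usize = 4;
const TRANSFORM_SIZE: usize = 24;
const DRAW_PART_SIZE: usize = 256; // assumed dispatch partition sizes
const TRANSFORM_PART_SIZE: usize = 256;
const PATHSEG_PART_SIZE: usize = 256;

struct SceneStats { // just the fields used here; see lib.rs below
    n_drawobj: usize,
    drawdata_len: usize,
    n_transform: usize,
    linewidth_len: usize,
    n_pathtag: usize,
    pathseg_len: usize,
}

fn scene_size(stats: &SceneStats) -> usize {
    align_up(stats.n_drawobj, DRAW_PART_SIZE) * DRAWTAG_SIZE // padded drawtags
        + stats.drawdata_len
        + align_up(stats.n_transform, TRANSFORM_PART_SIZE) * TRANSFORM_SIZE
        + stats.linewidth_len
        + align_up(stats.n_pathtag, PATHSEG_PART_SIZE) // one byte per path tag
        + stats.pathseg_len
}

fn align_up(x: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    (x + align - 1) & !(align - 1)
}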

View file

@ -4,17 +4,19 @@ pub mod glyph_render;
mod gradient;
mod pico_svg;
mod render_ctx;
mod render_driver;
pub mod stages;
pub mod test_scenes;
mod text;
use bytemuck::Pod;
use bytemuck::{Pod, Zeroable};
use std::convert::TryInto;
pub use blend::{Blend, BlendMode, CompositionMode};
pub use encoder::EncodedSceneRef;
pub use gradient::Colrv1RadialGradient;
pub use render_ctx::PietGpuRenderContext;
pub use render_driver::RenderDriver;
use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext};
@ -25,9 +27,12 @@ use piet_gpu_hal::{
};
pub use pico_svg::PicoSvg;
use stages::{ClipBinding, ElementBinding, ElementCode};
use stages::{
ClipBinding, ElementBinding, ElementCode, DRAW_PART_SIZE, PATHSEG_PART_SIZE,
TRANSFORM_PART_SIZE,
};
use crate::stages::{ClipCode, Config, ElementStage};
use crate::stages::{ClipCode, Config, ElementStage, CLIP_PART_SIZE};
const TILE_W: usize = 16;
const TILE_H: usize = 16;
@ -64,6 +69,31 @@ pub enum PixelFormat {
Rgba8,
}
#[repr(C)]
#[derive(Clone, Copy, Debug, Zeroable, Pod)]
pub(crate) struct MemoryHeader {
mem_offset: u32,
mem_error: u32,
blend_offset: u32,
}
/// The sizes of various objects in the encoded scene, needed for memory layout.
#[derive(Default)]
pub(crate) struct SceneStats {
// Slices of scene encoding, in order
pub n_drawobj: usize,
pub drawdata_len: usize,
pub n_transform: usize,
pub linewidth_len: usize,
pub pathseg_len: usize,
pub n_pathtag: usize,
// Additional stats needed for memory layout & dispatch
pub n_path: u32,
pub n_pathseg: u32,
pub n_clip: u32,
}
pub struct Renderer {
// These sizes are aligned to tile boundaries, though at some point
// we'll want to have a good strategy for dealing with odd sizes.
@ -72,18 +102,23 @@ pub struct Renderer {
pub image_dev: Image, // resulting image
// The reference is held by the pipelines. We will be changing
// this to make the scene upload dynamic.
// TODO: two changes needed here. First, if we're fencing on the coarse
// pipeline, then we only need one copy (this changes if we also bind the
// scene buffer in fine rasterization, which might be a good idea to reduce
// copying). Second, there should be a staging buffer for discrete cards.
scene_bufs: Vec<Buffer>,
memory_buf_host: Vec<Buffer>,
memory_buf_dev: Buffer,
memory_buf_readback: Buffer,
// Staging buffers
config_bufs: Vec<Buffer>,
// Device config buf
config_buf: Buffer,
blend_buf: Buffer,
// New element pipeline
element_code: ElementCode,
element_stage: ElementStage,
@ -111,6 +146,8 @@ pub struct Renderer {
k4_pipeline: Pipeline,
k4_ds: DescriptorSet,
scene_stats: SceneStats,
// TODO: the following stats are now redundant and can be removed.
n_transform: usize,
n_drawobj: usize,
n_paths: usize,
@ -142,7 +179,13 @@ impl RenderConfig {
impl Renderer {
/// The number of query pool entries needed to run the renderer.
pub const QUERY_POOL_SIZE: u32 = 12;
pub const QUERY_POOL_SIZE: u32 = Self::COARSE_QUERY_POOL_SIZE + Self::FINE_QUERY_POOL_SIZE;
/// The number of query pool entries needed to run the coarse pipeline.
pub const COARSE_QUERY_POOL_SIZE: u32 = 10;
/// The number of query pool entries needed to run the fine pipeline.
pub const FINE_QUERY_POOL_SIZE: u32 = 2;
pub unsafe fn new(
session: &Session,
@ -166,12 +209,18 @@ impl Renderer {
let width = width + (width.wrapping_neg() & (TILE_W - 1));
let height = height + (height.wrapping_neg() & (TILE_W - 1));
let dev = BufferUsage::STORAGE | BufferUsage::COPY_DST;
let host_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC;
let usage_blend = BufferUsage::STORAGE;
let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
let usage_readback = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
// This may be inadequate for very complex scenes (Paris etc.)
// TODO: separate staging buffer (if needed)
let scene_bufs = (0..n_bufs)
.map(|_| session.create_buffer(8 * 1024 * 1024, host_upload).unwrap())
.map(|_| {
session
.create_buffer(8 * 1024 * 1024, usage_upload)
.unwrap()
})
.collect::<Vec<_>>();
let image_format = match config.format {
@ -185,15 +234,22 @@ impl Renderer {
let config_bufs = (0..n_bufs)
.map(|_| {
session
.create_buffer(CONFIG_BUFFER_SIZE, host_upload)
.create_buffer(CONFIG_BUFFER_SIZE, usage_upload)
.unwrap()
})
.collect();
let memory_buf_host = (0..n_bufs)
.map(|_| session.create_buffer(2 * 4, host_upload).unwrap())
.map(|_| {
session
.create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_upload)
.unwrap()
})
.collect();
let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?;
let memory_buf_dev = session.create_buffer(16 * 1024 * 1024, usage_mem_dev)?;
let memory_buf_readback =
session.create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_readback)?;
let blend_buf = session.create_buffer(16 * 1024 * 1024, usage_blend)?;
let element_code = ElementCode::new(session);
let element_stage = ElementStage::new(session, &element_code);
@ -282,7 +338,7 @@ impl Renderer {
let gradient_bufs = (0..n_bufs)
.map(|_| {
session
.create_buffer(GRADIENT_BUF_SIZE as u64, host_upload)
.create_buffer(GRADIENT_BUF_SIZE as u64, usage_upload)
.unwrap()
})
.collect();
@ -297,6 +353,7 @@ impl Renderer {
&[
BindType::Buffer,
BindType::BufReadOnly,
BindType::Buffer,
BindType::Image,
BindType::ImageRead,
BindType::ImageRead,
@ -304,19 +361,22 @@ impl Renderer {
)?;
let k4_ds = session
.descriptor_set_builder()
.add_buffers(&[&memory_buf_dev, &config_buf])
.add_buffers(&[&memory_buf_dev, &config_buf, &blend_buf])
.add_images(&[&image_dev])
.add_textures(&[&bg_image, &gradients])
.build(&session, &k4_pipeline)?;
let scene_stats = Default::default();
Ok(Renderer {
width,
height,
scene_bufs,
memory_buf_host,
memory_buf_dev,
memory_buf_readback,
config_buf,
config_bufs,
blend_buf,
image_dev,
element_code,
element_stage,
@ -336,6 +396,7 @@ impl Renderer {
coarse_ds,
k4_pipeline,
k4_ds,
scene_stats,
n_transform: 0,
n_drawobj: 0,
n_paths: 0,
@ -358,43 +419,14 @@ impl Renderer {
render_ctx: &mut PietGpuRenderContext,
buf_ix: usize,
) -> Result<(), Error> {
let (mut config, mut alloc) = render_ctx.stage_config();
let n_drawobj = render_ctx.n_drawobj();
// TODO: be more consistent in size types
let n_path = render_ctx.n_path() as usize;
self.n_paths = n_path;
self.n_transform = render_ctx.n_transform();
self.n_drawobj = render_ctx.n_drawobj();
self.n_pathseg = render_ctx.n_pathseg() as usize;
self.n_pathtag = render_ctx.n_pathtag();
self.n_clip = render_ctx.n_clip();
self.scene_stats = render_ctx.stats();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = self.width / TILE_W;
let height_in_tiles = self.height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
config.width_in_tiles = width_in_tiles as u32;
config.height_in_tiles = height_in_tiles as u32;
config.tile_alloc = tile_base as u32;
config.bin_alloc = bin_base as u32;
config.ptcl_alloc = ptcl_base as u32;
unsafe {
// TODO: reallocate scene buffer if size is inadequate
self.upload_config(buf_ix)?;
{
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
render_ctx.write_scene(&mut mapped_scene);
}
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data.
let ramp_data = render_ctx.get_ramp_data();
@ -414,43 +446,14 @@ impl Renderer {
scene: &EncodedSceneRef<T>,
buf_ix: usize,
) -> Result<(), Error> {
let (mut config, mut alloc) = scene.stage_config();
let n_drawobj = scene.n_drawobj();
// TODO: be more consistent in size types
let n_path = scene.n_path() as usize;
self.n_paths = n_path;
self.n_transform = scene.n_transform();
self.n_drawobj = scene.n_drawobj();
self.n_pathseg = scene.n_pathseg() as usize;
self.n_pathtag = scene.n_pathtag();
self.n_clip = scene.n_clip();
self.scene_stats = scene.stats();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = self.width / TILE_W;
let height_in_tiles = self.height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
config.width_in_tiles = width_in_tiles as u32;
config.height_in_tiles = height_in_tiles as u32;
config.tile_alloc = tile_base as u32;
config.bin_alloc = bin_base as u32;
config.ptcl_alloc = ptcl_base as u32;
unsafe {
// TODO: reallocate scene buffer if size is inadequate
self.upload_config(buf_ix)?;
{
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
scene.write_scene(&mut mapped_scene);
}
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data.
if !scene.ramp_data.is_empty() {
@ -464,7 +467,41 @@ impl Renderer {
Ok(())
}
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) {
// Note: configuration has to be re-uploaded when memory buffer is resized
pub(crate) unsafe fn upload_config(&mut self, buf_ix: usize) -> Result<(), Error> {
let stats = &self.scene_stats;
let n_path = stats.n_path as usize;
self.n_paths = n_path;
self.n_transform = stats.n_transform;
self.n_drawobj = stats.n_drawobj;
self.n_pathseg = stats.n_pathseg as usize;
self.n_pathtag = stats.n_pathtag;
self.n_clip = stats.n_clip;
let (mut config, alloc) = stats.config(self.width, self.height);
config.mem_size = self.memory_buf_size() as u32;
self.config_bufs[buf_ix].write(&[config])?;
let mem_header = MemoryHeader {
mem_offset: alloc as u32,
mem_error: 0,
blend_offset: 0,
};
// Note: we could skip doing this on realloc, but probably not worth the bother
self.memory_buf_host[buf_ix].write(&[mem_header])?;
Ok(())
}
/// Get the size of memory for the allocations known in advance.
pub(crate) fn memory_size(&self, stats: &SceneStats) -> usize {
stats.config(self.width, self.height).1
}
/// Record the coarse part of a render pipeline.
pub unsafe fn record_coarse(
&self,
cmd_buf: &mut CmdBuf,
query_pool: &QueryPool,
buf_ix: usize,
) {
cmd_buf.copy_buffer(&self.config_bufs[buf_ix], &self.config_buf);
cmd_buf.copy_buffer(&self.memory_buf_host[buf_ix], &self.memory_buf_dev);
cmd_buf.memory_barrier();
@ -558,9 +595,21 @@ impl Renderer {
pass.end();
cmd_buf.end_debug_label();
cmd_buf.memory_barrier();
}
pub unsafe fn record_fine(
&self,
cmd_buf: &mut CmdBuf,
query_pool: &QueryPool,
query_start: u32,
) {
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.begin_debug_label("Fine raster");
let mut pass =
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(
&query_pool,
query_start,
query_start + 1,
));
pass.dispatch(
&self.k4_pipeline,
&self.k4_ds,
@ -577,6 +626,19 @@ impl Renderer {
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
pub unsafe fn record_readback(&self, cmd_buf: &mut CmdBuf) {
cmd_buf.copy_buffer(&self.memory_buf_dev, &self.memory_buf_readback);
cmd_buf.memory_barrier();
}
/// Record a render pipeline.
///
/// This *assumes* the buffers are adequately sized.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) {
self.record_coarse(cmd_buf, query_pool, buf_ix);
self.record_fine(cmd_buf, query_pool, 10);
}
pub fn make_image(
session: &Session,
width: usize,
@ -636,4 +698,210 @@ impl Renderer {
.unwrap()
}
}
pub(crate) unsafe fn realloc_scene_if_needed(
&mut self,
session: &Session,
new_size: u64,
buf_ix: usize,
) -> Result<(), Error> {
if new_size <= self.scene_bufs[buf_ix].size() {
return Ok(());
}
const ALIGN: u64 = 0x10000;
let new_size = (new_size + ALIGN - 1) & ALIGN.wrapping_neg();
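// e.g. a request of 100_000 bytes rounds up to 0x20000 (131_072); the mask
// trick works because ALIGN is a power of two, so ALIGN.wrapping_neg() is
// !(ALIGN - 1).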
println!(
"reallocating scene buf[{}] {} -> {}",
buf_ix,
self.scene_bufs[buf_ix].size(),
new_size
);
let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
let scene_buf = session.create_buffer(new_size, usage_upload)?;
self.element_bindings[buf_ix].rebind_scene(session, &scene_buf);
session.update_buffer_descriptor(&mut self.tile_ds[buf_ix], 2, &scene_buf);
session.update_buffer_descriptor(&mut self.coarse_ds[buf_ix], 2, &scene_buf);
self.scene_bufs[buf_ix] = scene_buf;
Ok(())
}
/// Get the size of the memory buffer.
///
/// This is the usable size (not including the header).
pub(crate) fn memory_buf_size(&self) -> u64 {
self.memory_buf_dev.size() - std::mem::size_of::<MemoryHeader>() as u64
}
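// e.g. with the three-u32 header sketched above, a 16 MiB device buffer
// reports 16 MiB minus 12 bytes of usable memory.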
pub(crate) unsafe fn realloc_memory(
&mut self,
session: &Session,
new_size: u64,
) -> Result<(), Error> {
println!(
"reallocating memory buf {} -> {}",
self.memory_buf_dev.size(),
new_size
);
let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC;
let memory_buf_dev = session.create_buffer(new_size, usage_mem_dev)?;
for element_binding in &mut self.element_bindings {
element_binding.rebind_memory(session, &memory_buf_dev);
}
self.clip_binding.rebind_memory(session, &memory_buf_dev);
for tile_ds in &mut self.tile_ds {
session.update_buffer_descriptor(tile_ds, 0, &memory_buf_dev);
}
session.update_buffer_descriptor(&mut self.path_ds, 0, &memory_buf_dev);
session.update_buffer_descriptor(&mut self.backdrop_ds, 0, &memory_buf_dev);
session.update_buffer_descriptor(&mut self.bin_ds, 0, &memory_buf_dev);
for coarse_ds in &mut self.coarse_ds {
session.update_buffer_descriptor(coarse_ds, 0, &memory_buf_dev);
}
session.update_buffer_descriptor(&mut self.k4_ds, 0, &memory_buf_dev);
self.memory_buf_dev = memory_buf_dev;
Ok(())
}
pub(crate) fn blend_size(&self) -> u64 {
self.blend_buf.size()
}
pub(crate) unsafe fn realloc_blend(
&mut self,
session: &Session,
new_size: u64,
) -> Result<(), Error> {
println!(
"reallocating blend buf {} -> {}",
self.blend_size(),
new_size
);
let usage_blend = BufferUsage::STORAGE;
let blend_buf = session.create_buffer(new_size, usage_blend)?;
session.update_buffer_descriptor(&mut self.k4_ds, 2, &blend_buf);
self.blend_buf = blend_buf;
Ok(())
}
}
const TRANSFORM_SIZE: usize = 24;
const PATHSEG_SIZE: usize = 52;
const PATH_BBOX_SIZE: usize = 24;
const DRAWMONOID_SIZE: usize = 16;
const DRAW_BBOX_SIZE: usize = 16;
const DRAWTAG_SIZE: usize = 4;
const ANNOTATED_SIZE: usize = 40;
impl SceneStats {
pub(crate) fn scene_size(&self) -> usize {
align_up(self.n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE
+ self.drawdata_len
+ align_up(self.n_transform, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE
+ self.linewidth_len
+ align_up(self.n_pathtag, PATHSEG_PART_SIZE as usize)
+ self.pathseg_len
}
/// Return a config for a scene with these stats.
///
/// Also returns the beginning of free (dynamic) memory.
fn config(&self, width: usize, height: usize) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj;
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_len;
let n_trans = self.n_transform;
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let pathtag_offset = linewidth_offset + self.linewidth_len;
let n_pathtag = self.n_pathtag;
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
// These constants depend on encoding and may need to be updated.
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = width / TILE_W;
let height_in_tiles = height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
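// A worked example of this dynamic tail, assuming TILE_W = TILE_H = 16 and
// PTCL_INITIAL_ALLOC = 1024 (illustrative values, not confirmed here): a
// 2048 x 1536 target is 128 x 96 tiles, so the initial ptcl allocation is
// 128 * 96 * 1024 = 12 MiB.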
let config = Config {
mem_size: 0, // to be filled in later
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
width_in_tiles: width_in_tiles as u32,
height_in_tiles: height_in_tiles as u32,
tile_alloc: tile_base as u32,
bin_alloc: bin_base as u32,
ptcl_alloc: ptcl_base as u32,
};
(config, alloc)
}
}
fn align_up(x: usize, align: usize) -> usize {
debug_assert!(align.is_power_of_two());
(x + align - 1) & !(align - 1)
}

View file

@ -4,7 +4,7 @@ const DO_SRGB_CONVERSION: bool = false;
use std::borrow::Cow;
use crate::encoder::GlyphEncoder;
use crate::stages::{Config, Transform};
use crate::stages::Transform;
use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
use piet::{
Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@ -18,7 +18,7 @@ use piet_gpu_types::scene::Element;
use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache};
use crate::text::Font;
pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
use crate::Blend;
use crate::{Blend, SceneStats};
pub struct PietGpuImage;
@ -95,44 +95,15 @@ impl PietGpuRenderContext {
}
}
pub fn stage_config(&self) -> (Config, usize) {
self.new_encoder.stage_config()
}
/// Number of draw objects.
///
/// This is for the new element processing pipeline. It's not necessarily the
/// same as the number of paths (as in the old pipeline), but it might take a
/// while to sort that out.
pub fn n_drawobj(&self) -> usize {
self.new_encoder.n_drawobj()
}
/// Number of paths.
pub fn n_path(&self) -> u32 {
self.new_encoder.n_path()
}
pub fn n_pathseg(&self) -> u32 {
self.new_encoder.n_pathseg()
}
pub fn n_pathtag(&self) -> usize {
self.new_encoder.n_pathtag()
}
pub fn n_transform(&self) -> usize {
self.new_encoder.n_transform()
}
pub fn n_clip(&self) -> u32 {
self.new_encoder.n_clip()
pub(crate) fn stats(&self) -> SceneStats {
self.new_encoder.stats()
}
pub fn write_scene(&self, buf: &mut BufWrite) {
self.new_encoder.write_scene(buf);
}
// TODO: delete
pub fn get_scene_buf(&mut self) -> &[u8] {
const ALIGN: usize = 128;
let padded_size = (self.elements.len() + (ALIGN - 1)) & ALIGN.wrapping_neg();
@ -194,7 +165,6 @@ impl RenderContext for PietGpuRenderContext {
let rad = self.ramp_cache.add_radial_gradient(&rad);
Ok(PietGpuBrush::RadGradient(rad))
}
_ => todo!("don't do radial gradients yet"),
}
}

View file

@ -0,0 +1,332 @@
// Copyright 2022 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
use bytemuck::Pod;
use piet_gpu_hal::{CmdBuf, Error, Image, QueryPool, Semaphore, Session, SubmittedCmdBuf};
use crate::{EncodedSceneRef, MemoryHeader, PietGpuRenderContext, Renderer, SceneStats};
/// Additional logic for sequencing rendering operations, specifically
/// for handling failure and reallocation.
///
/// It may be that this shouldn't be a separate object from Renderer.
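///
/// A sketch of per-frame usage, with setup, error handling, and presentation
/// elided (the empty semaphore lists are illustrative, not prescriptive):
///
/// ```no_run
/// # use piet_gpu::{PietGpuRenderContext, RenderDriver};
/// # use piet_gpu_hal::{Error, Session};
/// # fn frame(session: &Session, driver: &mut RenderDriver) -> Result<(), Error> {
/// let mut ctx = PietGpuRenderContext::new();
/// // ... encode a scene into ctx ...
/// driver.upload_render_ctx(session, &mut ctx)?;
/// driver.run_coarse(session)?; // reallocates and retries until memory suffices
/// {
///     let target = driver.record_fine(session)?;
///     // ... record presentation blits onto target.cmd_buf ...
/// }
/// driver.submit(session, &[], &[])?;
/// driver.next_buffer();
/// # Ok(())
/// # }
/// ```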
pub struct RenderDriver {
frames: Vec<RenderFrame>,
renderer: Renderer,
buf_ix: usize,
/// The index of a pending fine rasterization submission.
pending: Option<usize>,
}
pub struct TargetState<'a> {
pub cmd_buf: &'a mut CmdBuf,
pub image: &'a Image,
}
#[derive(Default, Debug)]
pub struct TimingStats {
coarse: Vec<f64>,
fine: Vec<f64>,
}
struct RenderFrame {
cmd_buf: CmdBufState,
coarse_query_pool: QueryPool,
fine_query_pool: QueryPool,
timing_stats: TimingStats,
}
enum CmdBufState {
Start,
Submitted(SubmittedCmdBuf),
Ready(CmdBuf),
}
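// Lifecycle, in brief:
//   Start --cmd_buf()--> Ready --submit()--> Submitted --wait()--> Ready
// cmd_buf() also waits on a Submitted buffer before handing it back out.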
impl RenderDriver {
/// Create a new render driver.
///
/// Should probably be fallible.
///
/// We can get n from the renderer as well.
pub fn new(session: &Session, n: usize, renderer: Renderer) -> RenderDriver {
let frames = (0..n)
.map(|_| {
// Maybe we should allocate the command buffer here so it doesn't happen on the first frame?
let cmd_buf = CmdBufState::default();
let coarse_query_pool =
session.create_query_pool(Renderer::COARSE_QUERY_POOL_SIZE)?;
let fine_query_pool = session.create_query_pool(Renderer::FINE_QUERY_POOL_SIZE)?;
Ok(RenderFrame {
cmd_buf,
coarse_query_pool,
fine_query_pool,
timing_stats: TimingStats::default(),
})
})
.collect::<Result<_, Error>>()
.unwrap();
RenderDriver {
frames,
renderer,
buf_ix: 0,
pending: None,
}
}
pub fn upload_render_ctx(
&mut self,
session: &Session,
render_ctx: &mut PietGpuRenderContext,
) -> Result<(), Error> {
let stats = render_ctx.stats();
self.ensure_scene_buffers(session, &stats)?;
self.renderer.upload_render_ctx(render_ctx, self.buf_ix)
}
pub fn upload_scene<T: Copy + Pod>(
&mut self,
session: &Session,
scene: &EncodedSceneRef<T>,
) -> Result<(), Error> {
let stats = scene.stats();
self.ensure_scene_buffers(session, &stats)?;
self.renderer.upload_scene(scene, self.buf_ix)
}
fn ensure_scene_buffers(&mut self, session: &Session, stats: &SceneStats) -> Result<(), Error> {
let scene_size = stats.scene_size();
unsafe {
self.renderer
.realloc_scene_if_needed(session, scene_size as u64, self.buf_ix)?;
}
let memory_size = self.renderer.memory_size(&stats);
// TODO: better estimate of additional memory needed
// Note: if we were to cover the worst-case binning output, we could make the
// binning stage infallible and cut checking logic. It also may not be a bad
// estimate for the rest.
let estimated_needed = memory_size as u64 + (1 << 20);
if estimated_needed > self.renderer.memory_buf_size() {
if let Some(pending) = self.pending.take() {
// There might be a fine rasterization task still in flight
// that binds the memory buffer.
self.frames[pending].cmd_buf.wait();
}
unsafe {
self.renderer.realloc_memory(session, estimated_needed)?;
}
}
Ok(())
}
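// In concrete terms: a scene whose known-in-advance allocations come to,
// say, 3 MiB gets a 3 MiB + 1 MiB (the 1 << 20 above) budget before coarse
// rasterization is first attempted; run_coarse grows the buffer further on
// demand.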
/// Run one try of the coarse rendering pipeline.
pub(crate) fn try_run_coarse(&mut self, session: &Session) -> Result<MemoryHeader, Error> {
let frame = &mut self.frames[self.buf_ix];
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
unsafe {
cmd_buf.begin();
// TODO: probably want to return query results as well
self.renderer
.record_coarse(cmd_buf, &frame.coarse_query_pool, self.buf_ix);
self.renderer.record_readback(cmd_buf);
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
cmd_buf.finish_timestamps(&frame.coarse_query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();
frame.cmd_buf.submit(session, &[], &[])?;
frame.cmd_buf.wait();
frame.timing_stats.coarse = session.fetch_query_pool(&frame.coarse_query_pool)?;
let mut result = Vec::new();
// TODO: consider read method for single POD value
self.renderer.memory_buf_readback.read(&mut result)?;
Ok(result[0])
}
}
/// Run the coarse render pipeline, ensuring enough memory for intermediate buffers.
pub fn run_coarse(&mut self, session: &Session) -> Result<(), Error> {
loop {
let mem_header = self.try_run_coarse(session)?;
//println!("{:?}", mem_header);
if mem_header.mem_error == 0 {
let blend_needed = mem_header.blend_offset as u64;
if blend_needed > self.renderer.blend_size() {
unsafe {
self.renderer.realloc_blend(session, blend_needed)?;
}
}
return Ok(());
}
// Not enough memory, reallocate and retry.
// TODO: be smarter (multiplier for early stages)
let mem_size = mem_header.mem_offset + 4096;
// Safety rationalization: no command buffers containing the buffer are
// in flight.
unsafe {
self.renderer.realloc_memory(session, mem_size.into())?;
self.renderer.upload_config(self.buf_ix)?;
}
}
}
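// The retry protocol this loop relies on (summarizing the shader-side
// behavior in this PR): a failed allocation sets mem_error, but mem_offset
// still advances past the limit, so the header read back from a failed pass
// carries a usable lower bound on the memory actually required.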
/// Record the fine rasterizer, leaving the command buffer open.
pub fn record_fine(&mut self, session: &Session) -> Result<TargetState, Error> {
let frame = &mut self.frames[self.buf_ix];
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
unsafe {
cmd_buf.begin();
self.renderer
.record_fine(cmd_buf, &frame.fine_query_pool, 0);
}
let image = &self.renderer.image_dev;
Ok(TargetState { cmd_buf, image })
}
/// Submit the current command buffer.
pub fn submit(
&mut self,
session: &Session,
wait_semaphores: &[&Semaphore],
signal_semaphores: &[&Semaphore],
) -> Result<(), Error> {
let frame = &mut self.frames[self.buf_ix];
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
unsafe {
cmd_buf.finish_timestamps(&frame.fine_query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();
frame
.cmd_buf
.submit(session, wait_semaphores, signal_semaphores)?
}
self.pending = Some(self.buf_ix);
Ok(())
}
unsafe fn wait_frame(&mut self, session: &Session, buf_ix: usize) {
let frame = &mut self.frames[buf_ix];
frame.cmd_buf.wait();
if let Ok(stats) = session.fetch_query_pool(&frame.fine_query_pool) {
frame.timing_stats.fine = stats;
}
if self.pending == Some(buf_ix) {
self.pending = None;
}
}
pub unsafe fn wait(&mut self, session: &Session) {
self.wait_frame(session, self.buf_ix);
}
/// Move to the next buffer.
pub fn next_buffer(&mut self) {
self.buf_ix = (self.buf_ix + 1) % self.frames.len()
}
pub unsafe fn get_timing_stats(&mut self, session: &Session, buf_ix: usize) -> &TimingStats {
self.wait_frame(session, buf_ix);
&self.frames[buf_ix].timing_stats
}
pub fn wait_all(&mut self, session: &Session) {
for buf_ix in 0..self.frames.len() {
unsafe {
self.wait_frame(session, buf_ix);
}
}
}
}
impl Default for CmdBufState {
fn default() -> Self {
CmdBufState::Start
}
}
impl CmdBufState {
/// Get a command buffer suitable for recording.
///
/// If the command buffer has been submitted, wait for it to complete first.
fn cmd_buf(&mut self, session: &Session) -> Result<&mut CmdBuf, Error> {
if let CmdBufState::Ready(cmd_buf) = self {
return Ok(cmd_buf);
}
if let CmdBufState::Submitted(submitted) = std::mem::take(self) {
if let Ok(Some(cmd_buf)) = submitted.wait() {
*self = CmdBufState::Ready(cmd_buf);
}
}
if matches!(self, CmdBufState::Start) {
*self = CmdBufState::Ready(session.cmd_buf()?);
}
if let CmdBufState::Ready(cmd_buf) = self {
Ok(cmd_buf)
} else {
unreachable!()
}
}
unsafe fn submit(
&mut self,
session: &Session,
wait_semaphores: &[&Semaphore],
signal_semaphores: &[&Semaphore],
) -> Result<(), Error> {
if let CmdBufState::Ready(cmd_buf) = std::mem::take(self) {
let submitted = session.run_cmd_buf(cmd_buf, wait_semaphores, signal_semaphores)?;
*self = CmdBufState::Submitted(submitted);
Ok(())
} else {
Err("Tried to submit CmdBufState not in ready state".into())
}
}
fn wait(&mut self) {
if matches!(self, CmdBufState::Submitted(_)) {
if let CmdBufState::Submitted(submitted) = std::mem::take(self) {
if let Ok(Some(cmd_buf)) = submitted.wait() {
*self = CmdBufState::Ready(cmd_buf);
}
}
}
}
}
impl TimingStats {
pub fn print_summary(&self) {
let ts = &self.coarse;
println!("Element time: {:.3}ms", ts[0] * 1e3);
println!("Clip + bin + tile time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
println!("Coarse path time: {:.3}ms", (ts[4] - ts[2]) * 1e3);
println!("Backdrop time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
println!("Coarse raster kernel time: {:.3}ms", (ts[8] - ts[7]) * 1e3);
println!("Fine kernel time: {:.3}ms", self.fine[0] * 1e3);
}
pub fn short_summary(&self) -> String {
let ts = &self.coarse;
let el = ts[0] * 1e3;
let cl = (ts[2] - ts[1]) * 1e3;
let cp = (ts[4] - ts[3]) * 1e3;
let bd = (ts[6] - ts[5]) * 1e3;
let cr = (ts[8] - ts[7]) * 1e3;
let fr = self.fine[0] * 1e3;
let total = el + cl + cp + bd + cr + fr;
format!(
"{:.3}ms :: el:{:.3}ms|cl:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|cr:{:.3}ms|fr:{:.3}ms",
total, el, cl, cp, bd, cr, fr
)
}
}
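// Illustrative short_summary() output (values made up):
//   2.313ms :: el:0.412ms|cl:0.385ms|cp:0.541ms|bd:0.118ms|cr:0.402ms|fr:0.455ms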

View file

@ -37,6 +37,7 @@ pub use transform::{
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, Zeroable, Pod)]
pub struct Config {
pub mem_size: u32,
pub n_elements: u32, // paths
pub n_pathseg: u32,
pub width_in_tiles: u32,
@ -167,3 +168,17 @@ impl ElementStage {
.record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
}
}
impl ElementBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
self.transform_binding.rebind_memory(session, memory);
self.path_binding.rebind_memory(session, memory);
self.draw_binding.rebind_memory(session, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
self.transform_binding.rebind_scene(session, scene);
self.path_binding.rebind_scene(session, scene);
self.draw_binding.rebind_scene(session, scene);
}
}

View file

@ -93,4 +93,9 @@ impl ClipBinding {
pass.memory_barrier();
}
}
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
}
}

View file

@ -163,3 +163,15 @@ impl DrawStage {
);
}
}
impl DrawBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
session.update_buffer_descriptor(&mut self.leaf_ds, 2, scene);
}
}

View file

@ -200,6 +200,19 @@ impl PathStage {
}
}
impl PathBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.clear_ds, 0, memory);
session.update_buffer_descriptor(&mut self.path_ds, 0, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
session.update_buffer_descriptor(&mut self.path_ds, 2, scene);
}
}
pub struct PathEncoder<'a> {
tag_stream: &'a mut Vec<u8>,
// If we're never going to use the i16 encoding, it might be

View file

@ -166,6 +166,18 @@ impl TransformStage {
}
}
impl TransformBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
session.update_buffer_descriptor(&mut self.leaf_ds, 2, scene);
}
}
impl Transform {
pub const IDENTITY: Transform = Transform {
mat: [1.0, 0.0, 0.0, 1.0],