Implement robust dynamic memory

This is the core logic for robust dynamic memory. There are changes to both shaders and the driver logic.

On the shader side, failure information is more useful and fine-grained. In particular, the shaders now report which stage failed and how much memory would have been required for that stage to succeed.
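
A sketch of how the driver can consume that report (the MemoryHeader layout and STAGE_* bits mirror the definitions added later in this commit; the readback plumbing itself is omitted):

    // Stage bits, matching mem.h.
    const STAGE_BINNING: u32 = 1 << 0;
    const STAGE_TILE_ALLOC: u32 = 1 << 1;
    const STAGE_PATH_COARSE: u32 = 1 << 2;
    const STAGE_COARSE: u32 = 1 << 3;

    // Mirrors the header at the start of the memory buffer.
    #[repr(C)]
    #[derive(Clone, Copy, Debug)]
    struct MemoryHeader {
        mem_offset: u32,   // next free offset in bytes; high-water mark after a failed run
        mem_error: u32,    // bitmask of STAGE_* bits whose allocation failed
        blend_offset: u32, // bytes of blend scratch requested by coarse
    }

    // Returns the buffer size (in bytes) needed for a retry, or None on success.
    fn realloc_target(header: &MemoryHeader) -> Option<u32> {
        if header.mem_error == 0 {
            None
        } else {
            // mem_offset keeps advancing even when an allocation fails, so it is
            // a lower bound on the size needed for the failing stages to succeed.
            Some(header.mem_offset)
        }
    }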

On the driver side, there is a new RenderDriver abstraction which owns command buffers (and their associated query pools) and runs the logic to retry and reallocate buffers when necessary. There is also a fairly significant rework of the logic that produces the config block, as that overlaps with the robust-memory work.
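
Roughly, the retry loop has this shape (record_coarse, record_fine, and upload_config appear later in this diff; the *_assumed helpers stand in for command-buffer and readback plumbing in render_driver.rs and are not the actual API):

    // Run coarse rasterization, growing the memory buffer until it succeeds,
    // then run fine rasterization once.
    unsafe fn render_with_retry(
        renderer: &mut Renderer,
        query_pool: &QueryPool,
        buf_ix: usize,
    ) -> Result<(), Error> {
        loop {
            let mut cmd_buf = new_cmd_buf_assumed()?;
            renderer.record_coarse(&mut cmd_buf, query_pool, buf_ix);
            submit_and_wait_assumed(cmd_buf)?;
            // Maps memory_buf_readback and returns the MemoryHeader.
            let header = read_memory_header_assumed(renderer)?;
            if header.mem_error == 0 {
                break;
            }
            // Sizing heuristic; the commit notes this could be smarter.
            let new_size = u64::from(header.mem_offset).next_power_of_two();
            realloc_memory_assumed(renderer, new_size)?;
            // The config carries mem_size, so it must be re-uploaded after a resize.
            renderer.upload_config(buf_ix)?;
        }
        let mut cmd_buf = new_cmd_buf_assumed()?;
        renderer.record_fine(&mut cmd_buf, query_pool, /* query_start */ 10);
        submit_and_wait_assumed(cmd_buf)?;
        Ok(())
    }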

The RenderDriver abstraction may not stay. It was done this way to minimize code disruption, but arguably it should just be combined with Renderer.

Another change: the GLSL length() method on a buffer requires additional infrastructure (at least on Metal, where it needs a binding of its own), so the memory buffer's size is now passed in as a field in the config instead.
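
Concretely, the Config struct now begins with that size (Rust-side sketch showing only the leading fields, names per setup.h; the full struct lives in the stages module):

    #[repr(C)]
    #[derive(Clone, Copy, Default)]
    struct Config {
        mem_size: u32,       // in bytes; replaces memory.length() * 4 in the shaders
        n_elements: u32,     // paths
        n_pathseg: u32,
        width_in_tiles: u32,
        // ... remaining fields unchanged ...
    }

The driver fills it from the size of the buffer it actually allocated (see config.mem_size in upload_config later in this diff), which is also what makes the retry-and-reallocate path work without the shader querying the buffer.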

This also moves blend memory to its own buffer. This worked out well because coarse rasterization simply reports how much blend memory is needed, and the blend buffer can be reallocated without rerunning the pipeline. Previously, blend allocations and ptcl writes were interleaved in coarse rasterization, so a failure of the former required rerunning coarse. This should fix #83 (finally!).
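
Driver-side, that means blend capacity can be checked (and the blend buffer grown) between coarse and fine, using the blend_offset reported in the memory header. A sketch, with placeholder names for the pieces not shown here:

    // Coarse only counts blend scratch (an atomicAdd on blend_offset) and writes
    // the resulting offsets into the ptcl, so growing the blend buffer does not
    // invalidate anything coarse produced.
    fn blend_realloc_target(header: &MemoryHeader, blend_buf_size: u64) -> Option<u64> {
        let needed = u64::from(header.blend_offset);
        if needed > blend_buf_size {
            // Caller reallocates the blend buffer and rebuilds the fine-raster
            // descriptor set (it is bound at binding 2 in kernel4), then records
            // fine rasterization as usual; coarse does not need to be re-run.
            Some(needed.next_power_of_two())
        } else {
            None
        }
    }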

There are a few loose ends. The binaries haven't been updated yet (I've been testing with a hand-written test program). Gradients weren't touched, so they still have a fixed-size allocation. And the logic to calculate the new buffer size on allocation failure could be smarter.

Closes #175
Raph Levien 2022-06-23 08:48:26 -07:00
parent 64e6268059
commit 240f44a228
18 changed files with 865 additions and 514 deletions


@ -45,12 +45,15 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG]; shared uint sh_row_width[BACKDROP_WG];
void main() { void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
uint th_ix = gl_LocalInvocationIndex; uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x;
// Work assignment: 1 thread : 1 path element // Work assignment: 1 thread : 1 path element
uint row_count = 0; uint row_count = 0;
bool mem_ok = mem_error == NO_ERROR;
if (gl_LocalInvocationID.y == 0) { if (gl_LocalInvocationID.y == 0) {
if (element_ix < conf.n_elements) { if (element_ix < conf.n_elements) {
// Possible TODO: it's not necessary to process backdrops of stroked paths. // Possible TODO: it's not necessary to process backdrops of stroked paths.
@ -68,7 +71,7 @@ void main() {
row_count = 0; row_count = 0;
} }
Alloc path_alloc = new_alloc( Alloc path_alloc = new_alloc(
path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
sh_row_alloc[th_ix] = path_alloc; sh_row_alloc[th_ix] = path_alloc;
} }
sh_row_count[th_ix] = row_count; sh_row_count[th_ix] = row_count;
@ -98,7 +101,7 @@ void main() {
} }
} }
uint width = sh_row_width[el_ix]; uint width = sh_row_width[el_ix];
if (width > 0 && mem_ok) { if (width > 0) {
// Process one row sequentially // Process one row sequentially
// Read backdrop value per tile and prefix sum it // Read backdrop value per tile and prefix sum it
Alloc tiles_alloc = sh_row_alloc[el_ix]; Alloc tiles_alloc = sh_row_alloc[el_ix];


@ -32,8 +32,7 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps) // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE]; shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE]; shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE]; shared uint sh_chunk_offset[N_TILE];
shared bool sh_alloc_failed;
DrawMonoid load_draw_monoid(uint element_ix) { DrawMonoid load_draw_monoid(uint element_ix) {
uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix; uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
@ -84,10 +83,6 @@ void main() {
for (uint i = 0; i < N_SLICE; i++) { for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0; bitmaps[i][gl_LocalInvocationID.x] = 0;
} }
if (gl_LocalInvocationID.x == 0) {
sh_alloc_failed = false;
}
barrier();
// Read inputs and determine coverage of bins // Read inputs and determine coverage of bins
uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
@ -148,26 +143,18 @@ void main() {
count[i][gl_LocalInvocationID.x] = element_count; count[i][gl_LocalInvocationID.x] = element_count;
} }
// element_count is number of elements covering bin for this invocation. // element_count is number of elements covering bin for this invocation.
Alloc chunk_alloc = new_alloc(0, 0, true); uint chunk_offset = 0;
if (element_count != 0) { if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest) chunk_offset = malloc_stage(element_count * BinInstance_size, conf.mem_size, STAGE_BINNING);
MallocResult chunk = malloc(element_count * BinInstance_size); sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed) {
sh_alloc_failed = true;
}
} }
// Note: it might be more efficient for reading to do this in the // Note: it might be more efficient for reading to do this in the
// other order (each bin is a contiguous sequence of partitions) // other order (each bin is a contiguous sequence of partitions)
uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
write_mem(conf.bin_alloc, out_ix, element_count); write_mem(conf.bin_alloc, out_ix, element_count);
write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset); write_mem(conf.bin_alloc, out_ix + 1, chunk_offset);
barrier(); barrier();
if (sh_alloc_failed || mem_error != NO_ERROR) {
return;
}
// Use similar strategy as Laine & Karras paper; loop over bbox of bins // Use similar strategy as Laine & Karras paper; loop over bbox of bins
// touched by this element // touched by this element
@ -181,9 +168,10 @@ void main() {
if (my_slice > 0) { if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix]; idx += count[my_slice - 1][bin_ix];
} }
Alloc out_alloc = sh_chunk_alloc[bin_ix]; uint chunk_offset = sh_chunk_offset[bin_ix];
uint out_offset = out_alloc.offset + idx * BinInstance_size; if (chunk_offset != MALLOC_FAILED) {
BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix)); memory[(chunk_offset >> 2) + idx] = element_ix;
}
} }
x++; x++;
if (x == x1) { if (x == x1) {


@ -72,49 +72,62 @@ void write_tile_alloc(uint el_ix, Alloc a) {
Alloc read_tile_alloc(uint el_ix, bool mem_ok) { Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
// All memory. // All memory.
return new_alloc(0, memory.length() * 4, mem_ok); return new_alloc(0, conf.mem_size, mem_ok);
} }
#endif #endif
// The maximum number of commands per annotated element. // The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2 #define ANNO_COMMANDS 2
// Perhaps cmd_alloc should be a global? This is a style question. // All writes to the output must be gated by mem_ok.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) { bool mem_ok = true;
// Perhaps cmd allocations should be a global? This is a style question.
void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset < cmd_limit) { if (cmd_ref.offset < cmd_limit) {
return true; return;
} }
MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC); uint new_cmd = malloc_stage(PTCL_INITIAL_ALLOC, conf.mem_size, STAGE_COARSE);
if (new_cmd.failed) { if (new_cmd == MALLOC_FAILED) {
return false; mem_ok = false;
} }
CmdJump jump = CmdJump(new_cmd.alloc.offset); if (mem_ok) {
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_alloc, cmd_ref, jump); Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
cmd_alloc = new_cmd.alloc; }
cmd_ref = CmdRef(cmd_alloc.offset); cmd_alloc = new_alloc(new_cmd, PTCL_INITIAL_ALLOC, true);
cmd_ref = CmdRef(new_cmd);
// Reserve space for the maximum number of commands and a potential jump. // Reserve space for the maximum number of commands and a potential jump.
cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size; cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
return true;
} }
void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) { void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
if (linewidth < 0.0) { if (linewidth < 0.0) {
if (tile.tile.offset != 0) { if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop); CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
if (mem_ok) {
Cmd_Fill_write(alloc, cmd_ref, cmd_fill); Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
}
cmd_ref.offset += 4 + CmdFill_size; cmd_ref.offset += 4 + CmdFill_size;
} else { } else {
if (mem_ok) {
Cmd_Solid_write(alloc, cmd_ref); Cmd_Solid_write(alloc, cmd_ref);
}
cmd_ref.offset += 4; cmd_ref.offset += 4;
} }
} else { } else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth); CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
if (mem_ok) {
Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke); Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
}
cmd_ref.offset += 4 + CmdStroke_size; cmd_ref.offset += 4 + CmdStroke_size;
} }
} }
void main() { void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
// Could use either linear or 2d layouts for both dispatch and // Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract. // invocations within the workgroup. We'll use variables to abstract.
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X; uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
@ -161,7 +174,6 @@ void main() {
uint drawtag_start = conf.drawtag_offset >> 2; uint drawtag_start = conf.drawtag_offset >> 2;
uint drawdata_start = conf.drawdata_offset >> 2; uint drawdata_start = conf.drawdata_offset >> 2;
uint drawinfo_start = conf.drawinfo_alloc.offset >> 2; uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
bool mem_ok = mem_error == NO_ERROR;
while (true) { while (true) {
for (uint i = 0; i < N_SLICE; i++) { for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0; sh_bitmaps[i][th_ix] = 0;
@ -176,7 +188,7 @@ void main() {
uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = read_mem(conf.bin_alloc, in_ix); count = read_mem(conf.bin_alloc, in_ix);
uint offset = read_mem(conf.bin_alloc, in_ix + 1); uint offset = read_mem(conf.bin_alloc, in_ix + 1);
sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, mem_ok); sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, true);
} }
// prefix sum of counts // prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) { for (uint i = 0; i < LG_N_PART_READ; i++) {
@ -200,7 +212,7 @@ void main() {
} }
// use binary search to find element to read // use binary search to find element to read
uint ix = rd_ix + th_ix; uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix && mem_ok) { if (ix >= wr_ix && ix < ready_ix) {
uint part_ix = 0; uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) { for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + (uint(N_PART_READ / 2) >> i); uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
@ -257,7 +269,7 @@ void main() {
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size; uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base; sh_tile_base[th_ix] = base;
Alloc path_alloc = new_alloc(path.tiles.offset, Alloc path_alloc = new_alloc(path.tiles.offset,
(path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
write_tile_alloc(th_ix, path_alloc); write_tile_alloc(th_ix, path_alloc);
break; break;
default: default:
@ -293,8 +305,7 @@ void main() {
uint x = sh_tile_x0[el_ix] + seq_ix % width; uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width; uint y = sh_tile_y0[el_ix] + seq_ix / width;
bool include_tile = false; bool include_tile = false;
if (mem_ok) { Tile tile = Tile_read(read_tile_alloc(el_ix, true),
Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
bool is_clip = (tag & 1) != 0; bool is_clip = (tag & 1) != 0;
// Always include the tile if it contains a path segment. // Always include the tile if it contains a path segment.
@ -313,7 +324,6 @@ void main() {
} }
include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
|| is_blend; || is_blend;
}
if (include_tile) { if (include_tile) {
uint el_slice = el_ix / 32; uint el_slice = el_ix / 32;
uint el_mask = 1u << (el_ix & 31); uint el_mask = 1u << (el_ix & 31);
@ -327,7 +337,7 @@ void main() {
// through the draw objects. // through the draw objects.
uint slice_ix = 0; uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix]; uint bitmap = sh_bitmaps[0][th_ix];
while (mem_ok) { while (true) {
if (bitmap == 0) { if (bitmap == 0) {
slice_ix++; slice_ix++;
if (slice_ix == N_SLICE) { if (slice_ix == N_SLICE) {
@ -347,7 +357,7 @@ void main() {
uint drawtag = scene[drawtag_start + element_ix]; uint drawtag = scene[drawtag_start + element_ix];
if (clip_zero_depth == 0) { if (clip_zero_depth == 0) {
Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), Tile tile = Tile_read(read_tile_alloc(element_ref_ix, true),
TileRef(sh_tile_base[element_ref_ix] + TileRef(sh_tile_base[element_ref_ix] +
(sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
uint drawmonoid_base = drawmonoid_start + 4 * element_ix; uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@ -358,18 +368,16 @@ void main() {
switch (drawtag) { switch (drawtag) {
case Drawtag_FillColor: case Drawtag_FillColor:
float linewidth = uintBitsToFloat(memory[di]); float linewidth = uintBitsToFloat(memory[di]);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break;
}
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint rgba = scene[dd]; uint rgba = scene[dd];
if (mem_ok) {
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba)); Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
}
cmd_ref.offset += 4 + CmdColor_size; cmd_ref.offset += 4 + CmdColor_size;
break; break;
case Drawtag_FillLinGradient: case Drawtag_FillLinGradient:
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break;
}
linewidth = uintBitsToFloat(memory[di]); linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdLinGrad cmd_lin; CmdLinGrad cmd_lin;
@ -377,13 +385,13 @@ void main() {
cmd_lin.line_x = uintBitsToFloat(memory[di + 1]); cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
cmd_lin.line_y = uintBitsToFloat(memory[di + 2]); cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
cmd_lin.line_c = uintBitsToFloat(memory[di + 3]); cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
if (mem_ok) {
Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin); Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
}
cmd_ref.offset += 4 + CmdLinGrad_size; cmd_ref.offset += 4 + CmdLinGrad_size;
break; break;
case Drawtag_FillRadGradient: case Drawtag_FillRadGradient:
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break;
}
linewidth = uintBitsToFloat(memory[di]); linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdRadGrad cmd_rad; CmdRadGrad cmd_rad;
@ -396,29 +404,31 @@ void main() {
cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8])); cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
cmd_rad.ra = uintBitsToFloat(memory[di + 9]); cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
cmd_rad.roff = uintBitsToFloat(memory[di + 10]); cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
if (mem_ok) {
Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad); Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
}
cmd_ref.offset += 4 + CmdRadGrad_size; cmd_ref.offset += 4 + CmdRadGrad_size;
break; break;
case Drawtag_FillImage: case Drawtag_FillImage:
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]); linewidth = uintBitsToFloat(memory[di]);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint index = scene[dd]; uint index = scene[dd];
uint raw1 = scene[dd + 1]; uint raw1 = scene[dd + 1];
ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16); ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
if (mem_ok) {
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset)); Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
}
cmd_ref.offset += 4 + CmdImage_size; cmd_ref.offset += 4 + CmdImage_size;
break; break;
case Drawtag_BeginClip: case Drawtag_BeginClip:
if (tile.tile.offset == 0 && tile.backdrop == 0) { if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1; clip_zero_depth = clip_depth + 1;
} else { } else {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break; if (mem_ok) {
}
Cmd_BeginClip_write(cmd_alloc, cmd_ref); Cmd_BeginClip_write(cmd_alloc, cmd_ref);
}
cmd_ref.offset += 4; cmd_ref.offset += 4;
render_blend_depth++; render_blend_depth++;
max_blend_depth = max(max_blend_depth, render_blend_depth); max_blend_depth = max(max_blend_depth, render_blend_depth);
@ -427,12 +437,11 @@ void main() {
break; break;
case Drawtag_EndClip: case Drawtag_EndClip:
clip_depth--; clip_depth--;
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tile, -1.0); write_fill(cmd_alloc, cmd_ref, tile, -1.0);
uint blend = scene[dd]; uint blend = scene[dd];
if (mem_ok) {
Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend)); Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
}
cmd_ref.offset += 4 + CmdEndClip_size; cmd_ref.offset += 4 + CmdEndClip_size;
render_blend_depth--; render_blend_depth--;
break; break;
@ -459,11 +468,13 @@ void main() {
break; break;
} }
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
if (mem_ok) {
Cmd_End_write(cmd_alloc, cmd_ref); Cmd_End_write(cmd_alloc, cmd_ref);
}
if (max_blend_depth > BLEND_STACK_SPLIT) { if (max_blend_depth > BLEND_STACK_SPLIT) {
uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4; uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
MallocResult scratch = malloc(scratch_size); uint scratch = atomicAdd(blend_offset, scratch_size);
alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc); write_mem(scratch_alloc, scratch_alloc.offset >> 2, scratch);
} }
} }
} }

BIN piet-gpu/shader/image.png (new file, 337 KiB; binary content not shown)


@ -14,6 +14,7 @@
// higher quality antialiasing among other things). // higher quality antialiasing among other things).
#define DO_SRGB_CONVERSION 0 #define DO_SRGB_CONVERSION 0
// TODO: the binding of the main buffer can be readonly
#include "mem.h" #include "mem.h"
#include "setup.h" #include "setup.h"
@ -24,19 +25,23 @@
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y) #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in; layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
layout(set = 0, binding = 1) restrict readonly buffer ConfigBuf { layout(binding = 1) restrict readonly buffer ConfigBuf {
Config conf; Config conf;
}; };
layout(binding = 2) buffer BlendBuf {
uint blend_mem[];
};
#ifdef GRAY #ifdef GRAY
layout(r8, set = 0, binding = 2) uniform restrict writeonly image2D image; layout(r8, binding = 3) uniform restrict writeonly image2D image;
#else #else
layout(rgba8, set = 0, binding = 2) uniform restrict writeonly image2D image; layout(rgba8, binding = 3) uniform restrict writeonly image2D image;
#endif #endif
layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D image_atlas; layout(rgba8, binding = 4) uniform restrict readonly image2D image_atlas;
layout(rgba8, set = 0, binding = 4) uniform restrict readonly image2D gradients; layout(rgba8, binding = 5) uniform restrict readonly image2D gradients;
#include "ptcl.h" #include "ptcl.h"
#include "tile.h" #include "tile.h"
@ -114,8 +119,9 @@ void main() {
mediump float area[CHUNK]; mediump float area[CHUNK];
uint clip_depth = 0; uint clip_depth = 0;
bool mem_ok = mem_error == NO_ERROR; // Previously we would early-out if there was a memory failure, so we wouldn't try to read corrupt
while (mem_ok) { // tiles. But now we assume this is checked CPU-side before launching fine rasterization.
while (true) {
uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag; uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
if (tag == Cmd_End) { if (tag == Cmd_End) {
break; break;
@ -129,7 +135,7 @@ void main() {
df[k] = 1e9; df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do { do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref); TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
vec2 line_vec = seg.vector; vec2 line_vec = seg.vector;
for (uint k = 0; k < CHUNK; k++) { for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin; vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
@ -151,7 +157,7 @@ void main() {
tile_seg_ref = TileSegRef(fill.tile_ref); tile_seg_ref = TileSegRef(fill.tile_ref);
// Calculate coverage based on backdrop + coverage of each line segment // Calculate coverage based on backdrop + coverage of each line segment
do { do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref); TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) { for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k)); vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy; vec2 start = seg.origin - my_xy;
@ -248,7 +254,7 @@ void main() {
uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX + uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y); CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
for (uint k = 0; k < CHUNK; k++) { for (uint k = 0; k < CHUNK; k++) {
memory[base_ix + k] = packsRGB(vec4(rgba[k])); blend_mem[base_ix + k] = packsRGB(vec4(rgba[k]));
rgba[k] = vec4(0.0); rgba[k] = vec4(0.0);
} }
} }
@ -268,7 +274,7 @@ void main() {
if (clip_depth < BLEND_STACK_SPLIT) { if (clip_depth < BLEND_STACK_SPLIT) {
bg_rgba = blend_stack[clip_depth][k]; bg_rgba = blend_stack[clip_depth][k];
} else { } else {
bg_rgba = memory[base_ix + k]; bg_rgba = blend_mem[base_ix + k];
} }
mediump vec4 bg = unpacksRGB(bg_rgba); mediump vec4 bg = unpacksRGB(bg_rgba);
mediump vec4 fg = rgba[k] * area[k]; mediump vec4 fg = rgba[k] * area[k];


@ -3,27 +3,23 @@
layout(set = 0, binding = 0) buffer Memory { layout(set = 0, binding = 0) buffer Memory {
// offset into memory of the next allocation, initialized by the user. // offset into memory of the next allocation, initialized by the user.
uint mem_offset; uint mem_offset;
// mem_error tracks the status of memory accesses, initialized to NO_ERROR // mem_error is a bitmask of stages that have failed allocation.
// by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
// If MEM_DEBUG is defined the following errors are reported:
// - ERR_OUT_OF_BOUNDS is reported for out of bounds writes.
// - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words.
uint mem_error; uint mem_error;
// offset into blend memory of allocations for blend stack.
uint blend_offset;
uint[] memory; uint[] memory;
}; };
// Uncomment this line to add the size field to Alloc and enable memory checks. // Uncomment this line to add the size field to Alloc and enable memory checks.
// Note that the Config struct in setup.h grows size fields as well. // Note that the Config struct in setup.h grows size fields as well.
//#define MEM_DEBUG
#define NO_ERROR 0 // This setting is not working and the mechanism will be removed.
#define ERR_MALLOC_FAILED 1 //#define MEM_DEBUG
#define ERR_OUT_OF_BOUNDS 2
#define ERR_UNALIGNED_ACCESS 3
#ifdef MEM_DEBUG #ifdef MEM_DEBUG
#define Alloc_size 16 #define Alloc_size 16
#else #else
// TODO: this seems wrong
#define Alloc_size 8 #define Alloc_size 8
#endif #endif
@ -37,12 +33,6 @@ struct Alloc {
#endif #endif
}; };
struct MallocResult {
Alloc alloc;
// failed is true if the allocation overflowed memory.
bool failed;
};
// new_alloc synthesizes an Alloc from an offset and size. // new_alloc synthesizes an Alloc from an offset and size.
Alloc new_alloc(uint offset, uint size, bool mem_ok) { Alloc new_alloc(uint offset, uint size, bool mem_ok) {
Alloc a; Alloc a;
@ -57,24 +47,32 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) {
return a; return a;
} }
// malloc allocates size bytes of memory. #define STAGE_BINNING (1u << 0)
MallocResult malloc(uint size) { #define STAGE_TILE_ALLOC (1u << 1)
MallocResult r; #define STAGE_PATH_COARSE (1u << 2)
#define STAGE_COARSE (1u << 3)
// Allocations in main memory will never be 0, and this might be slightly
// faster to test against than some other value.
#define MALLOC_FAILED 0
// Check that previous dependent stages have succeeded.
bool check_deps(uint dep_stage) {
// TODO: this should be an atomic relaxed load, but that involves
// bringing in "memory scope semantics"
return (atomicOr(mem_error, 0) & dep_stage) == 0;
}
// Allocate size bytes of memory, offset in bytes.
// Note: with a bit of rearrangement of header files, we could make the
// mem_size argument go away (it comes from the config binding).
uint malloc_stage(uint size, uint mem_size, uint stage) {
uint offset = atomicAdd(mem_offset, size); uint offset = atomicAdd(mem_offset, size);
r.failed = offset + size > memory.length() * 4; if (offset + size > mem_size) {
r.alloc = new_alloc(offset, size, !r.failed); atomicOr(mem_error, stage);
if (r.failed) { offset = MALLOC_FAILED;
atomicMax(mem_error, ERR_MALLOC_FAILED);
return r;
} }
#ifdef MEM_DEBUG return offset;
if ((size & 3) != 0) {
r.failed = true;
atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
return r;
}
#endif
return r;
} }
// touch_mem checks whether access to the memory word at offset is valid. // touch_mem checks whether access to the memory word at offset is valid.


@ -87,7 +87,13 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
return SubdivResult(val, a0, a2); return SubdivResult(val, a0, a2);
} }
// All writes to the output must be gated by mem_ok.
bool mem_ok = true;
void main() { void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
uint element_ix = gl_GlobalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x;
PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size); PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
@ -95,24 +101,10 @@ void main() {
if (element_ix < conf.n_pathseg) { if (element_ix < conf.n_pathseg) {
tag = PathSeg_tag(conf.pathseg_alloc, ref); tag = PathSeg_tag(conf.pathseg_alloc, ref);
} }
bool mem_ok = mem_error == NO_ERROR;
switch (tag.tag) { switch (tag.tag) {
case PathSeg_Cubic: case PathSeg_Cubic:
PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref); PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
// Affine transform is now applied in pathseg
/*
uint trans_ix = cubic.trans_ix;
if (trans_ix > 0) {
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
}
*/
vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3; vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
float err = err_v.x * err_v.x + err_v.y * err_v.y; float err = err_v.x * err_v.x + err_v.y * err_v.y;
// The number of quadratics. // The number of quadratics.
@ -140,7 +132,7 @@ void main() {
uint path_ix = cubic.path_ix; uint path_ix = cubic.path_ix;
Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
Alloc path_alloc = Alloc path_alloc =
new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
ivec4 bbox = ivec4(path.bbox); ivec4 bbox = ivec4(path.bbox);
vec2 p0 = cubic.p0; vec2 p0 = cubic.p0;
qp0 = cubic.p0; qp0 = cubic.p0;
@ -199,11 +191,12 @@ void main() {
// TODO: can be tighter, use c to bound width // TODO: can be tighter, use c to bound width
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add. // Consider using subgroups to aggregate atomic add.
MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size); uint malloc_size = n_tile_alloc * TileSeg_size;
if (tile_alloc.failed || !mem_ok) { uint tile_offset = malloc_stage(malloc_size, conf.mem_size, STAGE_PATH_COARSE);
return; if (tile_offset == MALLOC_FAILED) {
mem_ok = false;
} }
uint tile_offset = tile_alloc.alloc.offset; Alloc tile_alloc = new_alloc(tile_offset, malloc_size, true);
TileSeg tile_seg; TileSeg tile_seg;
@ -221,10 +214,8 @@ void main() {
int backdrop = p1.y < p0.y ? 1 : -1; int backdrop = p1.y < p0.y ? 1 : -1;
TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop)); TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
uint tile_el = tile_ref.offset >> 2; uint tile_el = tile_ref.offset >> 2;
if (touch_mem(path_alloc, tile_el + 1)) {
atomicAdd(memory[tile_el + 1], backdrop); atomicAdd(memory[tile_el + 1], backdrop);
} }
}
// next_xray is the xray for the next scanline; the line segment intersects // next_xray is the xray for the next scanline; the line segment intersects
// all tiles between xray and next_xray. // all tiles between xray and next_xray.
@ -247,9 +238,7 @@ void main() {
TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x)); TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
uint tile_el = tile_ref.offset >> 2; uint tile_el = tile_ref.offset >> 2;
uint old = 0; uint old = 0;
if (touch_mem(path_alloc, tile_el)) {
old = atomicExchange(memory[tile_el], tile_offset); old = atomicExchange(memory[tile_el], tile_offset);
}
tile_seg.origin = p0; tile_seg.origin = p0;
tile_seg.vector = p1 - p0; tile_seg.vector = p1 - p0;
float y_edge = 0.0; float y_edge = 0.0;
@ -276,7 +265,9 @@ void main() {
} }
tile_seg.y_edge = y_edge; tile_seg.y_edge = y_edge;
tile_seg.next.offset = old; tile_seg.next.offset = old;
TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg); if (mem_ok) {
TileSeg_write(tile_alloc, TileSegRef(tile_offset), tile_seg);
}
tile_offset += TileSeg_size; tile_offset += TileSeg_size;
} }
xc += b; xc += b;


@ -31,8 +31,9 @@
// to memory for the overflow. // to memory for the overflow.
#define BLEND_STACK_SPLIT 4 #define BLEND_STACK_SPLIT 4
#ifdef ERR_MALLOC_FAILED #ifdef MALLOC_FAILED
struct Config { struct Config {
uint mem_size; // in bytes
uint n_elements; // paths uint n_elements; // paths
uint n_pathseg; uint n_pathseg;
uint width_in_tiles; uint width_in_tiles;


@ -29,7 +29,7 @@ layout(binding = 2) readonly buffer SceneBuf {
#define SY (1.0 / float(TILE_HEIGHT_PX)) #define SY (1.0 / float(TILE_HEIGHT_PX))
shared uint sh_tile_count[TILE_ALLOC_WG]; shared uint sh_tile_count[TILE_ALLOC_WG];
shared MallocResult sh_tile_alloc; shared uint sh_tile_offset;
vec4 load_draw_bbox(uint draw_ix) { vec4 load_draw_bbox(uint draw_ix) {
uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix; uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
@ -42,6 +42,9 @@ vec4 load_draw_bbox(uint draw_ix) {
} }
void main() { void main() {
if (!check_deps(STAGE_BINNING)) {
return;
}
uint th_ix = gl_LocalInvocationID.x; uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x;
// At the moment, element_ix == path_ix. The clip-intersected bounding boxes // At the moment, element_ix == path_ix. The clip-intersected bounding boxes
@ -86,27 +89,24 @@ void main() {
sh_tile_count[th_ix] = total_tile_count; sh_tile_count[th_ix] = total_tile_count;
} }
if (th_ix == TILE_ALLOC_WG - 1) { if (th_ix == TILE_ALLOC_WG - 1) {
sh_tile_alloc = malloc(total_tile_count * Tile_size); sh_tile_offset = malloc_stage(total_tile_count * Tile_size, conf.mem_size, STAGE_TILE_ALLOC);
} }
barrier(); barrier();
MallocResult alloc_start = sh_tile_alloc; uint offset_start = sh_tile_offset;
if (alloc_start.failed || mem_error != NO_ERROR) { if (offset_start == MALLOC_FAILED) {
return; return;
} }
if (element_ix < conf.n_elements) { if (element_ix < conf.n_elements) {
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count); path.tiles = TileRef(offset_start + Tile_size * tile_subix);
path.tiles = TileRef(tiles_alloc.offset);
Path_write(conf.tile_alloc, path_ref, path); Path_write(conf.tile_alloc, path_ref, path);
} }
// Zero out allocated tiles efficiently // Zero out allocated tiles efficiently
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4); uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
uint start_ix = alloc_start.alloc.offset >> 2; uint start_ix = offset_start >> 2;
for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) { for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
// Note: this interleaving is faster than using Tile_write memory[start_ix + i] = 0;
// by a significant amount.
write_mem(alloc_start.alloc, start_ix + i, 0);
} }
} }


@ -16,13 +16,12 @@
//! Low-level scene encoding. //! Low-level scene encoding.
use crate::Blend; use crate::{Blend, SceneStats, DRAWTAG_SIZE, TRANSFORM_SIZE};
use bytemuck::{Pod, Zeroable}; use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::BufWrite; use piet_gpu_hal::BufWrite;
use crate::stages::{ use crate::stages::{
self, Config, PathEncoder, Transform, CLIP_PART_SIZE, DRAW_PART_SIZE, PATHSEG_PART_SIZE, self, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE,
TRANSFORM_PART_SIZE,
}; };
pub struct Encoder { pub struct Encoder {
@ -52,86 +51,19 @@ pub struct EncodedSceneRef<'a, T: Copy + Pod> {
} }
impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> { impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
/// Return a config for the element processing pipeline. pub(crate) fn stats(&self) -> SceneStats {
/// SceneStats {
/// This does not include further pipeline processing. Also returns the n_drawobj: self.drawtag_stream.len(),
/// beginning of free memory. drawdata_len: self.drawdata_stream.len(),
pub fn stage_config(&self) -> (Config, usize) { n_transform: self.transform_stream.len(),
// Layout of scene buffer linewidth_len: std::mem::size_of_val(self.linewidth_stream),
let drawtag_offset = 0; pathseg_len: self.pathseg_stream.len(),
let n_drawobj = self.n_drawobj(); n_pathtag: self.tag_stream.len(),
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path, n_path: self.n_path,
n_pathseg: self.n_pathseg,
n_clip: self.n_clip, n_clip: self.n_clip,
trans_offset: trans_offset as u32, }
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
} }
pub fn write_scene(&self, buf: &mut BufWrite) { pub fn write_scene(&self, buf: &mut BufWrite) {
@ -148,34 +80,6 @@ impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize)); buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
buf.extend_slice(&self.pathseg_stream); buf.extend_slice(&self.pathseg_stream);
} }
/// The number of draw objects in the draw object stream.
pub(crate) fn n_drawobj(&self) -> usize {
self.drawtag_stream.len()
}
/// The number of paths.
pub(crate) fn n_path(&self) -> u32 {
self.n_path
}
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
}
} }
/// A scene fragment encoding a glyph. /// A scene fragment encoding a glyph.
@ -191,15 +95,6 @@ pub struct GlyphEncoder {
n_pathseg: u32, n_pathseg: u32,
} }
const TRANSFORM_SIZE: usize = 24;
const LINEWIDTH_SIZE: usize = 4;
const PATHSEG_SIZE: usize = 52;
const PATH_BBOX_SIZE: usize = 24;
const DRAWMONOID_SIZE: usize = 16;
const DRAW_BBOX_SIZE: usize = 16;
const DRAWTAG_SIZE: usize = 4;
const ANNOTATED_SIZE: usize = 40;
// Tags for draw objects. See shader/drawtag.h for the authoritative source. // Tags for draw objects. See shader/drawtag.h for the authoritative source.
const DRAWTAG_FILLCOLOR: u32 = 0x44; const DRAWTAG_FILLCOLOR: u32 = 0x44;
const DRAWTAG_FILLLINGRADIENT: u32 = 0x114; const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
@ -343,88 +238,6 @@ impl Encoder {
self.n_clip += 1; self.n_clip += 1;
} }
/// Return a config for the element processing pipeline.
///
/// This does not include further pipeline processing. Also returns the
/// beginning of free memory.
pub fn stage_config(&self) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj();
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
}
pub fn write_scene(&self, buf: &mut BufWrite) { pub fn write_scene(&self, buf: &mut BufWrite) {
buf.extend_slice(&self.drawtag_stream); buf.extend_slice(&self.drawtag_stream);
let n_drawobj = self.drawtag_stream.len(); let n_drawobj = self.drawtag_stream.len();
@ -440,32 +253,19 @@ impl Encoder {
buf.extend_slice(&self.pathseg_stream); buf.extend_slice(&self.pathseg_stream);
} }
/// The number of draw objects in the draw object stream. pub(crate) fn stats(&self) -> SceneStats {
pub(crate) fn n_drawobj(&self) -> usize { SceneStats {
self.drawtag_stream.len() n_drawobj: self.drawtag_stream.len(),
} drawdata_len: self.drawdata_stream.len(),
n_transform: self.transform_stream.len(),
linewidth_len: std::mem::size_of_val(&*self.linewidth_stream),
n_pathtag: self.tag_stream.len(),
pathseg_len: self.pathseg_stream.len(),
/// The number of paths. n_path: self.n_path,
pub(crate) fn n_path(&self) -> u32 { n_pathseg: self.n_pathseg,
self.n_path n_clip: self.n_clip,
} }
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
} }
pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) { pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
@ -478,11 +278,6 @@ impl Encoder {
} }
} }
fn align_up(x: usize, align: usize) -> usize {
debug_assert!(align.is_power_of_two());
(x + align - 1) & !(align - 1)
}
fn padding(x: usize, align: usize) -> usize { fn padding(x: usize, align: usize) -> usize {
x.wrapping_neg() & (align - 1) x.wrapping_neg() & (align - 1)
} }


@ -4,17 +4,19 @@ pub mod glyph_render;
mod gradient; mod gradient;
mod pico_svg; mod pico_svg;
mod render_ctx; mod render_ctx;
mod render_driver;
pub mod stages; pub mod stages;
pub mod test_scenes; pub mod test_scenes;
mod text; mod text;
use bytemuck::Pod; use bytemuck::{Pod, Zeroable};
use std::convert::TryInto; use std::convert::TryInto;
pub use blend::{Blend, BlendMode, CompositionMode}; pub use blend::{Blend, BlendMode, CompositionMode};
pub use encoder::EncodedSceneRef; pub use encoder::EncodedSceneRef;
pub use gradient::Colrv1RadialGradient; pub use gradient::Colrv1RadialGradient;
pub use render_ctx::PietGpuRenderContext; pub use render_ctx::PietGpuRenderContext;
pub use render_driver::RenderDriver;
use piet::kurbo::Vec2; use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext}; use piet::{ImageFormat, RenderContext};
@ -25,9 +27,12 @@ use piet_gpu_hal::{
}; };
pub use pico_svg::PicoSvg; pub use pico_svg::PicoSvg;
use stages::{ClipBinding, ElementBinding, ElementCode}; use stages::{
ClipBinding, ElementBinding, ElementCode, DRAW_PART_SIZE, PATHSEG_PART_SIZE,
TRANSFORM_PART_SIZE,
};
use crate::stages::{ClipCode, Config, ElementStage}; use crate::stages::{ClipCode, Config, ElementStage, CLIP_PART_SIZE};
const TILE_W: usize = 16; const TILE_W: usize = 16;
const TILE_H: usize = 16; const TILE_H: usize = 16;
@ -64,6 +69,31 @@ pub enum PixelFormat {
Rgba8, Rgba8,
} }
#[repr(C)]
#[derive(Clone, Copy, Debug, Zeroable, Pod)]
pub(crate) struct MemoryHeader {
mem_offset: u32,
mem_error: u32,
blend_offset: u32,
}
/// The sizes of various objects in the encoded scene, needed for memory layout.
#[derive(Default)]
pub(crate) struct SceneStats {
// Slices of scene encoding, in order
pub n_drawobj: usize,
pub drawdata_len: usize,
pub n_transform: usize,
pub linewidth_len: usize,
pub pathseg_len: usize,
pub n_pathtag: usize,
// Additional stats needed for memory layout & dispatch
pub n_path: u32,
pub n_pathseg: u32,
pub n_clip: u32,
}
pub struct Renderer { pub struct Renderer {
// These sizes are aligned to tile boundaries, though at some point // These sizes are aligned to tile boundaries, though at some point
// we'll want to have a good strategy for dealing with odd sizes. // we'll want to have a good strategy for dealing with odd sizes.
@ -72,18 +102,23 @@ pub struct Renderer {
pub image_dev: Image, // resulting image pub image_dev: Image, // resulting image
// The reference is held by the pipelines. We will be changing // TODO: two changes needed here. First, if we're fencing on the coarse
// this to make the scene upload dynamic. // pipeline, then we only need one copy (this changes if we also bind the
// scene buffer in fine rasterization, which might be a good idea to reduce
// copying). Second, there should be a staging buffer for discrete cards.
scene_bufs: Vec<Buffer>, scene_bufs: Vec<Buffer>,
memory_buf_host: Vec<Buffer>, memory_buf_host: Vec<Buffer>,
memory_buf_dev: Buffer, memory_buf_dev: Buffer,
memory_buf_readback: Buffer,
// Staging buffers // Staging buffers
config_bufs: Vec<Buffer>, config_bufs: Vec<Buffer>,
// Device config buf // Device config buf
config_buf: Buffer, config_buf: Buffer,
blend_buf: Buffer,
// New element pipeline // New element pipeline
element_code: ElementCode, element_code: ElementCode,
element_stage: ElementStage, element_stage: ElementStage,
@ -111,6 +146,8 @@ pub struct Renderer {
k4_pipeline: Pipeline, k4_pipeline: Pipeline,
k4_ds: DescriptorSet, k4_ds: DescriptorSet,
scene_stats: SceneStats,
// TODO: the following stats are now redundant and can be removed.
n_transform: usize, n_transform: usize,
n_drawobj: usize, n_drawobj: usize,
n_paths: usize, n_paths: usize,
@ -166,12 +203,18 @@ impl Renderer {
let width = width + (width.wrapping_neg() & (TILE_W - 1)); let width = width + (width.wrapping_neg() & (TILE_W - 1));
let height = height + (height.wrapping_neg() & (TILE_W - 1)); let height = height + (height.wrapping_neg() & (TILE_W - 1));
let dev = BufferUsage::STORAGE | BufferUsage::COPY_DST; let dev = BufferUsage::STORAGE | BufferUsage::COPY_DST;
let host_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC; let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC;
let usage_blend = BufferUsage::STORAGE;
let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
let usage_readback = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
// This may be inadequate for very complex scenes (paris etc)
// TODO: separate staging buffer (if needed) // TODO: separate staging buffer (if needed)
let scene_bufs = (0..n_bufs) let scene_bufs = (0..n_bufs)
.map(|_| session.create_buffer(8 * 1024 * 1024, host_upload).unwrap()) .map(|_| {
session
.create_buffer(8 * 1024 * 1024, usage_upload)
.unwrap()
})
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let image_format = match config.format { let image_format = match config.format {
@ -185,15 +228,22 @@ impl Renderer {
let config_bufs = (0..n_bufs) let config_bufs = (0..n_bufs)
.map(|_| { .map(|_| {
session session
.create_buffer(CONFIG_BUFFER_SIZE, host_upload) .create_buffer(CONFIG_BUFFER_SIZE, usage_upload)
.unwrap() .unwrap()
}) })
.collect(); .collect();
let memory_buf_host = (0..n_bufs) let memory_buf_host = (0..n_bufs)
.map(|_| session.create_buffer(2 * 4, host_upload).unwrap()) .map(|_| {
session
.create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_upload)
.unwrap()
})
.collect(); .collect();
let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?; let memory_buf_dev = session.create_buffer(16 * 1024 * 1024, usage_mem_dev)?;
let memory_buf_readback =
session.create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_readback)?;
let blend_buf = session.create_buffer(16 * 1024 * 1024, usage_blend)?;
let element_code = ElementCode::new(session); let element_code = ElementCode::new(session);
let element_stage = ElementStage::new(session, &element_code); let element_stage = ElementStage::new(session, &element_code);
@ -282,7 +332,7 @@ impl Renderer {
let gradient_bufs = (0..n_bufs) let gradient_bufs = (0..n_bufs)
.map(|_| { .map(|_| {
session session
.create_buffer(GRADIENT_BUF_SIZE as u64, host_upload) .create_buffer(GRADIENT_BUF_SIZE as u64, usage_upload)
.unwrap() .unwrap()
}) })
.collect(); .collect();
@ -297,6 +347,7 @@ impl Renderer {
&[ &[
BindType::Buffer, BindType::Buffer,
BindType::BufReadOnly, BindType::BufReadOnly,
BindType::Buffer,
BindType::Image, BindType::Image,
BindType::ImageRead, BindType::ImageRead,
BindType::ImageRead, BindType::ImageRead,
@ -304,19 +355,22 @@ impl Renderer {
)?; )?;
let k4_ds = session let k4_ds = session
.descriptor_set_builder() .descriptor_set_builder()
.add_buffers(&[&memory_buf_dev, &config_buf]) .add_buffers(&[&memory_buf_dev, &config_buf, &blend_buf])
.add_images(&[&image_dev]) .add_images(&[&image_dev])
.add_textures(&[&bg_image, &gradients]) .add_textures(&[&bg_image, &gradients])
.build(&session, &k4_pipeline)?; .build(&session, &k4_pipeline)?;
let scene_stats = Default::default();
Ok(Renderer { Ok(Renderer {
width, width,
height, height,
scene_bufs, scene_bufs,
memory_buf_host, memory_buf_host,
memory_buf_dev, memory_buf_dev,
memory_buf_readback,
config_buf, config_buf,
config_bufs, config_bufs,
blend_buf,
image_dev, image_dev,
element_code, element_code,
element_stage, element_stage,
@ -336,6 +390,7 @@ impl Renderer {
coarse_ds, coarse_ds,
k4_pipeline, k4_pipeline,
k4_ds, k4_ds,
scene_stats,
n_transform: 0, n_transform: 0,
n_drawobj: 0, n_drawobj: 0,
n_paths: 0, n_paths: 0,
@ -358,43 +413,14 @@ impl Renderer {
render_ctx: &mut PietGpuRenderContext, render_ctx: &mut PietGpuRenderContext,
buf_ix: usize, buf_ix: usize,
) -> Result<(), Error> { ) -> Result<(), Error> {
let (mut config, mut alloc) = render_ctx.stage_config(); self.scene_stats = render_ctx.stats();
let n_drawobj = render_ctx.n_drawobj();
// TODO: be more consistent in size types
let n_path = render_ctx.n_path() as usize;
self.n_paths = n_path;
self.n_transform = render_ctx.n_transform();
self.n_drawobj = render_ctx.n_drawobj();
self.n_pathseg = render_ctx.n_pathseg() as usize;
self.n_pathtag = render_ctx.n_pathtag();
self.n_clip = render_ctx.n_clip();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = self.width / TILE_W;
let height_in_tiles = self.height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
config.width_in_tiles = width_in_tiles as u32;
config.height_in_tiles = height_in_tiles as u32;
config.tile_alloc = tile_base as u32;
config.bin_alloc = bin_base as u32;
config.ptcl_alloc = ptcl_base as u32;
unsafe { unsafe {
// TODO: reallocate scene buffer if size is inadequate self.upload_config(buf_ix)?;
{ {
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
render_ctx.write_scene(&mut mapped_scene); render_ctx.write_scene(&mut mapped_scene);
} }
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data. // Upload gradient data.
let ramp_data = render_ctx.get_ramp_data(); let ramp_data = render_ctx.get_ramp_data();
@ -414,43 +440,14 @@ impl Renderer {
scene: &EncodedSceneRef<T>, scene: &EncodedSceneRef<T>,
buf_ix: usize, buf_ix: usize,
) -> Result<(), Error> { ) -> Result<(), Error> {
let (mut config, mut alloc) = scene.stage_config(); self.scene_stats = scene.stats();
let n_drawobj = scene.n_drawobj();
// TODO: be more consistent in size types
let n_path = scene.n_path() as usize;
self.n_paths = n_path;
self.n_transform = scene.n_transform();
self.n_drawobj = scene.n_drawobj();
self.n_pathseg = scene.n_pathseg() as usize;
self.n_pathtag = scene.n_pathtag();
self.n_clip = scene.n_clip();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = self.width / TILE_W;
let height_in_tiles = self.height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
config.width_in_tiles = width_in_tiles as u32;
config.height_in_tiles = height_in_tiles as u32;
config.tile_alloc = tile_base as u32;
config.bin_alloc = bin_base as u32;
config.ptcl_alloc = ptcl_base as u32;
unsafe { unsafe {
// TODO: reallocate scene buffer if size is inadequate self.upload_config(buf_ix)?;
{ {
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
scene.write_scene(&mut mapped_scene); scene.write_scene(&mut mapped_scene);
} }
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data. // Upload gradient data.
if !scene.ramp_data.is_empty() { if !scene.ramp_data.is_empty() {
@ -464,7 +461,44 @@ impl Renderer {
Ok(()) Ok(())
} }
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) { // Note: configuration has to be re-uploaded when memory buffer is resized
pub(crate) unsafe fn upload_config(
&mut self,
buf_ix: usize,
) -> Result<(), Error> {
let stats = &self.scene_stats;
let n_path = stats.n_path as usize;
self.n_paths = n_path;
self.n_transform = stats.n_transform;
self.n_drawobj = stats.n_drawobj;
self.n_pathseg = stats.n_pathseg as usize;
self.n_pathtag = stats.n_pathtag;
self.n_clip = stats.n_clip;
let (mut config, alloc) = stats.config(self.width, self.height);
config.mem_size = self.memory_buf_size() as u32;
self.config_bufs[buf_ix].write(&[config])?;
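// blend_offset starts at 0 here; the coarse pipeline writes the blend buffer
// size it needs into this field, and run_coarse reads it back to decide
// whether the blend buffer must be reallocated.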
let mem_header = MemoryHeader {
mem_offset: alloc as u32,
mem_error: 0,
blend_offset: 0,
};
// Note: we could skip doing this on realloc, but probably not worth the bother
self.memory_buf_host[buf_ix].write(&[mem_header])?;
Ok(())
}
/// Get the size of memory needed for the allocations known in advance.
pub(crate) fn memory_size(&self, stats: &SceneStats) -> usize {
stats.config(self.width, self.height).1
}
/// Record the coarse part of a render pipeline.
pub unsafe fn record_coarse(
&self,
cmd_buf: &mut CmdBuf,
query_pool: &QueryPool,
buf_ix: usize,
) {
cmd_buf.copy_buffer(&self.config_bufs[buf_ix], &self.config_buf); cmd_buf.copy_buffer(&self.config_bufs[buf_ix], &self.config_buf);
cmd_buf.copy_buffer(&self.memory_buf_host[buf_ix], &self.memory_buf_dev); cmd_buf.copy_buffer(&self.memory_buf_host[buf_ix], &self.memory_buf_dev);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
@ -558,9 +592,20 @@ impl Renderer {
pass.end(); pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
}
pub unsafe fn record_fine(
&self,
cmd_buf: &mut CmdBuf,
query_pool: &QueryPool,
query_start: u32,
) {
cmd_buf.begin_debug_label("Fine raster"); cmd_buf.begin_debug_label("Fine raster");
let mut pass = let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11)); &query_pool,
query_start,
query_start + 1,
));
pass.dispatch( pass.dispatch(
&self.k4_pipeline, &self.k4_pipeline,
&self.k4_ds, &self.k4_ds,
@ -577,6 +622,19 @@ impl Renderer {
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
} }
pub unsafe fn record_readback(&self, cmd_buf: &mut CmdBuf) {
cmd_buf.copy_buffer(&self.memory_buf_dev, &self.memory_buf_readback);
cmd_buf.memory_barrier();
}
/// Record a render pipeline.
///
/// This *assumes* the buffers are adequately sized.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) {
self.record_coarse(cmd_buf, query_pool, buf_ix);
self.record_fine(cmd_buf, query_pool, 10);
}
pub fn make_image( pub fn make_image(
session: &Session, session: &Session,
width: usize, width: usize,
@ -636,4 +694,210 @@ impl Renderer {
.unwrap() .unwrap()
} }
} }
pub(crate) unsafe fn realloc_scene_if_needed(
&mut self,
session: &Session,
new_size: u64,
buf_ix: usize,
) -> Result<(), Error> {
if new_size <= self.scene_bufs[buf_ix].size() {
return Ok(());
}
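// Round the requested size up to a 64 KiB boundary so small increases in
// scene size don't force a reallocation every frame.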
const ALIGN: u64 = 0x10000;
let new_size = (new_size + ALIGN - 1) & ALIGN.wrapping_neg();
println!(
"reallocating scene buf[{}] {} -> {}",
buf_ix,
self.scene_bufs[buf_ix].size(),
new_size
);
let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
let scene_buf = session.create_buffer(new_size, usage_upload)?;
self.element_bindings[buf_ix].rebind_scene(session, &scene_buf);
session.update_buffer_descriptor(&mut self.tile_ds[buf_ix], 2, &scene_buf);
session.update_buffer_descriptor(&mut self.coarse_ds[buf_ix], 2, &scene_buf);
self.scene_bufs[buf_ix] = scene_buf;
Ok(())
}
/// Get the size of the memory buffer.
///
/// This is the usable size (not including the header).
pub(crate) fn memory_buf_size(&self) -> u64 {
self.memory_buf_dev.size() - std::mem::size_of::<MemoryHeader>() as u64
}
pub(crate) unsafe fn realloc_memory(
&mut self,
session: &Session,
new_size: u64,
) -> Result<(), Error> {
println!(
"reallocating memory buf {} -> {}",
self.memory_buf_dev.size(),
new_size
);
let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC;
let memory_buf_dev = session.create_buffer(new_size, usage_mem_dev)?;
for element_binding in &mut self.element_bindings {
element_binding.rebind_memory(session, &memory_buf_dev);
}
self.clip_binding.rebind_memory(session, &memory_buf_dev);
for tile_ds in &mut self.tile_ds {
session.update_buffer_descriptor(tile_ds, 0, &memory_buf_dev);
}
session.update_buffer_descriptor(&mut self.path_ds, 0, &memory_buf_dev);
session.update_buffer_descriptor(&mut self.backdrop_ds, 0, &memory_buf_dev);
session.update_buffer_descriptor(&mut self.bin_ds, 0, &memory_buf_dev);
for coarse_ds in &mut self.coarse_ds {
session.update_buffer_descriptor(coarse_ds, 0, &memory_buf_dev);
}
session.update_buffer_descriptor(&mut self.k4_ds, 0, &memory_buf_dev);
self.memory_buf_dev = memory_buf_dev;
Ok(())
}
pub(crate) fn blend_size(&self) -> u64 {
self.blend_buf.size()
}
pub(crate) unsafe fn realloc_blend(
&mut self,
session: &Session,
new_size: u64,
) -> Result<(), Error> {
println!(
"reallocating blend buf {} -> {}",
self.blend_size(),
new_size
);
let usage_blend = BufferUsage::STORAGE;
let blend_buf = session.create_buffer(new_size, usage_blend)?;
session.update_buffer_descriptor(&mut self.k4_ds, 2, &blend_buf);
self.blend_buf = blend_buf;
Ok(())
}
}
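// For reference, the shape of the `MemoryHeader` written by `upload_config`
// and read back by the render driver. The real definition lives elsewhere in
// this change; the repr/derives below are assumptions, but the field names
// and meanings follow from the usage above.
//
//     #[repr(C)]
//     #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
//     pub struct MemoryHeader {
//         mem_offset: u32,   // first free byte of dynamic memory; after a
//                            // failed run, a hint for the required size
//         mem_error: u32,    // 0 on success, nonzero if a stage failed
//         blend_offset: u32, // blend memory size reported by coarse raster
//     }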
const TRANSFORM_SIZE: usize = 24;
const PATHSEG_SIZE: usize = 52;
const PATH_BBOX_SIZE: usize = 24;
const DRAWMONOID_SIZE: usize = 16;
const DRAW_BBOX_SIZE: usize = 16;
const DRAWTAG_SIZE: usize = 4;
const ANNOTATED_SIZE: usize = 40;
impl SceneStats {
pub(crate) fn scene_size(&self) -> usize {
align_up(self.n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE
+ self.drawdata_len
+ align_up(self.n_transform, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE
+ self.linewidth_len
+ align_up(self.n_pathtag, PATHSEG_PART_SIZE as usize)
+ self.pathseg_len
}
/// Return a config for a scene with these stats.
///
/// Also returns the beginning of free (dynamic) memory.
fn config(&self, width: usize, height: usize) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj;
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_len;
let n_trans = self.n_transform;
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let pathtag_offset = linewidth_offset + self.linewidth_len;
let n_pathtag = self.n_pathtag;
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
// These constants depend on encoding and may need to be updated.
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = width / TILE_W;
let height_in_tiles = height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
let config = Config {
mem_size: 0, // to be filled in later
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
width_in_tiles: width_in_tiles as u32,
height_in_tiles: height_in_tiles as u32,
tile_alloc: tile_base as u32,
bin_alloc: bin_base as u32,
ptcl_alloc: ptcl_base as u32,
};
(config, alloc)
}
}
fn align_up(x: usize, align: usize) -> usize {
debug_assert!(align.is_power_of_two());
(x + align - 1) & !(align - 1)
} }


@ -4,7 +4,7 @@ const DO_SRGB_CONVERSION: bool = false;
use std::borrow::Cow; use std::borrow::Cow;
use crate::encoder::GlyphEncoder; use crate::encoder::GlyphEncoder;
use crate::stages::{Config, Transform}; use crate::stages::Transform;
use piet::kurbo::{Affine, PathEl, Point, Rect, Shape}; use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
use piet::{ use piet::{
Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext, Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@ -18,7 +18,7 @@ use piet_gpu_types::scene::Element;
use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache}; use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache};
use crate::text::Font; use crate::text::Font;
pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder}; pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
use crate::Blend; use crate::{Blend, SceneStats};
pub struct PietGpuImage; pub struct PietGpuImage;
@ -95,44 +95,15 @@ impl PietGpuRenderContext {
} }
} }
pub fn stage_config(&self) -> (Config, usize) { pub(crate) fn stats(&self) -> SceneStats {
self.new_encoder.stage_config() self.new_encoder.stats()
}
/// Number of draw objects.
///
/// This is for the new element processing pipeline. It's not necessarily the
/// same as the number of paths (as in the old pipeline), but it might take a
/// while to sort that out.
pub fn n_drawobj(&self) -> usize {
self.new_encoder.n_drawobj()
}
/// Number of paths.
pub fn n_path(&self) -> u32 {
self.new_encoder.n_path()
}
pub fn n_pathseg(&self) -> u32 {
self.new_encoder.n_pathseg()
}
pub fn n_pathtag(&self) -> usize {
self.new_encoder.n_pathtag()
}
pub fn n_transform(&self) -> usize {
self.new_encoder.n_transform()
}
pub fn n_clip(&self) -> u32 {
self.new_encoder.n_clip()
} }
pub fn write_scene(&self, buf: &mut BufWrite) { pub fn write_scene(&self, buf: &mut BufWrite) {
self.new_encoder.write_scene(buf); self.new_encoder.write_scene(buf);
} }
// TODO: delete
pub fn get_scene_buf(&mut self) -> &[u8] { pub fn get_scene_buf(&mut self) -> &[u8] {
const ALIGN: usize = 128; const ALIGN: usize = 128;
let padded_size = (self.elements.len() + (ALIGN - 1)) & ALIGN.wrapping_neg(); let padded_size = (self.elements.len() + (ALIGN - 1)) & ALIGN.wrapping_neg();
@ -194,7 +165,6 @@ impl RenderContext for PietGpuRenderContext {
let rad = self.ramp_cache.add_radial_gradient(&rad); let rad = self.ramp_cache.add_radial_gradient(&rad);
Ok(PietGpuBrush::RadGradient(rad)) Ok(PietGpuBrush::RadGradient(rad))
} }
_ => todo!("don't do radial gradients yet"),
} }
} }


@ -0,0 +1,267 @@
// Copyright 2022 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
use bytemuck::Pod;
use piet_gpu_hal::{CmdBuf, Error, Image, QueryPool, Semaphore, Session, SubmittedCmdBuf};
use crate::{EncodedSceneRef, MemoryHeader, PietGpuRenderContext, Renderer, SceneStats};
/// Additional logic for sequencing rendering operations, specifically
/// for handling failure and reallocation.
///
/// It may be that this shouldn't be a separate object from Renderer.
pub struct RenderDriver {
frames: Vec<RenderFrame>,
renderer: Renderer,
buf_ix: usize,
/// The index of a pending fine rasterization submission.
pending: Option<usize>,
}
pub struct TargetState<'a> {
pub cmd_buf: &'a mut CmdBuf,
pub image: &'a Image,
}
struct RenderFrame {
cmd_buf: CmdBufState,
query_pool: QueryPool,
}
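/// Lifecycle of a per-frame command buffer: it starts in `Start` (nothing
/// allocated yet), moves to `Ready` once a command buffer exists and can be
/// recorded, and to `Submitted` while it is in flight; `wait` (or the next
/// call to `cmd_buf`) brings it back to `Ready`.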
enum CmdBufState {
Start,
Submitted(SubmittedCmdBuf),
Ready(CmdBuf),
}
impl RenderDriver {
/// Create a new render driver.
///
/// Should probably be fallible.
///
/// We can get n from the renderer as well.
pub fn new(session: &Session, n: usize, renderer: Renderer) -> RenderDriver {
let frames = (0..n)
.map(|_| {
// Maybe should allocate here so it doesn't happen on first frame?
let cmd_buf = CmdBufState::default();
let query_pool = session.create_query_pool(Renderer::QUERY_POOL_SIZE)?;
Ok(RenderFrame {
cmd_buf,
query_pool,
})
})
.collect::<Result<_, Error>>()
.unwrap();
RenderDriver {
frames,
renderer,
buf_ix: 0,
pending: None,
}
}
pub fn upload_render_ctx(
&mut self,
session: &Session,
render_ctx: &mut PietGpuRenderContext,
) -> Result<(), Error> {
let stats = render_ctx.stats();
self.ensure_scene_buffers(session, &stats)?;
self.renderer.upload_render_ctx(render_ctx, self.buf_ix)
}
pub fn upload_scene<T: Copy + Pod>(
&mut self,
session: &Session,
scene: &EncodedSceneRef<T>,
) -> Result<(), Error> {
let stats = scene.stats();
self.ensure_scene_buffers(session, &stats)?;
self.renderer.upload_scene(scene, self.buf_ix)
}
fn ensure_scene_buffers(&mut self, session: &Session, stats: &SceneStats) -> Result<(), Error> {
let scene_size = stats.scene_size();
unsafe {
self.renderer
.realloc_scene_if_needed(session, scene_size as u64, self.buf_ix)?;
}
let memory_size = self.renderer.memory_size(&stats);
// TODO: better estimate of additional memory needed
// Note: if we were to cover the worst-case binning output, we could make the
// binning stage infallible and cut checking logic. It also may not be a bad
// estimate for the rest.
let estimated_needed = memory_size as u64 + (1 << 20);
if estimated_needed > self.renderer.memory_buf_size() {
if let Some(pending) = self.pending.take() {
// There might be a fine rasterization task that binds the memory buffer
// still in flight.
self.frames[pending].cmd_buf.wait();
}
unsafe {
self.renderer.realloc_memory(session, estimated_needed)?;
}
}
Ok(())
}
/// Run one try of the coarse rendering pipeline.
pub(crate) fn try_run_coarse(&mut self, session: &Session) -> Result<MemoryHeader, Error> {
let frame = &mut self.frames[self.buf_ix];
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
unsafe {
cmd_buf.begin();
// TODO: probably want to return query results as well
self.renderer
.record_coarse(cmd_buf, &frame.query_pool, self.buf_ix);
self.renderer.record_readback(cmd_buf);
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
cmd_buf.finish_timestamps(&frame.query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();
frame.cmd_buf.submit(session, &[], &[])?;
frame.cmd_buf.wait();
let mut result = Vec::new();
// TODO: consider read method for single POD value
self.renderer.memory_buf_readback.read(&mut result)?;
Ok(result[0])
}
}
/// Run the coarse render pipeline, ensuring enough memory for intermediate buffers.
pub fn run_coarse(&mut self, session: &Session) -> Result<(), Error> {
loop {
let mem_header = self.try_run_coarse(session)?;
println!("{:?}", mem_header);
if mem_header.mem_error == 0 {
let blend_needed = mem_header.blend_offset as u64;
if blend_needed > self.renderer.blend_size() {
unsafe {
self.renderer.realloc_blend(session, blend_needed)?;
}
}
return Ok(());
}
// Not enough memory, reallocate and retry.
// TODO: be smarter (multiplier for early stages)
let mem_size = mem_header.mem_offset + 4096;
// Safety rationalization: no command buffers containing the buffer are
// in flight.
unsafe {
self.renderer.realloc_memory(session, mem_size.into())?;
self.renderer.upload_config(self.buf_ix)?;
}
}
}
/// Record the fine rasterizer, leaving the command buffer open.
pub fn record_fine(&mut self, session: &Session) -> Result<TargetState, Error> {
let frame = &mut self.frames[self.buf_ix];
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
unsafe {
self.renderer.record_fine(cmd_buf, &frame.query_pool, 0);
}
let image = &self.renderer.image_dev;
Ok(TargetState { cmd_buf, image })
}
/// Submit the current command buffer.
pub fn submit(
&mut self,
session: &Session,
wait_semaphores: &[&Semaphore],
signal_semaphores: &[&Semaphore],
) -> Result<(), Error> {
let frame = &mut self.frames[self.buf_ix];
let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
unsafe {
cmd_buf.finish_timestamps(&frame.query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();
frame
.cmd_buf
.submit(session, wait_semaphores, signal_semaphores)?
}
self.pending = Some(self.buf_ix);
Ok(())
}
pub fn wait(&mut self) {
self.frames[self.buf_ix].cmd_buf.wait();
self.pending = None;
}
/// Move to the next buffer.
pub fn next_buffer(&mut self) {
self.buf_ix = (self.buf_ix + 1) % self.frames.len()
}
}
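// A minimal sketch of how a caller might drive the types above for one frame.
// The function name and the swapchain-related semaphores are hypothetical;
// the RenderDriver calls are the ones defined in this file.
#[allow(dead_code)]
fn drive_one_frame(
    driver: &mut RenderDriver,
    session: &Session,
    ctx: &mut PietGpuRenderContext,
    acquire: &Semaphore,
    render_done: &Semaphore,
) -> Result<(), Error> {
    // Upload the scene, growing the scene/memory buffers up front if the
    // size estimate says they are too small.
    driver.upload_render_ctx(session, ctx)?;
    // Run the coarse pipeline, retrying with larger buffers until it succeeds.
    driver.run_coarse(session)?;
    // record_fine leaves the command buffer open; a real caller would use the
    // returned TargetState (cmd_buf + image) to blit into its surface here.
    let _ = driver.record_fine(session)?;
    driver.submit(session, &[acquire], &[render_done])?;
    driver.next_buffer();
    Ok(())
}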
impl Default for CmdBufState {
fn default() -> Self {
CmdBufState::Start
}
}
impl CmdBufState {
/// Get a command buffer suitable for recording.
///
/// If the command buffer is submitted, wait.
fn cmd_buf(&mut self, session: &Session) -> Result<&mut CmdBuf, Error> {
if let CmdBufState::Ready(cmd_buf) = self {
return Ok(cmd_buf);
}
if let CmdBufState::Submitted(submitted) = std::mem::take(self) {
if let Ok(Some(cmd_buf)) = submitted.wait() {
*self = CmdBufState::Ready(cmd_buf);
}
}
if matches!(self, CmdBufState::Start) {
*self = CmdBufState::Ready(session.cmd_buf()?);
}
if let CmdBufState::Ready(cmd_buf) = self {
Ok(cmd_buf)
} else {
unreachable!()
}
}
unsafe fn submit(
&mut self,
session: &Session,
wait_semaphores: &[&Semaphore],
signal_semaphores: &[&Semaphore],
) -> Result<(), Error> {
if let CmdBufState::Ready(cmd_buf) = std::mem::take(self) {
let submitted = session.run_cmd_buf(cmd_buf, wait_semaphores, signal_semaphores)?;
*self = CmdBufState::Submitted(submitted);
Ok(())
} else {
Err("Tried to submit CmdBufState not in ready state".into())
}
}
fn wait(&mut self) {
if matches!(self, CmdBufState::Submitted(_)) {
if let CmdBufState::Submitted(submitted) = std::mem::take(self) {
if let Ok(Some(cmd_buf)) = submitted.wait() {
*self = CmdBufState::Ready(cmd_buf);
}
}
}
}
}


@ -37,6 +37,7 @@ pub use transform::{
#[repr(C)] #[repr(C)]
#[derive(Clone, Copy, Default, Debug, Zeroable, Pod)] #[derive(Clone, Copy, Default, Debug, Zeroable, Pod)]
pub struct Config { pub struct Config {
pub mem_size: u32,
pub n_elements: u32, // paths pub n_elements: u32, // paths
pub n_pathseg: u32, pub n_pathseg: u32,
pub width_in_tiles: u32, pub width_in_tiles: u32,
@ -167,3 +168,17 @@ impl ElementStage {
.record(pass, &code.draw_code, &binding.draw_binding, n_drawobj); .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
} }
} }
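// When the memory or scene buffer is reallocated, every descriptor set that
// binds it still points at the old buffer, so the rebind methods below update
// those bindings in place rather than rebuilding the descriptor sets.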
impl ElementBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
self.transform_binding.rebind_memory(session, memory);
self.path_binding.rebind_memory(session, memory);
self.draw_binding.rebind_memory(session, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
self.transform_binding.rebind_scene(session, scene);
self.path_binding.rebind_scene(session, scene);
self.draw_binding.rebind_scene(session, scene);
}
}


@ -93,4 +93,9 @@ impl ClipBinding {
pass.memory_barrier(); pass.memory_barrier();
} }
} }
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
}
} }


@ -163,3 +163,15 @@ impl DrawStage {
); );
} }
} }
impl DrawBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
session.update_buffer_descriptor(&mut self.leaf_ds, 2, scene);
}
}


@ -200,6 +200,19 @@ impl PathStage {
} }
} }
impl PathBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.clear_ds, 0, memory);
session.update_buffer_descriptor(&mut self.path_ds, 0, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
session.update_buffer_descriptor(&mut self.path_ds, 2, scene);
}
}
pub struct PathEncoder<'a> { pub struct PathEncoder<'a> {
tag_stream: &'a mut Vec<u8>, tag_stream: &'a mut Vec<u8>,
// If we're never going to use the i16 encoding, it might be // If we're never going to use the i16 encoding, it might be


@ -166,6 +166,18 @@ impl TransformStage {
} }
} }
impl TransformBinding {
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
}
pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene);
session.update_buffer_descriptor(&mut self.leaf_ds, 2, scene);
}
}
impl Transform { impl Transform {
pub const IDENTITY: Transform = Transform { pub const IDENTITY: Transform = Transform {
mat: [1.0, 0.0, 0.0, 1.0], mat: [1.0, 0.0, 0.0, 1.0],