unify GPU memory management

Merge all static and dynamic buffers into just one, "memory". Add a malloc function for dynamic allocations. Unify static allocation offsets into a "config" buffer containing scene setup (number of paths, number of path segments), as well as the memory offsets of the static allocations. Finally, set an overflow flag when an allocation fails, and make sure to exit shader execution as soon as that triggers. Add checks before beginning execution in case the client wants to run two or more shaders before checking the flag. The "state" buffer is left alone because it needs zeroing and because it is accessed with the "volatile" keyword. Fixes #40 Signed-off-by: Elias Naur <mail@eliasnaur.com>
2025-01-10 12:41:30 +11:00 · 2020-12-11 18:30:20 +01:00 · 2020-12-11 18:30:20 +01:00 · 4de67d9081
parent a2a2d12c5d
commit 4de67d9081
23 changed files with 463 additions and 448 deletions
--- a/piet-gpu-derive/src/glsl.rs
+++ b/piet-gpu-derive/src/glsl.rs
@ -31,17 +31,22 @@ pub fn gen_glsl(module: &LayoutModule) -> String {

    for name in &module.def_names {
        let def = module.defs.get(name).unwrap();
+        let mem = &"memory".to_owned();
+        let mut buf_name = &module.name;
+        if !module.name.eq(&"state") && !module.name.eq(&"scene") {
+            buf_name = mem;
+        }
        match def {
            (_size, LayoutTypeDef::Struct(fields)) => {
-                gen_struct_read(&mut r, &module.name, &name, fields);
+                gen_struct_read(&mut r, buf_name, &name, fields);
                if module.gpu_write {
-                    gen_struct_write(&mut r, &module.name, &name, fields);
+                    gen_struct_write(&mut r, buf_name, &name, fields);
                }
            }
            (_size, LayoutTypeDef::Enum(en)) => {
-                gen_enum_read(&mut r, &module.name, &name, en);
+                gen_enum_read(&mut r, buf_name, &name, en);
                if module.gpu_write {
-                    gen_enum_write(&mut r, &module.name, &name, en);
+                    gen_enum_write(&mut r, buf_name, &name, en);
                }
            }
        }
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@ -64,11 +64,11 @@ AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {

 AnnoFill AnnoFill_read(AnnoFillRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = annotated[ix + 0];
-    uint raw1 = annotated[ix + 1];
-    uint raw2 = annotated[ix + 2];
-    uint raw3 = annotated[ix + 3];
-    uint raw4 = annotated[ix + 4];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
    AnnoFill s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.rgba_color = raw4;
@ -77,21 +77,21 @@ AnnoFill AnnoFill_read(AnnoFillRef ref) {

 void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
    uint ix = ref.offset >> 2;
-    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
-    annotated[ix + 4] = s.rgba_color;
+    memory[ix + 0] = floatBitsToUint(s.bbox.x);
+    memory[ix + 1] = floatBitsToUint(s.bbox.y);
+    memory[ix + 2] = floatBitsToUint(s.bbox.z);
+    memory[ix + 3] = floatBitsToUint(s.bbox.w);
+    memory[ix + 4] = s.rgba_color;
 }

 AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = annotated[ix + 0];
-    uint raw1 = annotated[ix + 1];
-    uint raw2 = annotated[ix + 2];
-    uint raw3 = annotated[ix + 3];
-    uint raw4 = annotated[ix + 4];
-    uint raw5 = annotated[ix + 5];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
    AnnoStroke s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    s.rgba_color = raw4;
@ -101,20 +101,20 @@ AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {

 void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
    uint ix = ref.offset >> 2;
-    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
-    annotated[ix + 4] = s.rgba_color;
-    annotated[ix + 5] = floatBitsToUint(s.linewidth);
+    memory[ix + 0] = floatBitsToUint(s.bbox.x);
+    memory[ix + 1] = floatBitsToUint(s.bbox.y);
+    memory[ix + 2] = floatBitsToUint(s.bbox.z);
+    memory[ix + 3] = floatBitsToUint(s.bbox.w);
+    memory[ix + 4] = s.rgba_color;
+    memory[ix + 5] = floatBitsToUint(s.linewidth);
 }

 AnnoClip AnnoClip_read(AnnoClipRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = annotated[ix + 0];
-    uint raw1 = annotated[ix + 1];
-    uint raw2 = annotated[ix + 2];
-    uint raw3 = annotated[ix + 3];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
    AnnoClip s;
    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
    return s;
@ -122,14 +122,14 @@ AnnoClip AnnoClip_read(AnnoClipRef ref) {

 void AnnoClip_write(AnnoClipRef ref, AnnoClip s) {
    uint ix = ref.offset >> 2;
-    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
+    memory[ix + 0] = floatBitsToUint(s.bbox.x);
+    memory[ix + 1] = floatBitsToUint(s.bbox.y);
+    memory[ix + 2] = floatBitsToUint(s.bbox.z);
+    memory[ix + 3] = floatBitsToUint(s.bbox.w);
 }

 uint Annotated_tag(AnnotatedRef ref) {
-    return annotated[ref.offset >> 2];
+    return memory[ref.offset >> 2];
 }

 AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) {
@ -149,26 +149,26 @@ AnnoClip Annotated_EndClip_read(AnnotatedRef ref) {
 }

 void Annotated_Nop_write(AnnotatedRef ref) {
-    annotated[ref.offset >> 2] = Annotated_Nop;
+    memory[ref.offset >> 2] = Annotated_Nop;
 }

 void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) {
-    annotated[ref.offset >> 2] = Annotated_Stroke;
+    memory[ref.offset >> 2] = Annotated_Stroke;
    AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s);
 }

 void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) {
-    annotated[ref.offset >> 2] = Annotated_Fill;
+    memory[ref.offset >> 2] = Annotated_Fill;
    AnnoFill_write(AnnoFillRef(ref.offset + 4), s);
 }

 void Annotated_BeginClip_write(AnnotatedRef ref, AnnoClip s) {
-    annotated[ref.offset >> 2] = Annotated_BeginClip;
+    memory[ref.offset >> 2] = Annotated_BeginClip;
    AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
 }

 void Annotated_EndClip_write(AnnotatedRef ref, AnnoClip s) {
-    annotated[ref.offset >> 2] = Annotated_EndClip;
+    memory[ref.offset >> 2] = Annotated_EndClip;
    AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
 }

--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@ -16,27 +16,15 @@
 #extension GL_GOOGLE_include_directive : enable

 #include "setup.h"
+#include "mem.h"

 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)

 layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;

-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-// This is really only used for n_elements; maybe we can handle that
-// a different way, but it's convenient to have the same signature as
-// tile allocation.
-layout(set = 0, binding = 1) readonly buffer AllocBuf {
-    uint n_elements; // paths
-    uint n_pathseg;
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };

 #include "annotated.h"
@ -47,18 +35,22 @@ shared uint sh_row_base[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];

 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);

    // Work assignment: 1 thread : 1 path element
    uint row_count = 0;
-    if (element_ix < n_elements) {
+    if (element_ix < conf.n_elements) {
        uint tag = Annotated_tag(ref);
        switch (tag) {
        case Annotated_Fill:
        case Annotated_BeginClip:
-            PathRef path_ref = PathRef(element_ix * Path_size);
+            PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size);
            Path path = Path_read(path_ref);
            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
            row_count = path.bbox.w - path.bbox.y;
@ -98,11 +90,11 @@ void main() {
        // Process one row sequentially
        // Read backdrop value per tile and prefix sum it
        uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width;
-        uint sum = tile[tile_el_ix];
+        uint sum = memory[tile_el_ix];
        for (uint x = 1; x < width; x++) {
            tile_el_ix += 2;
-            sum += tile[tile_el_ix];
-            tile[tile_el_ix] = sum;
+            sum += memory[tile_el_ix];
+            memory[tile_el_ix] = sum;
        }
    }
 }
--- a/piet-gpu/shader/backdrop.spv
+++ b/piet-gpu/shader/backdrop.spv
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@ -10,20 +10,12 @@
 #extension GL_GOOGLE_include_directive : enable

 #include "setup.h"
+#include "mem.h"

 layout(local_size_x = N_TILE, local_size_y = 1) in;

-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-layout(set = 0, binding = 1) buffer AllocBuf {
-    uint n_elements; // paths
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer BinsBuf {
-    uint[] bins;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };

 #include "annotated.h"
@ -41,19 +33,27 @@ layout(set = 0, binding = 2) buffer BinsBuf {
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
 shared uint sh_chunk_start[N_TILE];
+shared bool sh_alloc_failed;

 void main() {
-    uint my_n_elements = n_elements;
+    if (mem_overflow) {
+        return;
+    }
+
+    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;

    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
+    if (gl_LocalInvocationID.x == 0) {
+        sh_alloc_failed = false;
+    }
    barrier();

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(ref);
@ -103,19 +103,26 @@ void main() {
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
-    uint chunk_start = 0;
+    Alloc chunk_alloc = Alloc(0, false);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
-        chunk_start = atomicAdd(alloc, element_count * BinInstance_size);
-        sh_chunk_start[gl_LocalInvocationID.x] = chunk_start;
+        chunk_alloc = malloc(element_count * BinInstance_size);
+        sh_chunk_start[gl_LocalInvocationID.x] = chunk_alloc.offset;
+        if (chunk_alloc.failed) {
+            sh_alloc_failed = true;
+        }
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
-    uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
-    bins[out_ix] = element_count;
-    bins[out_ix + 1] = chunk_start;
+    uint out_ix = (conf.bin_base >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
+    memory[out_ix] = element_count;
+    memory[out_ix + 1] = chunk_alloc.offset;

    barrier();
+    if (sh_alloc_failed) {
+        return;
+    }
+
    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
    x = x0;
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
--- a/piet-gpu/shader/bins.h
+++ b/piet-gpu/shader/bins.h
@ -18,7 +18,7 @@ BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {

 BinInstance BinInstance_read(BinInstanceRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = bins[ix + 0];
+    uint raw0 = memory[ix + 0];
    BinInstance s;
    s.element_ix = raw0;
    return s;
@ -26,6 +26,6 @@ BinInstance BinInstance_read(BinInstanceRef ref) {

 void BinInstance_write(BinInstanceRef ref, BinInstance s) {
    uint ix = ref.offset >> 2;
-    bins[ix + 0] = s.element_ix;
+    memory[ix + 0] = s.element_ix;
 }

--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@ -14,28 +14,12 @@
 #extension GL_GOOGLE_include_directive : enable

 #include "setup.h"
+#include "mem.h"

 layout(local_size_x = N_TILE, local_size_y = 1) in;

-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-layout(set = 0, binding = 1) buffer BinsBuf {
-    uint[] bins;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
-};
-
-layout(set = 0, binding = 3) buffer AllocBuf {
-    uint n_elements;
-    uint alloc;
-};
-
-layout(set = 0, binding = 4) buffer PtclBuf {
-    uint[] ptcl;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };

 #include "annotated.h"
@ -65,22 +49,31 @@ shared uint sh_tile_base[N_TILE];
 shared uint sh_tile_stride[N_TILE];

 // Perhaps cmd_limit should be a global? This is a style question.
-void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
-    if (cmd_ref.offset > cmd_limit) {
-        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
-        CmdJump jump = CmdJump(new_cmd);
-        Cmd_Jump_write(cmd_ref, jump);
-        cmd_ref = CmdRef(new_cmd);
-        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+bool alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
+    if (cmd_ref.offset < cmd_limit) {
+        return true;
    }
+    Alloc new_cmd = malloc(PTCL_INITIAL_ALLOC);
+    if (new_cmd.failed) {
+        return false;
+    }
+    CmdJump jump = CmdJump(new_cmd.offset);
+    Cmd_Jump_write(cmd_ref, jump);
+    cmd_ref = CmdRef(new_cmd.offset);
+    cmd_limit = new_cmd.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+    return true;
 }

 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint partition_ix = 0;
-    uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
+    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
    uint th_ix = gl_LocalInvocationID.x;

    // Coordinates of top left of bin, in tiles.
@ -91,7 +84,7 @@ void main() {
    uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
    uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x;
-    CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(conf.ptcl_base + this_tile_ix * PTCL_INITIAL_ALLOC);
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
    // The nesting depth of the clip stack
    uint clip_depth = 0;
@ -123,9 +116,9 @@ void main() {
                part_start_ix = ready_ix;
                uint count = 0;
                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
-                    uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
-                    count = bins[in_ix];
-                    sh_part_elements[th_ix] = bins[in_ix + 1];
+                    uint in_ix = (conf.bin_base >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
+                    count = memory[in_ix];
+                    sh_part_elements[th_ix] = memory[in_ix + 1];
                }
                // prefix sum of counts
                for (uint i = 0; i < LG_N_PART_READ; i++) {
@ -175,7 +168,7 @@ void main() {
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef(element_ix * Annotated_size);
+            ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
            tag = Annotated_tag(ref);
        }

@ -189,7 +182,7 @@ void main() {
            // We have one "path" for each element, even if the element isn't
            // actually a path (currently EndClip, but images etc in the future).
            uint path_ix = element_ix;
-            Path path = Path_read(PathRef(path_ix * Path_size));
+            Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size));
            uint stride = path.bbox.z - path.bbox.x;
            sh_tile_stride[th_ix] = stride;
            int dx = int(path.bbox.x) - int(bin_tile_x);
@ -232,7 +225,7 @@ void main() {
                    el_ix = probe;
                }
            }
-            AnnotatedRef ref = AnnotatedRef(sh_elements[el_ix] * Annotated_size);
+            AnnotatedRef ref = AnnotatedRef(conf.anno_base + sh_elements[el_ix] * Annotated_size);
            uint tag = Annotated_tag(ref);
            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
            uint width = sh_tile_width[el_ix];
@ -281,7 +274,7 @@ void main() {
            // At this point, we read the element again from global memory.
            // If that turns out to be expensive, maybe we can pack it into
            // shared memory (or perhaps just the tag).
-            ref = AnnotatedRef(element_ix * Annotated_size);
+            ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
            tag = Annotated_tag(ref);

            if (clip_zero_depth == 0) {
@ -290,7 +283,9 @@ void main() {
                    Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoFill fill = Annotated_Fill_read(ref);
-                    alloc_cmd(cmd_ref, cmd_limit);
+                    if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                        break;
+                    }
                    if (tile.tile.offset != 0) {
                        CmdFill cmd_fill;
                        cmd_fill.tile_ref = tile.tile.offset;
@ -310,7 +305,9 @@ void main() {
                    } else if (tile.tile.offset == 0 && clip_depth < 32) {
                        clip_one_mask |= (1 << clip_depth);
                    } else {
-                        alloc_cmd(cmd_ref, cmd_limit);
+                        if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                            break;
+                        }
                        if (tile.tile.offset != 0) {
                            CmdBeginClip cmd_begin_clip;
                            cmd_begin_clip.tile_ref = tile.tile.offset;
@ -331,7 +328,9 @@ void main() {
                case Annotated_EndClip:
                    clip_depth--;
                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
-                        alloc_cmd(cmd_ref, cmd_limit);
+                        if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                            break;
+                        }
                        Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0));
                        cmd_ref.offset += Cmd_size;
                    }
@ -344,7 +343,9 @@ void main() {
                    cmd_stroke.tile_ref = tile.tile.offset;
                    cmd_stroke.half_width = 0.5 * stroke.linewidth;
                    cmd_stroke.rgba_color = stroke.rgba_color;
-                    alloc_cmd(cmd_ref, cmd_limit);
+                    if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                        break;
+                    }
                    Cmd_Stroke_write(cmd_ref, cmd_stroke);
                    cmd_ref.offset += Cmd_size;
                    break;
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@ -9,6 +9,9 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable

+#include "setup.h"
+#include "mem.h"
+
 #define N_ROWS 4
 #define WG_SIZE 32
 #define LG_WG_SIZE 5
@ -16,28 +19,22 @@

 layout(local_size_x = WG_SIZE, local_size_y = 1) in;

-layout(set = 0, binding = 0) readonly buffer SceneBuf {
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(set = 0, binding = 2) readonly buffer SceneBuf {
    uint[] scene;
 };

 // It would be better to use the Vulkan memory model than
 // "volatile" but shooting for compatibility here rather
 // than doing things right.
-layout(set = 0, binding = 1) volatile buffer StateBuf {
+layout(set = 0, binding = 3) volatile buffer StateBuf {
    uint part_counter;
    uint[] state;
 };

-// The annotated results are stored here.
-layout(set = 0, binding = 2) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-// Path segments are stored here.
-layout(set = 0, binding = 3) buffer PathSegBuf {
-    uint[] pathseg;
-};
-
 #include "scene.h"
 #include "state.h"
 #include "annotated.h"
@ -175,6 +172,10 @@ shared uint sh_part_ix;
 shared State sh_prefix;

 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
    State th_state[N_ROWS];
    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
@ -341,9 +342,9 @@ void main() {
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
-            PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+            PathSegRef path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
            uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            pathseg[path_out_ref.offset >> 2] = out_tag;
+            memory[path_out_ref.offset >> 2] = out_tag;
            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_FillQuad:
@ -365,9 +366,9 @@ void main() {
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
-            path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+            path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
            out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            pathseg[path_out_ref.offset >> 2] = out_tag;
+            memory[path_out_ref.offset >> 2] = out_tag;
            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_FillCubic:
@ -386,9 +387,9 @@ void main() {
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
-            path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+            path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
            out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            pathseg[path_out_ref.offset >> 2] = out_tag;
+            memory[path_out_ref.offset >> 2] = out_tag;
            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_Stroke:
@ -398,7 +399,7 @@ void main() {
            vec2 lw = get_linewidth(st);
            anno_stroke.bbox = st.bbox + vec4(-lw, lw);
            anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            AnnotatedRef out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
            Annotated_Stroke_write(out_ref, anno_stroke);
            break;
        case Element_Fill:
@ -406,7 +407,7 @@ void main() {
            AnnoFill anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            anno_fill.bbox = st.bbox;
-            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
            Annotated_Fill_write(out_ref, anno_fill);
            break;
        case Element_BeginClip:
@ -414,14 +415,14 @@ void main() {
            AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
            // This is the absolute bbox, it's been transformed during encoding.
            anno_begin_clip.bbox = begin_clip.bbox;
-            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
            Annotated_BeginClip_write(out_ref, anno_begin_clip);
            break;
        case Element_EndClip:
            Clip end_clip = Element_EndClip_read(this_ref);
            // This bbox is expected to be the same as the begin one.
            AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
-            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
            Annotated_EndClip_write(out_ref, anno_end_clip);
            break;
        }
--- a/piet-gpu/shader/elements.spv
+++ b/piet-gpu/shader/elements.spv
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -11,50 +11,42 @@
 #extension GL_EXT_nonuniform_qualifier : enable

 #include "setup.h"
+#include "mem.h"

 #define CHUNK 8
 #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK)
 layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in;

-// Same concern that this should be readonly as in kernel 3.
-layout(set = 0, binding = 0) buffer PtclBuf {
-    uint[] ptcl;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };

-layout(set = 0, binding = 1) buffer TileBuf {
-    uint[] tile;
-};
+layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;

-layout(set = 0, binding = 2) buffer ClipScratchBuf {
-    uint[] clip_scratch;
-};
-
-layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
-
-layout(set = 0, binding = 4) uniform sampler2D textures[];
+layout(set = 0, binding = 3) uniform sampler2D textures[];

 #include "ptcl.h"
 #include "tile.h"

 #define BLEND_STACK_SIZE 4

-// Layout of clip_scratch buffer:
-// [0] is the alloc bump offset (in units of 32 bit words, initially 0)
-// Starting at 1 is a sequence of frames.
+// Layout of a clip scratch frame:
 // Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.

+// Link offset and frame size in 32-bit words.
 #define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
 #define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)

-shared uint sh_clip_alloc;
+shared Alloc sh_clip_alloc;

-// Allocate a scratch buffer for clipping. Unlike offsets in the rest of the code,
-// it counts 32-bit words.
-uint alloc_clip_buf(uint link) {
+// Allocate a scratch buffer for clipping.
+Alloc alloc_clip_buf(uint link) {
    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
-        uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1;
+        Alloc alloc = malloc(CLIP_BUF_SIZE * 4);
+        if (!alloc.failed) {
+            memory[(alloc.offset >> 2) + CLIP_LINK_OFFSET] = link;
+        }
        sh_clip_alloc = alloc;
-        clip_scratch[alloc + CLIP_LINK_OFFSET] = link;
    }
    barrier();
    return sh_clip_alloc;
@ -95,8 +87,12 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
 }

 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
    uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
-    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(conf.ptcl_base + tile_ix * PTCL_INITIAL_ALLOC);

    uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
    vec2 xy = vec2(xy_uint);
@ -168,10 +164,14 @@ void main() {
            uint blend_slot = blend_sp % BLEND_STACK_SIZE;
            if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
                // spill to scratch buffer
-                clip_tos = alloc_clip_buf(clip_tos);
-                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                Alloc alloc = alloc_clip_buf(clip_tos);
+                if (alloc.failed) {
+                    return;
+                }
+                clip_tos = alloc.offset;
+                uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                for (uint k = 0; k < CHUNK; k++) {
-                    clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
+                    memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
                }
                blend_spill++;
            }
@ -194,11 +194,11 @@ void main() {
            CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
            blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
            if (blend_sp == blend_spill) {
-                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                for (uint k = 0; k < CHUNK; k++) {
-                    blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
+                    blend_stack[blend_slot][k] = memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
                }
-                clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET];
+                clip_tos = memory[(clip_tos >> 2) + CLIP_LINK_OFFSET];
                blend_spill--;
            }
            blend_sp--;
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
--- a/piet-gpu/shader/mem.h
+++ b/piet-gpu/shader/mem.h
@ -0,0 +1,29 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+layout(set = 0, binding = 0) buffer Memory {
+    // offset into memory of the next allocation, initialized by the user.
+    uint mem_offset;
+    bool mem_overflow;
+    uint[] memory;
+};
+
+// Alloc represents a memory allocation.
+struct Alloc {
+    // offset in bytes into memory.
+    uint offset;
+    // failed is true if the allocation overflowed memory.
+    bool failed;
+};
+
+// malloc allocates size bytes of memory.
+Alloc malloc(uint size) {
+    Alloc a;
+	// Round up to nearest 32-bit word.
+	size = (size + 3) & ~3;
+    a.offset = atomicAdd(mem_offset, size);
+    a.failed = a.offset + size > memory.length() * 4;
+    if (a.failed) {
+        mem_overflow = true;
+    }
+    return a;
+}
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@ -8,24 +8,15 @@
 #extension GL_GOOGLE_include_directive : enable

 #include "setup.h"
+#include "mem.h"

 #define LG_COARSE_WG 5
 #define COARSE_WG (1 << LG_COARSE_WG)

 layout(local_size_x = COARSE_WG, local_size_y = 1) in;

-layout(set = 0, binding = 0) buffer PathSegBuf {
-    uint[] pathseg;
-};
-
-layout(set = 0, binding = 1) buffer AllocBuf {
-    uint n_paths;
-    uint n_pathseg;
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };

 #include "pathseg.h"
@ -96,11 +87,15 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
 }

 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
    uint element_ix = gl_GlobalInvocationID.x;
-    PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
+    PathSegRef ref = PathSegRef(conf.pathseg_base + element_ix * PathSeg_size);

    uint tag = PathSeg_Nop;
-    if (element_ix < n_pathseg) {
+    if (element_ix < conf.n_pathseg) {
        tag = PathSeg_tag(ref);
    }
    switch (tag) {
@ -128,7 +123,7 @@ void main() {
        uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);

        uint path_ix = cubic.path_ix;
-        Path path = Path_read(PathRef(path_ix * Path_size));
+        Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size));
        ivec4 bbox = ivec4(path.bbox);
        vec2 p0 = cubic.p0;
        qp0 = cubic.p0;
@ -187,7 +182,12 @@ void main() {
                // TODO: can be tighter, use c to bound width
                uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                // Consider using subgroups to aggregate atomic add.
-                uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
+                Alloc tile_alloc = malloc(n_tile_alloc * TileSeg_size);
+                if (tile_alloc.failed) {
+                    return;
+                }
+                uint tile_offset = tile_alloc.offset;
+
                TileSeg tile_seg;

                int xray = int(floor(p0.x*SX));
@ -204,7 +204,7 @@ void main() {
                        int backdrop = p1.y < p0.y ? 1 : -1;
                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                        uint tile_el = tile_ref.offset >> 2;
-                        atomicAdd(tile[tile_el + 1], backdrop);
+                        atomicAdd(memory[tile_el + 1], backdrop);
                    }

                    // next_xray is the xray for the next scanline; the line segment intersects
@ -227,7 +227,7 @@ void main() {
                        float tile_x0 = float(x * TILE_WIDTH_PX);
                        TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
                        uint tile_el = tile_ref.offset >> 2;
-                        uint old = atomicExchange(tile[tile_el], tile_offset);
+                        uint old = atomicExchange(memory[tile_el], tile_offset);
                        tile_seg.origin = p0;
                        tile_seg.vector = p1 - p0;
                        float y_edge = 0.0;
--- a/piet-gpu/shader/path_coarse.spv
+++ b/piet-gpu/shader/path_coarse.spv
--- a/piet-gpu/shader/pathseg.h
+++ b/piet-gpu/shader/pathseg.h
@ -89,11 +89,11 @@ PathSegRef PathSeg_index(PathSegRef ref, uint index) {

 PathFillLine PathFillLine_read(PathFillLineRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
    PathFillLine s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@ -103,22 +103,22 @@ PathFillLine PathFillLine_read(PathFillLineRef ref) {

 void PathFillLine_write(PathFillLineRef ref, PathFillLine s) {
    uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = s.path_ix;
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = s.path_ix;
 }

 PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
-    uint raw5 = pathseg[ix + 5];
-    uint raw6 = pathseg[ix + 6];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
+    uint raw6 = memory[ix + 6];
    PathStrokeLine s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@ -129,26 +129,26 @@ PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {

 void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) {
    uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = s.path_ix;
-    pathseg[ix + 5] = floatBitsToUint(s.stroke.x);
-    pathseg[ix + 6] = floatBitsToUint(s.stroke.y);
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = s.path_ix;
+    memory[ix + 5] = floatBitsToUint(s.stroke.x);
+    memory[ix + 6] = floatBitsToUint(s.stroke.y);
 }

 PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
-    uint raw5 = pathseg[ix + 5];
-    uint raw6 = pathseg[ix + 6];
-    uint raw7 = pathseg[ix + 7];
-    uint raw8 = pathseg[ix + 8];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
+    uint raw6 = memory[ix + 6];
+    uint raw7 = memory[ix + 7];
+    uint raw8 = memory[ix + 8];
    PathFillCubic s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@ -160,30 +160,30 @@ PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {

 void PathFillCubic_write(PathFillCubicRef ref, PathFillCubic s) {
    uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = floatBitsToUint(s.p2.x);
-    pathseg[ix + 5] = floatBitsToUint(s.p2.y);
-    pathseg[ix + 6] = floatBitsToUint(s.p3.x);
-    pathseg[ix + 7] = floatBitsToUint(s.p3.y);
-    pathseg[ix + 8] = s.path_ix;
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = floatBitsToUint(s.p2.x);
+    memory[ix + 5] = floatBitsToUint(s.p2.y);
+    memory[ix + 6] = floatBitsToUint(s.p3.x);
+    memory[ix + 7] = floatBitsToUint(s.p3.y);
+    memory[ix + 8] = s.path_ix;
 }

 PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
-    uint raw5 = pathseg[ix + 5];
-    uint raw6 = pathseg[ix + 6];
-    uint raw7 = pathseg[ix + 7];
-    uint raw8 = pathseg[ix + 8];
-    uint raw9 = pathseg[ix + 9];
-    uint raw10 = pathseg[ix + 10];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
+    uint raw6 = memory[ix + 6];
+    uint raw7 = memory[ix + 7];
+    uint raw8 = memory[ix + 8];
+    uint raw9 = memory[ix + 9];
+    uint raw10 = memory[ix + 10];
    PathStrokeCubic s;
    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@ -196,21 +196,21 @@ PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {

 void PathStrokeCubic_write(PathStrokeCubicRef ref, PathStrokeCubic s) {
    uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = floatBitsToUint(s.p2.x);
-    pathseg[ix + 5] = floatBitsToUint(s.p2.y);
-    pathseg[ix + 6] = floatBitsToUint(s.p3.x);
-    pathseg[ix + 7] = floatBitsToUint(s.p3.y);
-    pathseg[ix + 8] = s.path_ix;
-    pathseg[ix + 9] = floatBitsToUint(s.stroke.x);
-    pathseg[ix + 10] = floatBitsToUint(s.stroke.y);
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = floatBitsToUint(s.p2.x);
+    memory[ix + 5] = floatBitsToUint(s.p2.y);
+    memory[ix + 6] = floatBitsToUint(s.p3.x);
+    memory[ix + 7] = floatBitsToUint(s.p3.y);
+    memory[ix + 8] = s.path_ix;
+    memory[ix + 9] = floatBitsToUint(s.stroke.x);
+    memory[ix + 10] = floatBitsToUint(s.stroke.y);
 }

 uint PathSeg_tag(PathSegRef ref) {
-    return pathseg[ref.offset >> 2];
+    return memory[ref.offset >> 2];
 }

 PathFillLine PathSeg_FillLine_read(PathSegRef ref) {
@ -230,26 +230,26 @@ PathStrokeCubic PathSeg_StrokeCubic_read(PathSegRef ref) {
 }

 void PathSeg_Nop_write(PathSegRef ref) {
-    pathseg[ref.offset >> 2] = PathSeg_Nop;
+    memory[ref.offset >> 2] = PathSeg_Nop;
 }

 void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) {
-    pathseg[ref.offset >> 2] = PathSeg_FillLine;
+    memory[ref.offset >> 2] = PathSeg_FillLine;
    PathFillLine_write(PathFillLineRef(ref.offset + 4), s);
 }

 void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) {
-    pathseg[ref.offset >> 2] = PathSeg_StrokeLine;
+    memory[ref.offset >> 2] = PathSeg_StrokeLine;
    PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s);
 }

 void PathSeg_FillCubic_write(PathSegRef ref, PathFillCubic s) {
-    pathseg[ref.offset >> 2] = PathSeg_FillCubic;
+    memory[ref.offset >> 2] = PathSeg_FillCubic;
    PathFillCubic_write(PathFillCubicRef(ref.offset + 4), s);
 }

 void PathSeg_StrokeCubic_write(PathSegRef ref, PathStrokeCubic s) {
-    pathseg[ref.offset >> 2] = PathSeg_StrokeCubic;
+    memory[ref.offset >> 2] = PathSeg_StrokeCubic;
    PathStrokeCubic_write(PathStrokeCubicRef(ref.offset + 4), s);
 }

--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@ -173,10 +173,10 @@ CmdRef Cmd_index(CmdRef ref, uint index) {

 CmdCircle CmdCircle_read(CmdCircleRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
-    uint raw3 = ptcl[ix + 3];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
    CmdCircle s;
    s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.radius = uintBitsToFloat(raw2);
@ -186,18 +186,18 @@ CmdCircle CmdCircle_read(CmdCircleRef ref) {

 void CmdCircle_write(CmdCircleRef ref, CmdCircle s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.center.x);
-    ptcl[ix + 1] = floatBitsToUint(s.center.y);
-    ptcl[ix + 2] = floatBitsToUint(s.radius);
-    ptcl[ix + 3] = s.rgba_color;
+    memory[ix + 0] = floatBitsToUint(s.center.x);
+    memory[ix + 1] = floatBitsToUint(s.center.y);
+    memory[ix + 2] = floatBitsToUint(s.radius);
+    memory[ix + 3] = s.rgba_color;
 }

 CmdLine CmdLine_read(CmdLineRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
-    uint raw3 = ptcl[ix + 3];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
    CmdLine s;
    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@ -206,17 +206,17 @@ CmdLine CmdLine_read(CmdLineRef ref) {

 void CmdLine_write(CmdLineRef ref, CmdLine s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.start.x);
-    ptcl[ix + 1] = floatBitsToUint(s.start.y);
-    ptcl[ix + 2] = floatBitsToUint(s.end.x);
-    ptcl[ix + 3] = floatBitsToUint(s.end.y);
+    memory[ix + 0] = floatBitsToUint(s.start.x);
+    memory[ix + 1] = floatBitsToUint(s.start.y);
+    memory[ix + 2] = floatBitsToUint(s.end.x);
+    memory[ix + 3] = floatBitsToUint(s.end.y);
 }

 CmdStroke CmdStroke_read(CmdStrokeRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
    CmdStroke s;
    s.tile_ref = raw0;
    s.half_width = uintBitsToFloat(raw1);
@ -226,16 +226,16 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {

 void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.tile_ref;
-    ptcl[ix + 1] = floatBitsToUint(s.half_width);
-    ptcl[ix + 2] = s.rgba_color;
+    memory[ix + 0] = s.tile_ref;
+    memory[ix + 1] = floatBitsToUint(s.half_width);
+    memory[ix + 2] = s.rgba_color;
 }

 CmdFill CmdFill_read(CmdFillRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
    CmdFill s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
@ -245,15 +245,15 @@ CmdFill CmdFill_read(CmdFillRef ref) {

 void CmdFill_write(CmdFillRef ref, CmdFill s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.tile_ref;
-    ptcl[ix + 1] = uint(s.backdrop);
-    ptcl[ix + 2] = s.rgba_color;
+    memory[ix + 0] = s.tile_ref;
+    memory[ix + 1] = uint(s.backdrop);
+    memory[ix + 2] = s.rgba_color;
 }

 CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
    CmdBeginClip s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
@ -262,13 +262,13 @@ CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) {

 void CmdBeginClip_write(CmdBeginClipRef ref, CmdBeginClip s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.tile_ref;
-    ptcl[ix + 1] = uint(s.backdrop);
+    memory[ix + 0] = s.tile_ref;
+    memory[ix + 1] = uint(s.backdrop);
 }

 CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
    CmdBeginSolidClip s;
    s.alpha = uintBitsToFloat(raw0);
    return s;
@ -276,12 +276,12 @@ CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) {

 void CmdBeginSolidClip_write(CmdBeginSolidClipRef ref, CmdBeginSolidClip s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.alpha);
+    memory[ix + 0] = floatBitsToUint(s.alpha);
 }

 CmdEndClip CmdEndClip_read(CmdEndClipRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
    CmdEndClip s;
    s.alpha = uintBitsToFloat(raw0);
    return s;
@ -289,12 +289,12 @@ CmdEndClip CmdEndClip_read(CmdEndClipRef ref) {

 void CmdEndClip_write(CmdEndClipRef ref, CmdEndClip s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.alpha);
+    memory[ix + 0] = floatBitsToUint(s.alpha);
 }

 CmdSolid CmdSolid_read(CmdSolidRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
    CmdSolid s;
    s.rgba_color = raw0;
    return s;
@ -302,12 +302,12 @@ CmdSolid CmdSolid_read(CmdSolidRef ref) {

 void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.rgba_color;
+    memory[ix + 0] = s.rgba_color;
 }

 CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
    CmdSolidMask s;
    s.mask = uintBitsToFloat(raw0);
    return s;
@ -315,12 +315,12 @@ CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) {

 void CmdSolidMask_write(CmdSolidMaskRef ref, CmdSolidMask s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.mask);
+    memory[ix + 0] = floatBitsToUint(s.mask);
 }

 CmdJump CmdJump_read(CmdJumpRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
    CmdJump s;
    s.new_ref = raw0;
    return s;
@ -328,11 +328,11 @@ CmdJump CmdJump_read(CmdJumpRef ref) {

 void CmdJump_write(CmdJumpRef ref, CmdJump s) {
    uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.new_ref;
+    memory[ix + 0] = s.new_ref;
 }

 uint Cmd_tag(CmdRef ref) {
-    return ptcl[ref.offset >> 2];
+    return memory[ref.offset >> 2];
 }

 CmdCircle Cmd_Circle_read(CmdRef ref) {
@ -376,56 +376,56 @@ CmdJump Cmd_Jump_read(CmdRef ref) {
 }

 void Cmd_End_write(CmdRef ref) {
-    ptcl[ref.offset >> 2] = Cmd_End;
+    memory[ref.offset >> 2] = Cmd_End;
 }

 void Cmd_Circle_write(CmdRef ref, CmdCircle s) {
-    ptcl[ref.offset >> 2] = Cmd_Circle;
+    memory[ref.offset >> 2] = Cmd_Circle;
    CmdCircle_write(CmdCircleRef(ref.offset + 4), s);
 }

 void Cmd_Line_write(CmdRef ref, CmdLine s) {
-    ptcl[ref.offset >> 2] = Cmd_Line;
+    memory[ref.offset >> 2] = Cmd_Line;
    CmdLine_write(CmdLineRef(ref.offset + 4), s);
 }

 void Cmd_Fill_write(CmdRef ref, CmdFill s) {
-    ptcl[ref.offset >> 2] = Cmd_Fill;
+    memory[ref.offset >> 2] = Cmd_Fill;
    CmdFill_write(CmdFillRef(ref.offset + 4), s);
 }

 void Cmd_BeginClip_write(CmdRef ref, CmdBeginClip s) {
-    ptcl[ref.offset >> 2] = Cmd_BeginClip;
+    memory[ref.offset >> 2] = Cmd_BeginClip;
    CmdBeginClip_write(CmdBeginClipRef(ref.offset + 4), s);
 }

 void Cmd_BeginSolidClip_write(CmdRef ref, CmdBeginSolidClip s) {
-    ptcl[ref.offset >> 2] = Cmd_BeginSolidClip;
+    memory[ref.offset >> 2] = Cmd_BeginSolidClip;
    CmdBeginSolidClip_write(CmdBeginSolidClipRef(ref.offset + 4), s);
 }

 void Cmd_EndClip_write(CmdRef ref, CmdEndClip s) {
-    ptcl[ref.offset >> 2] = Cmd_EndClip;
+    memory[ref.offset >> 2] = Cmd_EndClip;
    CmdEndClip_write(CmdEndClipRef(ref.offset + 4), s);
 }

 void Cmd_Stroke_write(CmdRef ref, CmdStroke s) {
-    ptcl[ref.offset >> 2] = Cmd_Stroke;
+    memory[ref.offset >> 2] = Cmd_Stroke;
    CmdStroke_write(CmdStrokeRef(ref.offset + 4), s);
 }

 void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
-    ptcl[ref.offset >> 2] = Cmd_Solid;
+    memory[ref.offset >> 2] = Cmd_Solid;
    CmdSolid_write(CmdSolidRef(ref.offset + 4), s);
 }

 void Cmd_SolidMask_write(CmdRef ref, CmdSolidMask s) {
-    ptcl[ref.offset >> 2] = Cmd_SolidMask;
+    memory[ref.offset >> 2] = Cmd_SolidMask;
    CmdSolidMask_write(CmdSolidMaskRef(ref.offset + 4), s);
 }

 void Cmd_Jump_write(CmdRef ref, CmdJump s) {
-    ptcl[ref.offset >> 2] = Cmd_Jump;
+    memory[ref.offset >> 2] = Cmd_Jump;
    CmdJump_write(CmdJumpRef(ref.offset + 4), s);
 }

--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@ -28,3 +28,13 @@
 #define N_TILE (N_TILE_X * N_TILE_Y)
 #define LG_N_TILE (7 + LG_WG_FACTOR)
 #define N_SLICE (N_TILE / 32)
+
+struct Config { // scene setup plus base offsets of the static allocations, read through ConfigBuf
+    uint n_elements; // number of paths
+    uint n_pathseg; // number of path segments
+    uint tile_base; // byte offset in memory of the Path array (stride Path_size)
+    uint bin_base; // NOTE(review): presumably the byte offset of the binning data — confirm against binning.comp
+    uint ptcl_base; // NOTE(review): presumably the byte offset of the per-tile command lists — confirm against coarse.comp
+    uint pathseg_base; // byte offset in memory of the PathSeg array (stride PathSeg_size)
+    uint anno_base; // byte offset in memory of the Annotated array (stride Annotated_size)
+};
--- a/piet-gpu/shader/tile.h
+++ b/piet-gpu/shader/tile.h
@ -51,9 +51,9 @@ TileSegRef TileSeg_index(TileSegRef ref, uint index) {

 Path Path_read(PathRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = tile[ix + 0];
-    uint raw1 = tile[ix + 1];
-    uint raw2 = tile[ix + 2];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
    Path s;
    s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
    s.tiles = TileRef(raw2);
@ -62,15 +62,15 @@ Path Path_read(PathRef ref) {

 void Path_write(PathRef ref, Path s) {
    uint ix = ref.offset >> 2;
-    tile[ix + 0] = s.bbox.x | (s.bbox.y << 16);
-    tile[ix + 1] = s.bbox.z | (s.bbox.w << 16);
-    tile[ix + 2] = s.tiles.offset;
+    memory[ix + 0] = s.bbox.x | (s.bbox.y << 16);
+    memory[ix + 1] = s.bbox.z | (s.bbox.w << 16);
+    memory[ix + 2] = s.tiles.offset;
 }

 Tile Tile_read(TileRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = tile[ix + 0];
-    uint raw1 = tile[ix + 1];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
    Tile s;
    s.tile = TileSegRef(raw0);
    s.backdrop = int(raw1);
@ -79,18 +79,18 @@ Tile Tile_read(TileRef ref) {

 void Tile_write(TileRef ref, Tile s) {
    uint ix = ref.offset >> 2;
-    tile[ix + 0] = s.tile.offset;
-    tile[ix + 1] = uint(s.backdrop);
+    memory[ix + 0] = s.tile.offset;
+    memory[ix + 1] = uint(s.backdrop);
 }

 TileSeg TileSeg_read(TileSegRef ref) {
    uint ix = ref.offset >> 2;
-    uint raw0 = tile[ix + 0];
-    uint raw1 = tile[ix + 1];
-    uint raw2 = tile[ix + 2];
-    uint raw3 = tile[ix + 3];
-    uint raw4 = tile[ix + 4];
-    uint raw5 = tile[ix + 5];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
    TileSeg s;
    s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
    s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@ -101,11 +101,11 @@ TileSeg TileSeg_read(TileSegRef ref) {

 void TileSeg_write(TileSegRef ref, TileSeg s) {
    uint ix = ref.offset >> 2;
-    tile[ix + 0] = floatBitsToUint(s.origin.x);
-    tile[ix + 1] = floatBitsToUint(s.origin.y);
-    tile[ix + 2] = floatBitsToUint(s.vector.x);
-    tile[ix + 3] = floatBitsToUint(s.vector.y);
-    tile[ix + 4] = floatBitsToUint(s.y_edge);
-    tile[ix + 5] = s.next.offset;
+    memory[ix + 0] = floatBitsToUint(s.origin.x);
+    memory[ix + 1] = floatBitsToUint(s.origin.y);
+    memory[ix + 2] = floatBitsToUint(s.vector.x);
+    memory[ix + 3] = floatBitsToUint(s.vector.y);
+    memory[ix + 4] = floatBitsToUint(s.y_edge);
+    memory[ix + 5] = s.next.offset;
 }

--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@ -6,24 +6,15 @@
 #extension GL_GOOGLE_include_directive : enable

 #include "setup.h"
+#include "mem.h"

 #define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
 #define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)

 layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;

-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-layout(set = 0, binding = 1) buffer AllocBuf {
-    uint n_elements;
-    uint n_pathseg;
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };

 #include "annotated.h"
@ -34,16 +25,20 @@ layout(set = 0, binding = 2) buffer TileBuf {
 #define SY (1.0 / float(TILE_HEIGHT_PX))

 shared uint sh_tile_count[TILE_ALLOC_WG];
-shared uint sh_tile_alloc;
+shared Alloc sh_tile_alloc;

 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef(element_ix * Path_size);
-    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
+    PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);

    uint tag = Annotated_Nop;
-    if (element_ix < n_elements) {
+    if (element_ix < conf.n_elements) {
        tag = Annotated_tag(ref);
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
@ -86,23 +81,26 @@ void main() {
        sh_tile_count[th_ix] = tile_count;
    }
    if (th_ix == TILE_ALLOC_WG - 1) {
-        sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
+        sh_tile_alloc = malloc(tile_count * Tile_size);
    }
    barrier();
-    uint alloc_start = sh_tile_alloc;
+    Alloc alloc_start = sh_tile_alloc;
+    if (alloc_start.failed) {
+        return;
+    }

-    if (element_ix < n_elements) {
+    if (element_ix < conf.n_elements) {
        uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
-        path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
+        path.tiles = TileRef(alloc_start.offset + Tile_size * tile_subix);
        Path_write(path_ref, path);
    }

    // Zero out allocated tiles efficiently
    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
-    uint start_ix = alloc_start >> 2;
+    uint start_ix = alloc_start.offset >> 2;
    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
        // Note: this interleaving is faster than using Tile_write
        // by a significant amount.
-        tile[start_ix + i] = 0;
+        memory[start_ix + i] = 0;
    }
 }
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/tile_alloc.spv
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -156,15 +156,16 @@ pub fn dump_k1_data(k1_buf: &[u32]) {
 pub struct Renderer {
    pub image_dev: hub::Image, // resulting image

-    scene_buf: hub::Buffer,
-    scene_dev: hub::Buffer,
+    scene_buf_host: hub::Buffer,
+    scene_buf_dev: hub::Buffer,

-    pub state_buf: hub::Buffer,
-    pub anno_buf: hub::Buffer,
-    pub pathseg_buf: hub::Buffer,
-    pub tile_buf: hub::Buffer,
-    pub bin_buf: hub::Buffer,
-    pub ptcl_buf: hub::Buffer,
+    memory_buf_host: hub::Buffer,
+    memory_buf_dev: hub::Buffer,
+
+    state_buf: hub::Buffer,
+
+    config_buf_host: hub::Buffer,
+    config_buf_dev: hub::Buffer,

    el_pipeline: hub::Pipeline,
    el_ds: hub::DescriptorSet,
@ -178,23 +179,12 @@ pub struct Renderer {
    backdrop_pipeline: hub::Pipeline,
    backdrop_ds: hub::DescriptorSet,

-    tile_alloc_buf_host: hub::Buffer,
-    tile_alloc_buf_dev: hub::Buffer,
-
    bin_pipeline: hub::Pipeline,
    bin_ds: hub::DescriptorSet,

-    bin_alloc_buf_host: hub::Buffer,
-    bin_alloc_buf_dev: hub::Buffer,
-
    coarse_pipeline: hub::Pipeline,
    coarse_ds: hub::DescriptorSet,

-    coarse_alloc_buf_host: hub::Buffer,
-    coarse_alloc_buf_dev: hub::Buffer,
-
-    clip_scratch_buf: hub::Buffer,
-
    k4_pipeline: hub::Pipeline,
    k4_ds: hub::DescriptorSet,

@ -221,88 +211,83 @@ impl Renderer {
            n_elements, n_paths, n_pathseg
        );

-        let mut scene_buf = session
+        let mut scene_buf_host = session
            .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
            .unwrap();
-        let scene_dev = session
+        let scene_buf_dev = session
            .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
            .unwrap();
-        scene_buf.write(&scene)?;
+        scene_buf_host.write(&scene)?;

        let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?;
-        let anno_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let pathseg_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let tile_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let bin_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let ptcl_buf = session.create_buffer(48 * 1024 * 1024, dev)?;
        let image_dev = session.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;

+        let mut config_buf_host = session.create_buffer(7*4, host)?;
+        let config_buf_dev = session.create_buffer(7*4, dev)?;
+
+        // TODO: constants
+        const PATH_SIZE: usize = 12;
+        const BIN_SIZE: usize = 8;
+        const PATHSEG_SIZE: usize = 48;
+        const ANNO_SIZE: usize = 28;
+        let mut alloc = 0;
+        let tile_base = alloc;
+        alloc += ((n_paths + 3) & !3) * PATH_SIZE;
+        let bin_base = alloc;
+        alloc += ((n_paths + 255) & !255) * BIN_SIZE;
+        let ptcl_base = alloc;
+        alloc += WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
+        let pathseg_base = alloc;
+        alloc += (n_pathseg * PATHSEG_SIZE + 3) & !3;
+        let anno_base = alloc;
+        alloc += (n_paths * ANNO_SIZE + 3) & !3;
+        config_buf_host.write(&[n_paths as u32, n_pathseg as u32, tile_base as u32, bin_base as u32, ptcl_base as u32, pathseg_base as u32, anno_base as u32])?;
+
+        let mut memory_buf_host = session.create_buffer(2*4, host)?;
+        let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?;
+        memory_buf_host.write(&[alloc as u32, 0 /* Overflow flag */])?;
+
        let el_code = include_bytes!("../shader/elements.spv");
        let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?;
        let el_ds = session.create_simple_descriptor_set(
            &el_pipeline,
-            &[&scene_dev, &state_buf, &anno_buf, &pathseg_buf],
+            &[&memory_buf_dev, &config_buf_dev, &scene_buf_dev, &state_buf],
        )?;

-        let mut tile_alloc_buf_host = session.create_buffer(12, host)?;
-        let tile_alloc_buf_dev = session.create_buffer(12, dev)?;
-
-        // TODO: constants
-        const PATH_SIZE: usize = 12;
-        let tile_alloc_start = ((n_paths + 31) & !31) * PATH_SIZE;
-        tile_alloc_buf_host.write(&[n_paths as u32, n_pathseg as u32, tile_alloc_start as u32])?;
        let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv");
-        let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 3)?;
+        let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?;
        let tile_ds = session.create_simple_descriptor_set(
            &tile_pipeline,
-            &[&anno_buf, &tile_alloc_buf_dev, &tile_buf],
+            &[&memory_buf_dev, &config_buf_dev],
        )?;

        let path_alloc_code = include_bytes!("../shader/path_coarse.spv");
-        let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 3)?;
+        let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?;
        let path_ds = session.create_simple_descriptor_set(
            &path_pipeline,
-            &[&pathseg_buf, &tile_alloc_buf_dev, &tile_buf],
+            &[&memory_buf_dev, &config_buf_dev],
        )?;

        let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv");
-        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 3)?;
+        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
        let backdrop_ds = session.create_simple_descriptor_set(
            &backdrop_pipeline,
-            &[&anno_buf, &tile_alloc_buf_dev, &tile_buf],
+            &[&memory_buf_dev, &config_buf_dev],
        )?;

-        let mut bin_alloc_buf_host = session.create_buffer(8, host)?;
-        let bin_alloc_buf_dev = session.create_buffer(8, dev)?;
-
        // TODO: constants
-        let bin_alloc_start = ((n_paths + 255) & !255) * 8;
-        bin_alloc_buf_host.write(&[n_paths as u32, bin_alloc_start as u32])?;
        let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 3)?;
+        let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?;
        let bin_ds = session.create_simple_descriptor_set(
            &bin_pipeline,
-            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&memory_buf_dev, &config_buf_dev],
        )?;

-        let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?;
-
-        let mut coarse_alloc_buf_host = session.create_buffer(8, host)?;
-        let coarse_alloc_buf_dev = session.create_buffer(8, dev)?;
-
-        let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
-        coarse_alloc_buf_host.write(&[n_paths as u32, coarse_alloc_start as u32])?;
        let coarse_code = include_bytes!("../shader/coarse.spv");
-        let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 5)?;
+        let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?;
        let coarse_ds = session.create_simple_descriptor_set(
            &coarse_pipeline,
-            &[
-                &anno_buf,
-                &bin_buf,
-                &tile_buf,
-                &coarse_alloc_buf_dev,
-                &ptcl_buf,
-            ],
+            &[&memory_buf_dev, &config_buf_dev],
        )?;

        let bg_image = Self::make_test_bg_image(&session);
@ -318,20 +303,25 @@ impl Renderer {
        let sampler = session.create_sampler(SamplerParams::Linear)?;
        let k4_pipeline = session
            .pipeline_builder()
-            .add_buffers(3)
+            .add_buffers(2)
            .add_images(1)
            .add_textures(max_textures)
            .create_compute_pipeline(&session, k4_code)?;
        let k4_ds = session
            .descriptor_set_builder()
-            .add_buffers(&[&ptcl_buf, &tile_buf, &clip_scratch_buf])
+            .add_buffers(&[&memory_buf_dev, &config_buf_dev])
            .add_images(&[&image_dev])
            .add_textures(&[&bg_image], &sampler)
            .build(&session, &k4_pipeline)?;

        Ok(Renderer {
-            scene_buf,
-            scene_dev,
+            scene_buf_host,
+            scene_buf_dev,
+            memory_buf_host,
+            memory_buf_dev,
+            state_buf,
+            config_buf_host,
+            config_buf_dev,
            image_dev,
            el_pipeline,
            el_ds,
@ -347,19 +337,6 @@ impl Renderer {
            coarse_ds,
            k4_pipeline,
            k4_ds,
-            state_buf,
-            anno_buf,
-            pathseg_buf,
-            tile_buf,
-            bin_buf,
-            ptcl_buf,
-            tile_alloc_buf_host,
-            tile_alloc_buf_dev,
-            bin_alloc_buf_host,
-            bin_alloc_buf_dev,
-            coarse_alloc_buf_host,
-            coarse_alloc_buf_dev,
-            clip_scratch_buf,
            n_elements,
            n_paths,
            n_pathseg,
@ -368,21 +345,16 @@ impl Renderer {
    }

    pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) {
-        cmd_buf.copy_buffer(self.scene_buf.vk_buffer(), self.scene_dev.vk_buffer());
+        cmd_buf.copy_buffer(self.scene_buf_host.vk_buffer(), self.scene_buf_dev.vk_buffer());
        cmd_buf.copy_buffer(
-            self.tile_alloc_buf_host.vk_buffer(),
-            self.tile_alloc_buf_dev.vk_buffer(),
+            self.config_buf_host.vk_buffer(),
+            self.config_buf_dev.vk_buffer(),
        );
        cmd_buf.copy_buffer(
-            self.bin_alloc_buf_host.vk_buffer(),
-            self.bin_alloc_buf_dev.vk_buffer(),
-        );
-        cmd_buf.copy_buffer(
-            self.coarse_alloc_buf_host.vk_buffer(),
-            self.coarse_alloc_buf_dev.vk_buffer(),
+            self.memory_buf_host.vk_buffer(),
+            self.memory_buf_dev.vk_buffer(),
        );
        cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None);
-        cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4));
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(
            self.image_dev.vk_image(),