diff --git a/piet-gpu-derive/src/glsl.rs b/piet-gpu-derive/src/glsl.rs
index 2409637..ce862b0 100644
--- a/piet-gpu-derive/src/glsl.rs
+++ b/piet-gpu-derive/src/glsl.rs
@@ -31,22 +31,18 @@ pub fn gen_glsl(module: &LayoutModule) -> String {
 
     for name in &module.def_names {
         let def = module.defs.get(name).unwrap();
-        let mem = &"memory".to_owned();
-        let mut buf_name = &module.name;
-        if !module.name.eq(&"state") && !module.name.eq(&"scene") {
-            buf_name = mem;
-        }
+        let is_mem = !module.name.eq(&"state") && !module.name.eq(&"scene");
         match def {
             (_size, LayoutTypeDef::Struct(fields)) => {
-                gen_struct_read(&mut r, buf_name, &name, fields);
+                gen_struct_read(&mut r, &module.name, &name, is_mem, fields);
                 if module.gpu_write {
-                    gen_struct_write(&mut r, buf_name, &name, fields);
+                    gen_struct_write(&mut r, &module.name, &name, is_mem, fields);
                 }
             }
             (_size, LayoutTypeDef::Enum(en)) => {
-                gen_enum_read(&mut r, buf_name, &name, en);
+                gen_enum_read(&mut r, &module.name, &name, is_mem, en);
                 if module.gpu_write {
-                    gen_enum_write(&mut r, buf_name, &name, en);
+                    gen_enum_write(&mut r, &module.name, &name, is_mem, en);
                 }
             }
         }
@@ -96,14 +92,23 @@ fn gen_struct_read(
     r: &mut String,
     bufname: &str,
     name: &str,
+    is_mem: bool,
     fields: &[(String, usize, LayoutType)],
 ) {
-    writeln!(r, "{} {}_read({}Ref ref) {{", name, name, name).unwrap();
+    write!(r, "{} {}_read(", name, name).unwrap();
+    if is_mem {
+        write!(r, "Alloc a, ").unwrap();
+    }
+    writeln!(r, "{}Ref ref) {{", name).unwrap();
     writeln!(r, "    uint ix = ref.offset >> 2;").unwrap();
     let coverage = crate::layout::struct_coverage(fields, false);
     for (i, fields) in coverage.iter().enumerate() {
         if !fields.is_empty() {
-            writeln!(r, "    uint raw{} = {}[ix + {}];", i, bufname, i).unwrap();
+            if is_mem {
+                writeln!(r, "    uint raw{} = read_mem(a, ix + {});", i, i).unwrap();
+            } else {
+                writeln!(r, "    uint raw{} = {}[ix + {}];", i, bufname, i).unwrap();
+            }
         }
     }
     writeln!(r, "    {} s;", name).unwrap();
@@ -130,26 +135,47 @@ fn gen_enum_read(
     r: &mut String,
     bufname: &str,
     name: &str,
+    is_mem: bool,
     variants: &[(String, Vec<(usize, LayoutType)>)],
 ) {
-    writeln!(r, "uint {}_tag({}Ref ref) {{", name, name).unwrap();
-    writeln!(r, "    return {}[ref.offset >> 2];", bufname).unwrap();
+    if is_mem {
+        writeln!(r, "uint {}_tag(Alloc a, {}Ref ref) {{", name, name).unwrap();
+        writeln!(r, "    return read_mem(a, ref.offset >> 2);").unwrap();
+    } else {
+        writeln!(r, "uint {}_tag({}Ref ref) {{", name, name).unwrap();
+        writeln!(r, "    return {}[ref.offset >> 2];", bufname).unwrap();
+    }
     writeln!(r, "}}\n").unwrap();
     for (var_name, payload) in variants {
         if payload.len() == 1 {
             if let GpuType::InlineStruct(structname) = &payload[0].1.ty {
-                writeln!(
-                    r,
-                    "{} {}_{}_read({}Ref ref) {{",
-                    structname, name, var_name, name
-                )
-                .unwrap();
-                writeln!(
-                    r,
-                    "    return {}_read({}Ref(ref.offset + {}));",
-                    structname, structname, payload[0].0
-                )
-                .unwrap();
+                if is_mem {
+                    writeln!(
+                        r,
+                        "{} {}_{}_read(Alloc a, {}Ref ref) {{",
+                        structname, name, var_name, name
+                    )
+                    .unwrap();
+                    writeln!(
+                        r,
+                        "    return {}_read(a, {}Ref(ref.offset + {}));",
+                        structname, structname, payload[0].0
+                    )
+                    .unwrap();
+                } else {
+                    writeln!(
+                        r,
+                        "{} {}_{}_read({}Ref ref) {{",
+                        structname, name, var_name, name
+                    )
+                    .unwrap();
+                    writeln!(
+                        r,
+                        "    return {}_read({}Ref(ref.offset + {}));",
+                        structname, structname, payload[0].0
+                    )
+                    .unwrap();
+                }
                 writeln!(r, "}}\n").unwrap();
             }
         }
@@ -303,9 +329,14 @@ fn gen_struct_write(
     r: &mut String,
     bufname: &str,
     name: &str,
+    is_mem: bool,
     fields: &[(String, usize, LayoutType)],
 ) {
-    writeln!(r, "void {}_write({}Ref ref, {} s) {{", name, name, name).unwrap();
+    write!(r, "void {}_write(", name).unwrap();
+    if is_mem {
+        write!(r, "Alloc a, ").unwrap();
+    }
+    writeln!(r, "{}Ref ref, {} s) {{", name, name).unwrap();
     writeln!(r, "    uint ix = ref.offset >> 2;").unwrap();
     let coverage = crate::layout::struct_coverage(fields, true);
 
@@ -381,13 +412,20 @@ fn gen_struct_write(
         }
 
         if !pieces.is_empty() {
-            write!(r, "    {}[ix + {}] = ", bufname, i).unwrap();
+            if is_mem {
+                write!(r, "    write_mem(a, ix + {}, ", i).unwrap();
+            } else {
+                write!(r, "    {}[ix + {}] = ", bufname, i).unwrap();
+            }
             for (j, piece) in pieces.iter().enumerate() {
                 if j != 0 {
                     write!(r, " | ").unwrap();
                 }
                 write!(r, "{}", piece).unwrap();
             }
+            if is_mem {
+                write!(r, ")").unwrap();
+            }
             writeln!(r, ";").unwrap();
         }
     }
@@ -429,38 +467,70 @@ fn gen_enum_write(
     r: &mut String,
     bufname: &str,
     name: &str,
+    is_mem: bool,
     variants: &[(String, Vec<(usize, LayoutType)>)],
 ) {
     for (var_name, payload) in variants {
         if payload.is_empty() {
-            writeln!(r, "void {}_{}_write({}Ref ref) {{", name, var_name, name).unwrap();
-            writeln!(
-                r,
-                "    {}[ref.offset >> 2] = {}_{};",
-                bufname, name, var_name
-            )
-            .unwrap();
-            writeln!(r, "}}\n").unwrap();
-        } else if payload.len() == 1 {
-            if let GpuType::InlineStruct(structname) = &payload[0].1.ty {
+            if is_mem {
+                writeln!(r, "void {}_{}_write(Alloc a, {}Ref ref) {{", name, var_name, name).unwrap();
                 writeln!(
                     r,
-                    "void {}_{}_write({}Ref ref, {} s) {{",
-                    name, var_name, name, structname
+                    "    write_mem(a, ref.offset >> 2, {}_{});",
+                    name, var_name
                 )
                 .unwrap();
+            } else {
+                writeln!(r, "void {}_{}_write({}Ref ref) {{", name, var_name, name).unwrap();
                 writeln!(
                     r,
                     "    {}[ref.offset >> 2] = {}_{};",
                     bufname, name, var_name
                 )
                 .unwrap();
-                writeln!(
-                    r,
-                    "    {}_write({}Ref(ref.offset + {}), s);",
-                    structname, structname, payload[0].0
-                )
-                .unwrap();
+            }
+            writeln!(r, "}}\n").unwrap();
+        } else if payload.len() == 1 {
+            if let GpuType::InlineStruct(structname) = &payload[0].1.ty {
+                if is_mem {
+                    writeln!(
+                        r,
+                        "void {}_{}_write(Alloc a, {}Ref ref, {} s) {{",
+                        name, var_name, name, structname
+                    )
+                    .unwrap();
+                    writeln!(
+                        r,
+                        "    write_mem(a, ref.offset >> 2, {}_{});",
+                        name, var_name
+                    )
+                    .unwrap();
+                    writeln!(
+                        r,
+                        "    {}_write(a, {}Ref(ref.offset + {}), s);",
+                        structname, structname, payload[0].0
+                    )
+                    .unwrap();
+                } else {
+                    writeln!(
+                        r,
+                        "void {}_{}_write({}Ref ref, {} s) {{",
+                        name, var_name, name, structname
+                    )
+                    .unwrap();
+                    writeln!(
+                        r,
+                        "    {}[ref.offset >> 2] = {}_{};",
+                        bufname, name, var_name
+                    )
+                    .unwrap();
+                    writeln!(
+                        r,
+                        "    {}_write({}Ref(ref.offset + {}), s);",
+                        structname, structname, payload[0].0
+                    )
+                    .unwrap();
+                }
                 writeln!(r, "}}\n").unwrap();
             }
         }
diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h
index 8a757ef..291496f 100644
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@@ -62,36 +62,36 @@ AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
     return AnnotatedRef(ref.offset + index * Annotated_size);
 }
 
-AnnoFill AnnoFill_read(AnnoFillRef ref) {
+AnnoFill AnnoFill_read(Alloc a, AnnoFillRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
-    uint raw4 = memory[ix + 4];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
     AnnoFill s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.rgba_color = raw4;
     return s;
 }
 
-void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
+void AnnoFill_write(Alloc a, AnnoFillRef ref, AnnoFill s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.bbox.x);
-    memory[ix + 1] = floatBitsToUint(s.bbox.y);
-    memory[ix + 2] = floatBitsToUint(s.bbox.z);
-    memory[ix + 3] = floatBitsToUint(s.bbox.w);
-    memory[ix + 4] = s.rgba_color;
+    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
+    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
+    write_mem(a, ix + 4, s.rgba_color);
 }
 
-AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
+AnnoStroke AnnoStroke_read(Alloc a, AnnoStrokeRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
-    uint raw4 = memory[ix + 4];
-    uint raw5 = memory[ix + 5];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
     AnnoStroke s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.rgba_color = raw4;
@@ -99,76 +99,76 @@ AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
     return s;
 }
 
-void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
+void AnnoStroke_write(Alloc a, AnnoStrokeRef ref, AnnoStroke s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.bbox.x);
-    memory[ix + 1] = floatBitsToUint(s.bbox.y);
-    memory[ix + 2] = floatBitsToUint(s.bbox.z);
-    memory[ix + 3] = floatBitsToUint(s.bbox.w);
-    memory[ix + 4] = s.rgba_color;
-    memory[ix + 5] = floatBitsToUint(s.linewidth);
+    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
+    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
+    write_mem(a, ix + 4, s.rgba_color);
+    write_mem(a, ix + 5, floatBitsToUint(s.linewidth));
 }
 
-AnnoClip AnnoClip_read(AnnoClipRef ref) {
+AnnoClip AnnoClip_read(Alloc a, AnnoClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
     AnnoClip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     return s;
 }
 
-void AnnoClip_write(AnnoClipRef ref, AnnoClip s) {
+void AnnoClip_write(Alloc a, AnnoClipRef ref, AnnoClip s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.bbox.x);
-    memory[ix + 1] = floatBitsToUint(s.bbox.y);
-    memory[ix + 2] = floatBitsToUint(s.bbox.z);
-    memory[ix + 3] = floatBitsToUint(s.bbox.w);
+    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
+    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
 }
 
-uint Annotated_tag(AnnotatedRef ref) {
-    return memory[ref.offset >> 2];
+uint Annotated_tag(Alloc a, AnnotatedRef ref) {
+    return read_mem(a, ref.offset >> 2);
 }
 
-AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) {
-    return AnnoStroke_read(AnnoStrokeRef(ref.offset + 4));
+AnnoStroke Annotated_Stroke_read(Alloc a, AnnotatedRef ref) {
+    return AnnoStroke_read(a, AnnoStrokeRef(ref.offset + 4));
 }
 
-AnnoFill Annotated_Fill_read(AnnotatedRef ref) {
-    return AnnoFill_read(AnnoFillRef(ref.offset + 4));
+AnnoFill Annotated_Fill_read(Alloc a, AnnotatedRef ref) {
+    return AnnoFill_read(a, AnnoFillRef(ref.offset + 4));
 }
 
-AnnoClip Annotated_BeginClip_read(AnnotatedRef ref) {
-    return AnnoClip_read(AnnoClipRef(ref.offset + 4));
+AnnoClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) {
+    return AnnoClip_read(a, AnnoClipRef(ref.offset + 4));
 }
 
-AnnoClip Annotated_EndClip_read(AnnotatedRef ref) {
-    return AnnoClip_read(AnnoClipRef(ref.offset + 4));
+AnnoClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) {
+    return AnnoClip_read(a, AnnoClipRef(ref.offset + 4));
 }
 
-void Annotated_Nop_write(AnnotatedRef ref) {
-    memory[ref.offset >> 2] = Annotated_Nop;
+void Annotated_Nop_write(Alloc a, AnnotatedRef ref) {
+    write_mem(a, ref.offset >> 2, Annotated_Nop);
 }
 
-void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) {
-    memory[ref.offset >> 2] = Annotated_Stroke;
-    AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s);
+void Annotated_Stroke_write(Alloc a, AnnotatedRef ref, AnnoStroke s) {
+    write_mem(a, ref.offset >> 2, Annotated_Stroke);
+    AnnoStroke_write(a, AnnoStrokeRef(ref.offset + 4), s);
 }
 
-void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) {
-    memory[ref.offset >> 2] = Annotated_Fill;
-    AnnoFill_write(AnnoFillRef(ref.offset + 4), s);
+void Annotated_Fill_write(Alloc a, AnnotatedRef ref, AnnoFill s) {
+    write_mem(a, ref.offset >> 2, Annotated_Fill);
+    AnnoFill_write(a, AnnoFillRef(ref.offset + 4), s);
 }
 
-void Annotated_BeginClip_write(AnnotatedRef ref, AnnoClip s) {
-    memory[ref.offset >> 2] = Annotated_BeginClip;
-    AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
+void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) {
+    write_mem(a, ref.offset >> 2, Annotated_BeginClip);
+    AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s);
 }
 
-void Annotated_EndClip_write(AnnotatedRef ref, AnnoClip s) {
-    memory[ref.offset >> 2] = Annotated_EndClip;
-    AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
+void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) {
+    write_mem(a, ref.offset >> 2, Annotated_EndClip);
+    AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 6828ac1..85e54e8 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -15,8 +15,8 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
-#include "setup.h"
 #include "mem.h"
+#include "setup.h"
 
 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
@@ -31,27 +31,27 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
 #include "tile.h"
 
 shared uint sh_row_count[BACKDROP_WG];
-shared uint sh_row_base[BACKDROP_WG];
+shared Alloc sh_row_alloc[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];
 
 void main() {
-    if (mem_overflow) {
+    if (mem_error != NO_ERROR) {
         return;
     }
 
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
 
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
     if (element_ix < conf.n_elements) {
-        uint tag = Annotated_tag(ref);
+        uint tag = Annotated_tag(conf.anno_alloc, ref);
         switch (tag) {
         case Annotated_Fill:
         case Annotated_BeginClip:
-            PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size);
-            Path path = Path_read(path_ref);
+            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+            Path path = Path_read(conf.tile_alloc, path_ref);
             sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
             row_count = path.bbox.w - path.bbox.y;
             // Paths that don't cross tile top edges don't have backdrops.
@@ -62,7 +62,8 @@ void main() {
                 // long as it doesn't cross the left edge.
                 row_count = 0;
             }
-            sh_row_base[th_ix] = (path.tiles.offset >> 2) + 1;
+            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
+            sh_row_alloc[th_ix] = path_alloc;
         }
     }
 
@@ -92,13 +93,14 @@ void main() {
         if (width > 0) {
             // Process one row sequentially
             // Read backdrop value per tile and prefix sum it
+            Alloc tiles_alloc = sh_row_alloc[el_ix];
             uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0);
-            uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width;
-            uint sum = memory[tile_el_ix];
+            uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width;
+            uint sum = read_mem(tiles_alloc, tile_el_ix);
             for (uint x = 1; x < width; x++) {
                 tile_el_ix += 2;
-                sum += memory[tile_el_ix];
-                memory[tile_el_ix] = sum;
+                sum += read_mem(tiles_alloc, tile_el_ix);
+                write_mem(tiles_alloc, tile_el_ix, sum);
             }
         }
     }
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index dc1cbc4..81d26c0 100644
Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 6812fb6..8ad72c7 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -9,8 +9,8 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
-#include "setup.h"
 #include "mem.h"
+#include "setup.h"
 
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 
@@ -32,11 +32,11 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
 // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
-shared uint sh_chunk_start[N_TILE];
+shared Alloc sh_chunk_alloc[N_TILE];
 shared bool sh_alloc_failed;
 
 void main() {
-    if (mem_overflow) {
+    if (mem_error != NO_ERROR) {
         return;
     }
 
@@ -53,10 +53,10 @@ void main() {
 
     // Read inputs and determine coverage of bins
     uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
     uint tag = Annotated_Nop;
     if (element_ix < my_n_elements) {
-        tag = Annotated_tag(ref);
+        tag = Annotated_tag(conf.anno_alloc, ref);
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
     switch (tag) {
@@ -66,7 +66,7 @@ void main() {
     case Annotated_EndClip:
         // Note: we take advantage of the fact that these drawing elements
         // have the bbox at the same place in their layout.
-        AnnoFill fill = Annotated_Fill_read(ref);
+        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
         x0 = int(floor(fill.bbox.x * SX));
         y0 = int(floor(fill.bbox.y * SY));
         x1 = int(ceil(fill.bbox.z * SX));
@@ -105,20 +105,21 @@ void main() {
         count[i][gl_LocalInvocationID.x] = element_count;
     }
     // element_count is number of elements covering bin for this invocation.
-    Alloc chunk_alloc = Alloc(0, false);
+    Alloc chunk_alloc = new_alloc(0, 0);
     if (element_count != 0) {
         // TODO: aggregate atomic adds (subgroup is probably fastest)
-        chunk_alloc = malloc(element_count * BinInstance_size);
-        sh_chunk_start[gl_LocalInvocationID.x] = chunk_alloc.offset;
-        if (chunk_alloc.failed) {
+        MallocResult chunk = malloc(element_count * BinInstance_size);
+        chunk_alloc = chunk.alloc;
+        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
+        if (chunk.failed) {
             sh_alloc_failed = true;
         }
     }
     // Note: it might be more efficient for reading to do this in the
     // other order (each bin is a contiguous sequence of partitions)
-    uint out_ix = (conf.bin_base >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
-    memory[out_ix] = element_count;
-    memory[out_ix + 1] = chunk_alloc.offset;
+    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
+    write_mem(conf.bin_alloc, out_ix, element_count);
+    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
 
     barrier();
     if (sh_alloc_failed) {
@@ -137,8 +138,9 @@ void main() {
             if (my_slice > 0) {
                 idx += count[my_slice - 1][bin_ix];
             }
-            uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
-            BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix));
+            Alloc out_alloc = sh_chunk_alloc[bin_ix];
+            uint out_offset = out_alloc.offset + idx * BinInstance_size;
+            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
         }
         x++;
         if (x == x1) {
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 7974dac..a9a05f5 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h
index 4364278..853adab 100644
--- a/piet-gpu/shader/bins.h
+++ b/piet-gpu/shader/bins.h
@@ -16,16 +16,16 @@ BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
     return BinInstanceRef(ref.offset + index * BinInstance_size);
 }
 
-BinInstance BinInstance_read(BinInstanceRef ref) {
+BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
+    uint raw0 = read_mem(a, ix + 0);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
-void BinInstance_write(BinInstanceRef ref, BinInstance s) {
+void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.element_ix;
+    write_mem(a, ix + 0, s.element_ix);
 }
 
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 112a57d..3f4e460 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -13,8 +13,8 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
-#include "setup.h"
 #include "mem.h"
+#include "setup.h"
 
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 
@@ -34,7 +34,7 @@ shared uint sh_elements[N_TILE];
 
 // Number of elements in the partition; prefix sum.
 shared uint sh_part_count[N_PART_READ];
-shared uint sh_part_elements[N_PART_READ];
+shared Alloc sh_part_elements[N_PART_READ];
 
 shared uint sh_bitmaps[N_SLICE][N_TILE];
 
@@ -48,24 +48,47 @@ shared uint sh_tile_y0[N_TILE];
 shared uint sh_tile_base[N_TILE];
 shared uint sh_tile_stride[N_TILE];
 
-// Perhaps cmd_limit should be a global? This is a style question.
-bool alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
+#ifdef MEM_DEBUG
+// Store allocs only when MEM_DEBUG to save shared memory traffic.
+shared Alloc sh_tile_alloc[N_TILE];
+
+void write_tile_alloc(uint el_ix, Alloc a) {
+    sh_tile_alloc[el_ix] = a;
+}
+
+Alloc read_tile_alloc(uint el_ix) {
+    return sh_tile_alloc[el_ix];
+}
+#else
+void write_tile_alloc(uint el_ix, Alloc a) {
+    // No-op
+}
+
+Alloc read_tile_alloc(uint el_ix) {
+    // All memory.
+    return new_alloc(0, memory.length()*4);
+}
+#endif
+
+// Perhaps cmd_alloc should be a global? This is a style question.
+bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
     if (cmd_ref.offset < cmd_limit) {
         return true;
     }
-    Alloc new_cmd = malloc(PTCL_INITIAL_ALLOC);
+    MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
     if (new_cmd.failed) {
         return false;
     }
-    CmdJump jump = CmdJump(new_cmd.offset);
-    Cmd_Jump_write(cmd_ref, jump);
-    cmd_ref = CmdRef(new_cmd.offset);
-    cmd_limit = new_cmd.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+    CmdJump jump = CmdJump(new_cmd.alloc.offset);
+    Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
+    cmd_alloc = new_cmd.alloc;
+    cmd_ref = CmdRef(cmd_alloc.offset);
+    cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
     return true;
 }
 
 void main() {
-    if (mem_overflow) {
+    if (mem_error != NO_ERROR) {
         return;
     }
 
@@ -85,7 +108,8 @@ void main() {
     uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
     uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
     uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
-    CmdRef cmd_ref = CmdRef(conf.ptcl_base + this_tile_ix * PTCL_INITIAL_ALLOC);
+    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
     uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
     // The nesting depth of the clip stack
     uint clip_depth = 0;
@@ -117,9 +141,10 @@ void main() {
                 part_start_ix = ready_ix;
                 uint count = 0;
                 if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
-                    uint in_ix = (conf.bin_base >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
-                    count = memory[in_ix];
-                    sh_part_elements[th_ix] = memory[in_ix + 1];
+                    uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
+                    count = read_mem(conf.bin_alloc, in_ix);
+                    uint offset = read_mem(conf.bin_alloc, in_ix + 1);
+                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size);
                 }
                 // prefix sum of counts
                 for (uint i = 0; i < LG_N_PART_READ; i++) {
@@ -152,8 +177,9 @@ void main() {
                     }
                 }
                 ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
-                BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
-                BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
+                Alloc bin_alloc = sh_part_elements[part_ix];
+                BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
+                BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
                 sh_elements[th_ix] = inst.element_ix;
             }
             barrier();
@@ -169,8 +195,8 @@ void main() {
         AnnotatedRef ref;
         if (th_ix + rd_ix < wr_ix) {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
-            tag = Annotated_tag(ref);
+            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+            tag = Annotated_tag(conf.anno_alloc, ref);
         }
 
         // Bounding box of element in pixel coordinates.
@@ -183,7 +209,7 @@ void main() {
             // We have one "path" for each element, even if the element isn't
             // actually a path (currently EndClip, but images etc in the future).
             uint path_ix = element_ix;
-            Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size));
+            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
             uint stride = path.bbox.z - path.bbox.x;
             sh_tile_stride[th_ix] = stride;
             int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -199,6 +225,8 @@ void main() {
             // base relative to bin
             uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
             sh_tile_base[th_ix] = base;
+            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
+            write_tile_alloc(th_ix, path_alloc);
             break;
         default:
             tile_count = 0;
@@ -226,8 +254,8 @@ void main() {
                     el_ix = probe;
                 }
             }
-            AnnotatedRef ref = AnnotatedRef(conf.anno_base + sh_elements[el_ix] * Annotated_size);
-            uint tag = Annotated_tag(ref);
+            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
+            uint tag = Annotated_tag(conf.anno_alloc, ref);
             uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + seq_ix % width;
@@ -236,7 +264,7 @@ void main() {
             if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
                 include_tile = true;
             } else {
-                Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+                Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
                 // Include the path in the tile if
                 // - the tile contains at least a segment (tile offset non-zero)
                 // - the tile is completely covered (backdrop non-zero)
@@ -275,16 +303,16 @@ void main() {
             // At this point, we read the element again from global memory.
             // If that turns out to be expensive, maybe we can pack it into
             // shared memory (or perhaps just the tag).
-            ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
-            tag = Annotated_tag(ref);
+            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+            tag = Annotated_tag(conf.anno_alloc, ref);
 
             if (clip_zero_depth == 0) {
                 switch (tag) {
                 case Annotated_Fill:
-                    Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoFill fill = Annotated_Fill_read(ref);
-                    if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                    AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
                     if (tile.tile.offset != 0) {
@@ -292,32 +320,32 @@ void main() {
                         cmd_fill.tile_ref = tile.tile.offset;
                         cmd_fill.backdrop = tile.backdrop;
                         cmd_fill.rgba_color = fill.rgba_color;
-                        Cmd_Fill_write(cmd_ref, cmd_fill);
+                        Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
                     } else {
-                        Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
+                        Cmd_Solid_write(cmd_alloc, cmd_ref, CmdSolid(fill.rgba_color));
                     }
                     cmd_ref.offset += Cmd_size;
                     break;
                 case Annotated_BeginClip:
-                    tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;
                     } else if (tile.tile.offset == 0 && clip_depth < 32) {
                         clip_one_mask |= (1 << clip_depth);
                     } else {
-                        if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                             break;
                         }
                         if (tile.tile.offset != 0) {
                             CmdBeginClip cmd_begin_clip;
                             cmd_begin_clip.tile_ref = tile.tile.offset;
                             cmd_begin_clip.backdrop = tile.backdrop;
-                            Cmd_BeginClip_write(cmd_ref, cmd_begin_clip);
+                            Cmd_BeginClip_write(cmd_alloc, cmd_ref, cmd_begin_clip);
                         } else {
                             // TODO: here is where a bunch of optimization magic should happen
                             float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
-                            Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha));
+                            Cmd_BeginSolidClip_write(cmd_alloc, cmd_ref, CmdBeginSolidClip(alpha));
                         }
                         cmd_ref.offset += Cmd_size;
                         if (clip_depth < 32) {
@@ -329,25 +357,25 @@ void main() {
                 case Annotated_EndClip:
                     clip_depth--;
                     if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
-                        if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                             break;
                         }
-                        Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0));
+                        Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(1.0));
                         cmd_ref.offset += Cmd_size;
                     }
                     break;
                 case Annotated_Stroke:
-                    tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
+                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoStroke stroke = Annotated_Stroke_read(ref);
+                    AnnoStroke stroke = Annotated_Stroke_read(conf.anno_alloc, ref);
                     CmdStroke cmd_stroke;
                     cmd_stroke.tile_ref = tile.tile.offset;
                     cmd_stroke.half_width = 0.5 * stroke.linewidth;
                     cmd_stroke.rgba_color = stroke.rgba_color;
-                    if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                         break;
                     }
-                    Cmd_Stroke_write(cmd_ref, cmd_stroke);
+                    Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
                     cmd_ref.offset += Cmd_size;
                     break;
                 }
@@ -372,6 +400,6 @@ void main() {
         if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
-        Cmd_End_write(cmd_ref);
+        Cmd_End_write(cmd_alloc, cmd_ref);
     }
 }
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index 8991094..505c4f4 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index a0e5011..255dd13 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -9,8 +9,8 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
-#include "setup.h"
 #include "mem.h"
+#include "setup.h"
 
 #define N_ROWS 4
 #define WG_SIZE 32
@@ -172,7 +172,7 @@ shared uint sh_part_ix;
 shared State sh_prefix;
 
 void main() {
-    if (mem_overflow) {
+    if (mem_error != NO_ERROR) {
         return;
     }
 
@@ -342,10 +342,10 @@ void main() {
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
-            PathSegRef path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
+            PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
             uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            memory[path_out_ref.offset >> 2] = out_tag;
-            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
+            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
         case Element_FillQuad:
         case Element_StrokeQuad:
@@ -366,10 +366,10 @@ void main() {
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
-            path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
+            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
             out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            memory[path_out_ref.offset >> 2] = out_tag;
-            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
+            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
         case Element_FillCubic:
         case Element_StrokeCubic:
@@ -387,10 +387,10 @@ void main() {
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
-            path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
+            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
             out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            memory[path_out_ref.offset >> 2] = out_tag;
-            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
+            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
         case Element_Stroke:
             Stroke stroke = Element_Stroke_read(this_ref);
@@ -399,31 +399,31 @@ void main() {
             vec2 lw = get_linewidth(st);
             anno_stroke.bbox = st.bbox + vec4(-lw, lw);
             anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            AnnotatedRef out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
-            Annotated_Stroke_write(out_ref, anno_stroke);
+            AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_Stroke_write(conf.anno_alloc, out_ref, anno_stroke);
             break;
         case Element_Fill:
             Fill fill = Element_Fill_read(this_ref);
             AnnoFill anno_fill;
             anno_fill.rgba_color = fill.rgba_color;
             anno_fill.bbox = st.bbox;
-            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
-            Annotated_Fill_write(out_ref, anno_fill);
+            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_Fill_write(conf.anno_alloc, out_ref, anno_fill);
             break;
         case Element_BeginClip:
             Clip begin_clip = Element_BeginClip_read(this_ref);
             AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
             // This is the absolute bbox, it's been transformed during encoding.
             anno_begin_clip.bbox = begin_clip.bbox;
-            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
-            Annotated_BeginClip_write(out_ref, anno_begin_clip);
+            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_BeginClip_write(conf.anno_alloc, out_ref, anno_begin_clip);
             break;
         case Element_EndClip:
             Clip end_clip = Element_EndClip_read(this_ref);
             // This bbox is expected to be the same as the begin one.
             AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
-            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
-            Annotated_EndClip_write(out_ref, anno_end_clip);
+            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
             break;
         }
     }
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 7475f04..287aa4e 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index fe55ff9..395ac80 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -10,8 +10,8 @@
 #extension GL_GOOGLE_include_directive : enable
 #extension GL_EXT_nonuniform_qualifier : enable
 
-#include "setup.h"
 #include "mem.h"
+#include "setup.h"
 
 #define CHUNK 8
 #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK)
@@ -37,16 +37,16 @@ layout(set = 0, binding = 3) uniform sampler2D textures[];
 #define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
 #define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
 
-shared Alloc sh_clip_alloc;
+shared MallocResult sh_clip_alloc;
 
 // Allocate a scratch buffer for clipping.
-Alloc alloc_clip_buf(uint link) {
+MallocResult alloc_clip_buf(uint link) {
     if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
-        Alloc alloc = malloc(CLIP_BUF_SIZE * 4);
-        if (!alloc.failed) {
-            memory[(alloc.offset >> 2) + CLIP_LINK_OFFSET] = link;
+        MallocResult m = malloc(CLIP_BUF_SIZE * 4);
+        if (!m.failed) {
+            write_mem(m.alloc, (m.alloc.offset >> 2) + CLIP_LINK_OFFSET, link);
         }
-        sh_clip_alloc = alloc;
+        sh_clip_alloc = m;
     }
     barrier();
     return sh_clip_alloc;
@@ -59,7 +59,7 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
     for (uint k = 0; k < CHUNK; k++) area[k] = float(backdrop);
     TileSegRef tile_seg_ref = TileSegRef(tile_ref);
     do {
-        TileSeg seg = TileSeg_read(tile_seg_ref);
+        TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
         for (uint k = 0; k < CHUNK; k++) {
             vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
             vec2 start = seg.origin - my_xy;
@@ -87,12 +87,13 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
 }
 
 void main() {
-    if (mem_overflow) {
+    if (mem_error != NO_ERROR) {
         return;
     }
 
     uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
-    CmdRef cmd_ref = CmdRef(conf.ptcl_base + tile_ix * PTCL_INITIAL_ALLOC);
+    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
 
     uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
@@ -101,7 +102,7 @@ void main() {
     uint blend_stack[BLEND_STACK_SIZE][CHUNK];
     uint blend_spill = 0;
     uint blend_sp = 0;
-    uint clip_tos = 0;
+    Alloc clip_tos = new_alloc(0, 0);
     for (uint i = 0; i < CHUNK; i++) {
         rgb[i] = vec3(0.5);
         if (xy_uint.x < 1024 && xy_uint.y < 1024) {
@@ -111,13 +112,13 @@ void main() {
     }
 
     while (true) {
-        uint tag = Cmd_tag(cmd_ref);
+        uint tag = Cmd_tag(cmd_alloc, cmd_ref);
         if (tag == Cmd_End) {
             break;
         }
         switch (tag) {
         case Cmd_Circle:
-            CmdCircle circle = Cmd_Circle_read(cmd_ref);
+            CmdCircle circle = Cmd_Circle_read(cmd_alloc, cmd_ref);
             vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
             for (uint i = 0; i < CHUNK; i++) {
                 float dy = float(i * CHUNK_DY);
@@ -129,12 +130,12 @@ void main() {
             break;
         case Cmd_Stroke:
             // Calculate distance field from all the line segments in this tile.
-            CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
+            CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
             float df[CHUNK];
             for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
             TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
             do {
-                TileSeg seg = TileSeg_read(tile_seg_ref);
+                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
                 vec2 line_vec = seg.vector;
                 for (uint k = 0; k < CHUNK; k++) {
                     vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
@@ -151,7 +152,7 @@ void main() {
             }
             break;
         case Cmd_Fill:
-            CmdFill fill = Cmd_Fill_read(cmd_ref);
+            CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
             float area[CHUNK];
             area = computeArea(xy, fill.backdrop, fill.tile_ref);
             fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
@@ -164,25 +165,25 @@ void main() {
             uint blend_slot = blend_sp % BLEND_STACK_SIZE;
             if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
                 // spill to scratch buffer
-                Alloc alloc = alloc_clip_buf(clip_tos);
-                if (alloc.failed) {
+                MallocResult m = alloc_clip_buf(clip_tos.offset);
+                if (m.failed) {
                     return;
                 }
-                clip_tos = alloc.offset;
-                uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                clip_tos = m.alloc;
+                uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                 for (uint k = 0; k < CHUNK; k++) {
-                    memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
+                    write_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY, blend_stack[blend_slot][k]);
                 }
                 blend_spill++;
             }
             if (tag == Cmd_BeginClip) {
-                CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
+                CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_alloc, cmd_ref);
                 area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
                 for (uint k = 0; k < CHUNK; k++) {
                     blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
                 }
             } else {
-                CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
+                CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_alloc, cmd_ref);
                 float solid_alpha = begin_solid_clip.alpha;
                 for (uint k = 0; k < CHUNK; k++) {
                     blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
@@ -191,14 +192,14 @@ void main() {
             blend_sp++;
             break;
         case Cmd_EndClip:
-            CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
+            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
             blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
             if (blend_sp == blend_spill) {
-                uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                 for (uint k = 0; k < CHUNK; k++) {
-                    blend_stack[blend_slot][k] = memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
+                    blend_stack[blend_slot][k] = read_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY);
                 }
-                clip_tos = memory[(clip_tos >> 2) + CLIP_LINK_OFFSET];
+                clip_tos.offset = read_mem(clip_tos, (clip_tos.offset >> 2) + CLIP_LINK_OFFSET);
                 blend_spill--;
             }
             blend_sp--;
@@ -208,20 +209,21 @@ void main() {
             }
             break;
         case Cmd_Solid:
-            CmdSolid solid = Cmd_Solid_read(cmd_ref);
+            CmdSolid solid = Cmd_Solid_read(cmd_alloc, cmd_ref);
             fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
             for (uint k = 0; k < CHUNK; k++) {
                 rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * fg_rgba.a);
             }
             break;
         case Cmd_SolidMask:
-            CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_ref);
+            CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_alloc, cmd_ref);
             for (uint k = 0; k < CHUNK; k++) {
                 mask[k] = solid_mask.mask;
             }
             break;
         case Cmd_Jump:
-            cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
+            cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
+            cmd_alloc.offset = cmd_ref.offset;
             continue;
         }
         cmd_ref.offset += Cmd_size;
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index b384340..3ccf21d 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/shader/mem.h b/piet-gpu/shader/mem.h
index 9373cbf..7b2a02a 100644
--- a/piet-gpu/shader/mem.h
+++ b/piet-gpu/shader/mem.h
@@ -3,27 +3,118 @@
 layout(set = 0, binding = 0) buffer Memory {
     // offset into memory of the next allocation, initialized by the user.
     uint mem_offset;
-    bool mem_overflow;
+    // mem_error tracks the status of memory accesses, initialized to NO_ERROR
+    // by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
+    // If MEM_DEBUG is defined the following errors are reported:
+    // - ERR_OUT_OF_BOUNDS is reported for out of bounds memory accesses (reads and writes).
+    // - ERR_UNALIGNED_ACCESS is reported for sizes or offsets not aligned to 32-bit words.
+    uint mem_error;
     uint[] memory;
 };
 
+// Uncomment this line to add the size field to Alloc and enable memory checks.
+// Note that the Config struct in setup.h grows size fields as well.
+//#define MEM_DEBUG
+
+#define NO_ERROR 0
+#define ERR_MALLOC_FAILED 1
+#define ERR_OUT_OF_BOUNDS 2
+#define ERR_UNALIGNED_ACCESS 3
+
+#define Alloc_size 8
+
 // Alloc represents a memory allocation.
 struct Alloc {
     // offset in bytes into memory.
     uint offset;
+#ifdef MEM_DEBUG
+    // size in bytes of the allocation.
+    uint size;
+#endif
+};
+
+struct MallocResult {
+    Alloc alloc;
     // failed is true if the allocation overflowed memory.
     bool failed;
 };
 
-// malloc allocates size bytes of memory.
-Alloc malloc(uint size) {
+// new_alloc builds an Alloc from a caller-derived offset and size, both in bytes; size is only stored if MEM_DEBUG is defined.
+Alloc new_alloc(uint offset, uint size) {
     Alloc a;
-	// Round up to nearest 32-bit word.
-	size = (size + 3) & ~3;
-    a.offset = atomicAdd(mem_offset, size);
-    a.failed = a.offset + size > memory.length() * 4;
-    if (a.failed) {
-        mem_overflow = true;
-    }
+    a.offset = offset;
+#ifdef MEM_DEBUG
+    a.size = size;
+#endif
     return a;
 }
+
+// malloc allocates size bytes of memory. size must be a multiple of 4; this is only checked if MEM_DEBUG is defined.
+MallocResult malloc(uint size) {
+    MallocResult r;
+    r.failed = false;
+    uint offset = atomicAdd(mem_offset, size);
+    r.alloc = new_alloc(offset, size);
+    if (offset + size > memory.length() * 4) {
+        r.failed = true;
+        atomicMax(mem_error, ERR_MALLOC_FAILED);
+        return r;
+    }
+#ifdef MEM_DEBUG
+    if ((size & 3) != 0) {
+        r.failed = true;
+        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
+        return r;
+    }
+#endif
+    return r;
+}
+
+// touch_mem checks whether access to the memory word at offset is valid for alloc.
+// If MEM_DEBUG is defined and offset is outside alloc, it records ERR_OUT_OF_BOUNDS
+// and returns false; otherwise it returns true. Offset is in words.
+bool touch_mem(Alloc alloc, uint offset) {
+#ifdef MEM_DEBUG
+    if (offset < alloc.offset/4 || offset >= (alloc.offset + alloc.size)/4) {
+        atomicMax(mem_error, ERR_OUT_OF_BOUNDS);
+        return false;
+    }
+#endif
+    return true;
+}
+
+// write_mem writes val to memory at offset.
+// Offset is in words.
+void write_mem(Alloc alloc, uint offset, uint val) {
+    if (!touch_mem(alloc, offset)) {
+        return;
+    }
+    memory[offset] = val;
+}
+
+// read_mem reads the value from memory at offset.
+// Offset is in words.
+uint read_mem(Alloc alloc, uint offset) {
+    if (!touch_mem(alloc, offset)) {
+        return 0;
+    }
+    uint v = memory[offset];
+    return v;
+}
+
+// slice_mem returns a sub-allocation of a. offset is in bytes, relative to
+// a.offset; size is the length of the slice in bytes.
+Alloc slice_mem(Alloc a, uint offset, uint size) {
+#ifdef MEM_DEBUG
+    if ((offset & 3) != 0 || (size & 3) != 0) {
+        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
+        return Alloc(0, 0);
+    }
+    if (offset + size > a.size) {
+        // slice_mem is sometimes used to create slices that are out of bounds
+        // but never written to, so return an empty Alloc without reporting an error here.
+        return Alloc(0, 0);
+    }
+#endif
+    return new_alloc(a.offset + offset, size);
+}
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index 20c3586..4f77ff9 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -7,8 +7,8 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
-#include "setup.h"
 #include "mem.h"
+#include "setup.h"
 
 #define LG_COARSE_WG 5
 #define COARSE_WG (1 << LG_COARSE_WG)
@@ -87,21 +87,21 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
 }
 
 void main() {
-    if (mem_overflow) {
+    if (mem_error != NO_ERROR) {
         return;
     }
 
     uint element_ix = gl_GlobalInvocationID.x;
-    PathSegRef ref = PathSegRef(conf.pathseg_base + element_ix * PathSeg_size);
+    PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
 
     uint tag = PathSeg_Nop;
     if (element_ix < conf.n_pathseg) {
-        tag = PathSeg_tag(ref);
+        tag = PathSeg_tag(conf.pathseg_alloc, ref);
     }
     switch (tag) {
     case PathSeg_FillCubic:
     case PathSeg_StrokeCubic:
-        PathStrokeCubic cubic = PathSeg_StrokeCubic_read(ref);
+        PathStrokeCubic cubic = PathSeg_StrokeCubic_read(conf.pathseg_alloc, ref);
         vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
         float err = err_v.x * err_v.x + err_v.y * err_v.y;
         // The number of quadratics.
@@ -123,7 +123,8 @@ void main() {
         uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);
 
         uint path_ix = cubic.path_ix;
-        Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size));
+        Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
+        Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
         ivec4 bbox = ivec4(path.bbox);
         vec2 p0 = cubic.p0;
         qp0 = cubic.p0;
@@ -182,11 +183,11 @@ void main() {
                 // TODO: can be tighter, use c to bound width
                 uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                 // Consider using subgroups to aggregate atomic add.
-                Alloc tile_alloc = malloc(n_tile_alloc * TileSeg_size);
+                MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
                 if (tile_alloc.failed) {
                     return;
                 }
-                uint tile_offset = tile_alloc.offset;
+                uint tile_offset = tile_alloc.alloc.offset;
 
                 TileSeg tile_seg;
 
@@ -204,7 +205,9 @@ void main() {
                         int backdrop = p1.y < p0.y ? 1 : -1;
                         TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                         uint tile_el = tile_ref.offset >> 2;
-                        atomicAdd(memory[tile_el + 1], backdrop);
+                        if (touch_mem(path_alloc, tile_el + 1)) {
+                            atomicAdd(memory[tile_el + 1], backdrop);
+                        }
                     }
 
                     // next_xray is the xray for the next scanline; the line segment intersects
@@ -225,9 +228,12 @@ void main() {
 
                     for (int x = xx0; x < xx1; x++) {
                         float tile_x0 = float(x * TILE_WIDTH_PX);
-                        TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
+                        TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
                         uint tile_el = tile_ref.offset >> 2;
-                        uint old = atomicExchange(memory[tile_el], tile_offset);
+                        uint old = 0;
+                        if (touch_mem(path_alloc, tile_el)) {
+                            old = atomicExchange(memory[tile_el], tile_offset);
+                        }
                         tile_seg.origin = p0;
                         tile_seg.vector = p1 - p0;
                         float y_edge = 0.0;
@@ -254,7 +260,7 @@ void main() {
                         }
                         tile_seg.y_edge = y_edge;
                         tile_seg.next.offset = old;
-                        TileSeg_write(TileSegRef(tile_offset), tile_seg);
+                        TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
                         tile_offset += TileSeg_size;
                     }
                     xc += b;
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index 3e4392c..1854604 100644
Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ
diff --git a/piet-gpu/shader/pathseg.h b/piet-gpu/shader/pathseg.h
index ecba9c5..00509fb 100644
--- a/piet-gpu/shader/pathseg.h
+++ b/piet-gpu/shader/pathseg.h
@@ -87,13 +87,13 @@ PathSegRef PathSeg_index(PathSegRef ref, uint index) {
     return PathSegRef(ref.offset + index * PathSeg_size);
 }
 
-PathFillLine PathFillLine_read(PathFillLineRef ref) {
+PathFillLine PathFillLine_read(Alloc a, PathFillLineRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
-    uint raw4 = memory[ix + 4];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
     PathFillLine s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -101,24 +101,24 @@ PathFillLine PathFillLine_read(PathFillLineRef ref) {
     return s;
 }
 
-void PathFillLine_write(PathFillLineRef ref, PathFillLine s) {
+void PathFillLine_write(Alloc a, PathFillLineRef ref, PathFillLine s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.p0.x);
-    memory[ix + 1] = floatBitsToUint(s.p0.y);
-    memory[ix + 2] = floatBitsToUint(s.p1.x);
-    memory[ix + 3] = floatBitsToUint(s.p1.y);
-    memory[ix + 4] = s.path_ix;
+    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
+    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, ix + 4, s.path_ix);
 }
 
-PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
+PathStrokeLine PathStrokeLine_read(Alloc a, PathStrokeLineRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
-    uint raw4 = memory[ix + 4];
-    uint raw5 = memory[ix + 5];
-    uint raw6 = memory[ix + 6];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
+    uint raw6 = read_mem(a, ix + 6);
     PathStrokeLine s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -127,28 +127,28 @@ PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
     return s;
 }
 
-void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) {
+void PathStrokeLine_write(Alloc a, PathStrokeLineRef ref, PathStrokeLine s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.p0.x);
-    memory[ix + 1] = floatBitsToUint(s.p0.y);
-    memory[ix + 2] = floatBitsToUint(s.p1.x);
-    memory[ix + 3] = floatBitsToUint(s.p1.y);
-    memory[ix + 4] = s.path_ix;
-    memory[ix + 5] = floatBitsToUint(s.stroke.x);
-    memory[ix + 6] = floatBitsToUint(s.stroke.y);
+    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
+    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, ix + 4, s.path_ix);
+    write_mem(a, ix + 5, floatBitsToUint(s.stroke.x));
+    write_mem(a, ix + 6, floatBitsToUint(s.stroke.y));
 }
 
-PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {
+PathFillCubic PathFillCubic_read(Alloc a, PathFillCubicRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
-    uint raw4 = memory[ix + 4];
-    uint raw5 = memory[ix + 5];
-    uint raw6 = memory[ix + 6];
-    uint raw7 = memory[ix + 7];
-    uint raw8 = memory[ix + 8];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
+    uint raw6 = read_mem(a, ix + 6);
+    uint raw7 = read_mem(a, ix + 7);
+    uint raw8 = read_mem(a, ix + 8);
     PathFillCubic s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -158,32 +158,32 @@ PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {
     return s;
 }
 
-void PathFillCubic_write(PathFillCubicRef ref, PathFillCubic s) {
+void PathFillCubic_write(Alloc a, PathFillCubicRef ref, PathFillCubic s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.p0.x);
-    memory[ix + 1] = floatBitsToUint(s.p0.y);
-    memory[ix + 2] = floatBitsToUint(s.p1.x);
-    memory[ix + 3] = floatBitsToUint(s.p1.y);
-    memory[ix + 4] = floatBitsToUint(s.p2.x);
-    memory[ix + 5] = floatBitsToUint(s.p2.y);
-    memory[ix + 6] = floatBitsToUint(s.p3.x);
-    memory[ix + 7] = floatBitsToUint(s.p3.y);
-    memory[ix + 8] = s.path_ix;
+    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
+    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, ix + 4, floatBitsToUint(s.p2.x));
+    write_mem(a, ix + 5, floatBitsToUint(s.p2.y));
+    write_mem(a, ix + 6, floatBitsToUint(s.p3.x));
+    write_mem(a, ix + 7, floatBitsToUint(s.p3.y));
+    write_mem(a, ix + 8, s.path_ix);
 }
 
-PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {
+PathStrokeCubic PathStrokeCubic_read(Alloc a, PathStrokeCubicRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
-    uint raw4 = memory[ix + 4];
-    uint raw5 = memory[ix + 5];
-    uint raw6 = memory[ix + 6];
-    uint raw7 = memory[ix + 7];
-    uint raw8 = memory[ix + 8];
-    uint raw9 = memory[ix + 9];
-    uint raw10 = memory[ix + 10];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
+    uint raw6 = read_mem(a, ix + 6);
+    uint raw7 = read_mem(a, ix + 7);
+    uint raw8 = read_mem(a, ix + 8);
+    uint raw9 = read_mem(a, ix + 9);
+    uint raw10 = read_mem(a, ix + 10);
     PathStrokeCubic s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -194,62 +194,62 @@ PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {
     return s;
 }
 
-void PathStrokeCubic_write(PathStrokeCubicRef ref, PathStrokeCubic s) {
+void PathStrokeCubic_write(Alloc a, PathStrokeCubicRef ref, PathStrokeCubic s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.p0.x);
-    memory[ix + 1] = floatBitsToUint(s.p0.y);
-    memory[ix + 2] = floatBitsToUint(s.p1.x);
-    memory[ix + 3] = floatBitsToUint(s.p1.y);
-    memory[ix + 4] = floatBitsToUint(s.p2.x);
-    memory[ix + 5] = floatBitsToUint(s.p2.y);
-    memory[ix + 6] = floatBitsToUint(s.p3.x);
-    memory[ix + 7] = floatBitsToUint(s.p3.y);
-    memory[ix + 8] = s.path_ix;
-    memory[ix + 9] = floatBitsToUint(s.stroke.x);
-    memory[ix + 10] = floatBitsToUint(s.stroke.y);
+    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
+    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, ix + 4, floatBitsToUint(s.p2.x));
+    write_mem(a, ix + 5, floatBitsToUint(s.p2.y));
+    write_mem(a, ix + 6, floatBitsToUint(s.p3.x));
+    write_mem(a, ix + 7, floatBitsToUint(s.p3.y));
+    write_mem(a, ix + 8, s.path_ix);
+    write_mem(a, ix + 9, floatBitsToUint(s.stroke.x));
+    write_mem(a, ix + 10, floatBitsToUint(s.stroke.y));
 }
 
-uint PathSeg_tag(PathSegRef ref) {
-    return memory[ref.offset >> 2];
+uint PathSeg_tag(Alloc a, PathSegRef ref) {
+    return read_mem(a, ref.offset >> 2);
 }
 
-PathFillLine PathSeg_FillLine_read(PathSegRef ref) {
-    return PathFillLine_read(PathFillLineRef(ref.offset + 4));
+PathFillLine PathSeg_FillLine_read(Alloc a, PathSegRef ref) {
+    return PathFillLine_read(a, PathFillLineRef(ref.offset + 4));
 }
 
-PathStrokeLine PathSeg_StrokeLine_read(PathSegRef ref) {
-    return PathStrokeLine_read(PathStrokeLineRef(ref.offset + 4));
+PathStrokeLine PathSeg_StrokeLine_read(Alloc a, PathSegRef ref) {
+    return PathStrokeLine_read(a, PathStrokeLineRef(ref.offset + 4));
 }
 
-PathFillCubic PathSeg_FillCubic_read(PathSegRef ref) {
-    return PathFillCubic_read(PathFillCubicRef(ref.offset + 4));
+PathFillCubic PathSeg_FillCubic_read(Alloc a, PathSegRef ref) {
+    return PathFillCubic_read(a, PathFillCubicRef(ref.offset + 4));
 }
 
-PathStrokeCubic PathSeg_StrokeCubic_read(PathSegRef ref) {
-    return PathStrokeCubic_read(PathStrokeCubicRef(ref.offset + 4));
+PathStrokeCubic PathSeg_StrokeCubic_read(Alloc a, PathSegRef ref) {
+    return PathStrokeCubic_read(a, PathStrokeCubicRef(ref.offset + 4));
 }
 
-void PathSeg_Nop_write(PathSegRef ref) {
-    memory[ref.offset >> 2] = PathSeg_Nop;
+void PathSeg_Nop_write(Alloc a, PathSegRef ref) {
+    write_mem(a, ref.offset >> 2, PathSeg_Nop);
 }
 
-void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) {
-    memory[ref.offset >> 2] = PathSeg_FillLine;
-    PathFillLine_write(PathFillLineRef(ref.offset + 4), s);
+void PathSeg_FillLine_write(Alloc a, PathSegRef ref, PathFillLine s) {
+    write_mem(a, ref.offset >> 2, PathSeg_FillLine);
+    PathFillLine_write(a, PathFillLineRef(ref.offset + 4), s);
 }
 
-void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) {
-    memory[ref.offset >> 2] = PathSeg_StrokeLine;
-    PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s);
+void PathSeg_StrokeLine_write(Alloc a, PathSegRef ref, PathStrokeLine s) {
+    write_mem(a, ref.offset >> 2, PathSeg_StrokeLine);
+    PathStrokeLine_write(a, PathStrokeLineRef(ref.offset + 4), s);
 }
 
-void PathSeg_FillCubic_write(PathSegRef ref, PathFillCubic s) {
-    memory[ref.offset >> 2] = PathSeg_FillCubic;
-    PathFillCubic_write(PathFillCubicRef(ref.offset + 4), s);
+void PathSeg_FillCubic_write(Alloc a, PathSegRef ref, PathFillCubic s) {
+    write_mem(a, ref.offset >> 2, PathSeg_FillCubic);
+    PathFillCubic_write(a, PathFillCubicRef(ref.offset + 4), s);
 }
 
-void PathSeg_StrokeCubic_write(PathSegRef ref, PathStrokeCubic s) {
-    memory[ref.offset >> 2] = PathSeg_StrokeCubic;
-    PathStrokeCubic_write(PathStrokeCubicRef(ref.offset + 4), s);
+void PathSeg_StrokeCubic_write(Alloc a, PathSegRef ref, PathStrokeCubic s) {
+    write_mem(a, ref.offset >> 2, PathSeg_StrokeCubic);
+    PathStrokeCubic_write(a, PathStrokeCubicRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index eb21eac..4587f8f 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -171,12 +171,12 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
     return CmdRef(ref.offset + index * Cmd_size);
 }
 
-CmdCircle CmdCircle_read(CmdCircleRef ref) {
+CmdCircle CmdCircle_read(Alloc a, CmdCircleRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
     CmdCircle s;
     s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.radius = uintBitsToFloat(raw2);
@@ -184,39 +184,39 @@ CmdCircle CmdCircle_read(CmdCircleRef ref) {
     return s;
 }
 
-void CmdCircle_write(CmdCircleRef ref, CmdCircle s) {
+void CmdCircle_write(Alloc a, CmdCircleRef ref, CmdCircle s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.center.x);
-    memory[ix + 1] = floatBitsToUint(s.center.y);
-    memory[ix + 2] = floatBitsToUint(s.radius);
-    memory[ix + 3] = s.rgba_color;
+    write_mem(a, ix + 0, floatBitsToUint(s.center.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.center.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.radius));
+    write_mem(a, ix + 3, s.rgba_color);
 }
 
-CmdLine CmdLine_read(CmdLineRef ref) {
+CmdLine CmdLine_read(Alloc a, CmdLineRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
     CmdLine s;
     s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     return s;
 }
 
-void CmdLine_write(CmdLineRef ref, CmdLine s) {
+void CmdLine_write(Alloc a, CmdLineRef ref, CmdLine s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.start.x);
-    memory[ix + 1] = floatBitsToUint(s.start.y);
-    memory[ix + 2] = floatBitsToUint(s.end.x);
-    memory[ix + 3] = floatBitsToUint(s.end.y);
+    write_mem(a, ix + 0, floatBitsToUint(s.start.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.start.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.end.x));
+    write_mem(a, ix + 3, floatBitsToUint(s.end.y));
 }
 
-CmdStroke CmdStroke_read(CmdStrokeRef ref) {
+CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = uintBitsToFloat(raw1);
@@ -224,18 +224,18 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
     return s;
 }
 
-void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
+void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.tile_ref;
-    memory[ix + 1] = floatBitsToUint(s.half_width);
-    memory[ix + 2] = s.rgba_color;
+    write_mem(a, ix + 0, s.tile_ref);
+    write_mem(a, ix + 1, floatBitsToUint(s.half_width));
+    write_mem(a, ix + 2, s.rgba_color);
 }
 
-CmdFill CmdFill_read(CmdFillRef ref) {
+CmdFill CmdFill_read(Alloc a, CmdFillRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -243,189 +243,189 @@ CmdFill CmdFill_read(CmdFillRef ref) {
     return s;
 }
 
-void CmdFill_write(CmdFillRef ref, CmdFill s) {
+void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.tile_ref;
-    memory[ix + 1] = uint(s.backdrop);
-    memory[ix + 2] = s.rgba_color;
+    write_mem(a, ix + 0, s.tile_ref);
+    write_mem(a, ix + 1, uint(s.backdrop));
+    write_mem(a, ix + 2, s.rgba_color);
 }
 
-CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) {
+CmdBeginClip CmdBeginClip_read(Alloc a, CmdBeginClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
     CmdBeginClip s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
     return s;
 }
 
-void CmdBeginClip_write(CmdBeginClipRef ref, CmdBeginClip s) {
+void CmdBeginClip_write(Alloc a, CmdBeginClipRef ref, CmdBeginClip s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.tile_ref;
-    memory[ix + 1] = uint(s.backdrop);
+    write_mem(a, ix + 0, s.tile_ref);
+    write_mem(a, ix + 1, uint(s.backdrop));
 }
 
-CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) {
+CmdBeginSolidClip CmdBeginSolidClip_read(Alloc a, CmdBeginSolidClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
+    uint raw0 = read_mem(a, ix + 0);
     CmdBeginSolidClip s;
     s.alpha = uintBitsToFloat(raw0);
     return s;
 }
 
-void CmdBeginSolidClip_write(CmdBeginSolidClipRef ref, CmdBeginSolidClip s) {
+void CmdBeginSolidClip_write(Alloc a, CmdBeginSolidClipRef ref, CmdBeginSolidClip s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.alpha);
+    write_mem(a, ix + 0, floatBitsToUint(s.alpha));
 }
 
-CmdEndClip CmdEndClip_read(CmdEndClipRef ref) {
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
+    uint raw0 = read_mem(a, ix + 0);
     CmdEndClip s;
     s.alpha = uintBitsToFloat(raw0);
     return s;
 }
 
-void CmdEndClip_write(CmdEndClipRef ref, CmdEndClip s) {
+void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.alpha);
+    write_mem(a, ix + 0, floatBitsToUint(s.alpha));
 }
 
-CmdSolid CmdSolid_read(CmdSolidRef ref) {
+CmdSolid CmdSolid_read(Alloc a, CmdSolidRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
+    uint raw0 = read_mem(a, ix + 0);
     CmdSolid s;
     s.rgba_color = raw0;
     return s;
 }
 
-void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
+void CmdSolid_write(Alloc a, CmdSolidRef ref, CmdSolid s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.rgba_color;
+    write_mem(a, ix + 0, s.rgba_color);
 }
 
-CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) {
+CmdSolidMask CmdSolidMask_read(Alloc a, CmdSolidMaskRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
+    uint raw0 = read_mem(a, ix + 0);
     CmdSolidMask s;
     s.mask = uintBitsToFloat(raw0);
     return s;
 }
 
-void CmdSolidMask_write(CmdSolidMaskRef ref, CmdSolidMask s) {
+void CmdSolidMask_write(Alloc a, CmdSolidMaskRef ref, CmdSolidMask s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.mask);
+    write_mem(a, ix + 0, floatBitsToUint(s.mask));
 }
 
-CmdJump CmdJump_read(CmdJumpRef ref) {
+CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
+    uint raw0 = read_mem(a, ix + 0);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
-void CmdJump_write(CmdJumpRef ref, CmdJump s) {
+void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.new_ref;
+    write_mem(a, ix + 0, s.new_ref);
 }
 
-uint Cmd_tag(CmdRef ref) {
-    return memory[ref.offset >> 2];
+uint Cmd_tag(Alloc a, CmdRef ref) {
+    return read_mem(a, ref.offset >> 2);
 }
 
-CmdCircle Cmd_Circle_read(CmdRef ref) {
-    return CmdCircle_read(CmdCircleRef(ref.offset + 4));
+CmdCircle Cmd_Circle_read(Alloc a, CmdRef ref) {
+    return CmdCircle_read(a, CmdCircleRef(ref.offset + 4));
 }
 
-CmdLine Cmd_Line_read(CmdRef ref) {
-    return CmdLine_read(CmdLineRef(ref.offset + 4));
+CmdLine Cmd_Line_read(Alloc a, CmdRef ref) {
+    return CmdLine_read(a, CmdLineRef(ref.offset + 4));
 }
 
-CmdFill Cmd_Fill_read(CmdRef ref) {
-    return CmdFill_read(CmdFillRef(ref.offset + 4));
+CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) {
+    return CmdFill_read(a, CmdFillRef(ref.offset + 4));
 }
 
-CmdBeginClip Cmd_BeginClip_read(CmdRef ref) {
-    return CmdBeginClip_read(CmdBeginClipRef(ref.offset + 4));
+CmdBeginClip Cmd_BeginClip_read(Alloc a, CmdRef ref) {
+    return CmdBeginClip_read(a, CmdBeginClipRef(ref.offset + 4));
 }
 
-CmdBeginSolidClip Cmd_BeginSolidClip_read(CmdRef ref) {
-    return CmdBeginSolidClip_read(CmdBeginSolidClipRef(ref.offset + 4));
+CmdBeginSolidClip Cmd_BeginSolidClip_read(Alloc a, CmdRef ref) {
+    return CmdBeginSolidClip_read(a, CmdBeginSolidClipRef(ref.offset + 4));
 }
 
-CmdEndClip Cmd_EndClip_read(CmdRef ref) {
-    return CmdEndClip_read(CmdEndClipRef(ref.offset + 4));
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) {
+    return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4));
 }
 
-CmdStroke Cmd_Stroke_read(CmdRef ref) {
-    return CmdStroke_read(CmdStrokeRef(ref.offset + 4));
+CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) {
+    return CmdStroke_read(a, CmdStrokeRef(ref.offset + 4));
 }
 
-CmdSolid Cmd_Solid_read(CmdRef ref) {
-    return CmdSolid_read(CmdSolidRef(ref.offset + 4));
+CmdSolid Cmd_Solid_read(Alloc a, CmdRef ref) {
+    return CmdSolid_read(a, CmdSolidRef(ref.offset + 4));
 }
 
-CmdSolidMask Cmd_SolidMask_read(CmdRef ref) {
-    return CmdSolidMask_read(CmdSolidMaskRef(ref.offset + 4));
+CmdSolidMask Cmd_SolidMask_read(Alloc a, CmdRef ref) {
+    return CmdSolidMask_read(a, CmdSolidMaskRef(ref.offset + 4));
 }
 
-CmdJump Cmd_Jump_read(CmdRef ref) {
-    return CmdJump_read(CmdJumpRef(ref.offset + 4));
+CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
+    return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
 }
 
-void Cmd_End_write(CmdRef ref) {
-    memory[ref.offset >> 2] = Cmd_End;
+void Cmd_End_write(Alloc a, CmdRef ref) {
+    write_mem(a, ref.offset >> 2, Cmd_End);
 }
 
-void Cmd_Circle_write(CmdRef ref, CmdCircle s) {
-    memory[ref.offset >> 2] = Cmd_Circle;
-    CmdCircle_write(CmdCircleRef(ref.offset + 4), s);
+void Cmd_Circle_write(Alloc a, CmdRef ref, CmdCircle s) {
+    write_mem(a, ref.offset >> 2, Cmd_Circle);
+    CmdCircle_write(a, CmdCircleRef(ref.offset + 4), s);
 }
 
-void Cmd_Line_write(CmdRef ref, CmdLine s) {
-    memory[ref.offset >> 2] = Cmd_Line;
-    CmdLine_write(CmdLineRef(ref.offset + 4), s);
+void Cmd_Line_write(Alloc a, CmdRef ref, CmdLine s) {
+    write_mem(a, ref.offset >> 2, Cmd_Line);
+    CmdLine_write(a, CmdLineRef(ref.offset + 4), s);
 }
 
-void Cmd_Fill_write(CmdRef ref, CmdFill s) {
-    memory[ref.offset >> 2] = Cmd_Fill;
-    CmdFill_write(CmdFillRef(ref.offset + 4), s);
+void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) {
+    write_mem(a, ref.offset >> 2, Cmd_Fill);
+    CmdFill_write(a, CmdFillRef(ref.offset + 4), s);
 }
 
-void Cmd_BeginClip_write(CmdRef ref, CmdBeginClip s) {
-    memory[ref.offset >> 2] = Cmd_BeginClip;
-    CmdBeginClip_write(CmdBeginClipRef(ref.offset + 4), s);
+void Cmd_BeginClip_write(Alloc a, CmdRef ref, CmdBeginClip s) {
+    write_mem(a, ref.offset >> 2, Cmd_BeginClip);
+    CmdBeginClip_write(a, CmdBeginClipRef(ref.offset + 4), s);
 }
 
-void Cmd_BeginSolidClip_write(CmdRef ref, CmdBeginSolidClip s) {
-    memory[ref.offset >> 2] = Cmd_BeginSolidClip;
-    CmdBeginSolidClip_write(CmdBeginSolidClipRef(ref.offset + 4), s);
+void Cmd_BeginSolidClip_write(Alloc a, CmdRef ref, CmdBeginSolidClip s) {
+    write_mem(a, ref.offset >> 2, Cmd_BeginSolidClip);
+    CmdBeginSolidClip_write(a, CmdBeginSolidClipRef(ref.offset + 4), s);
 }
 
-void Cmd_EndClip_write(CmdRef ref, CmdEndClip s) {
-    memory[ref.offset >> 2] = Cmd_EndClip;
-    CmdEndClip_write(CmdEndClipRef(ref.offset + 4), s);
+void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) {
+    write_mem(a, ref.offset >> 2, Cmd_EndClip);
+    CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s);
 }
 
-void Cmd_Stroke_write(CmdRef ref, CmdStroke s) {
-    memory[ref.offset >> 2] = Cmd_Stroke;
-    CmdStroke_write(CmdStrokeRef(ref.offset + 4), s);
+void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) {
+    write_mem(a, ref.offset >> 2, Cmd_Stroke);
+    CmdStroke_write(a, CmdStrokeRef(ref.offset + 4), s);
 }
 
-void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
-    memory[ref.offset >> 2] = Cmd_Solid;
-    CmdSolid_write(CmdSolidRef(ref.offset + 4), s);
+void Cmd_Solid_write(Alloc a, CmdRef ref, CmdSolid s) {
+    write_mem(a, ref.offset >> 2, Cmd_Solid);
+    CmdSolid_write(a, CmdSolidRef(ref.offset + 4), s);
 }
 
-void Cmd_SolidMask_write(CmdRef ref, CmdSolidMask s) {
-    memory[ref.offset >> 2] = Cmd_SolidMask;
-    CmdSolidMask_write(CmdSolidMaskRef(ref.offset + 4), s);
+void Cmd_SolidMask_write(Alloc a, CmdRef ref, CmdSolidMask s) {
+    write_mem(a, ref.offset >> 2, Cmd_SolidMask);
+    CmdSolidMask_write(a, CmdSolidMaskRef(ref.offset + 4), s);
 }
 
-void Cmd_Jump_write(CmdRef ref, CmdJump s) {
-    memory[ref.offset >> 2] = Cmd_Jump;
-    CmdJump_write(CmdJumpRef(ref.offset + 4), s);
+void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
+    write_mem(a, ref.offset >> 2, Cmd_Jump);
+    CmdJump_write(a, CmdJumpRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index bcfa510..5a4935c 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -30,9 +30,9 @@ struct Config {
     uint n_pathseg;
     uint width_in_tiles;
     uint height_in_tiles;
-    uint tile_base;
-    uint bin_base;
-    uint ptcl_base;
-    uint pathseg_base;
-    uint anno_base;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
 };
diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h
index 133ff53..500277b 100644
--- a/piet-gpu/shader/tile.h
+++ b/piet-gpu/shader/tile.h
@@ -49,48 +49,48 @@ TileSegRef TileSeg_index(TileSegRef ref, uint index) {
     return TileSegRef(ref.offset + index * TileSeg_size);
 }
 
-Path Path_read(PathRef ref) {
+Path Path_read(Alloc a, PathRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
     Path s;
     s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
     s.tiles = TileRef(raw2);
     return s;
 }
 
-void Path_write(PathRef ref, Path s) {
+void Path_write(Alloc a, PathRef ref, Path s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.bbox.x | (s.bbox.y << 16);
-    memory[ix + 1] = s.bbox.z | (s.bbox.w << 16);
-    memory[ix + 2] = s.tiles.offset;
+    write_mem(a, ix + 0, s.bbox.x | (s.bbox.y << 16));
+    write_mem(a, ix + 1, s.bbox.z | (s.bbox.w << 16));
+    write_mem(a, ix + 2, s.tiles.offset);
 }
 
-Tile Tile_read(TileRef ref) {
+Tile Tile_read(Alloc a, TileRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
     Tile s;
     s.tile = TileSegRef(raw0);
     s.backdrop = int(raw1);
     return s;
 }
 
-void Tile_write(TileRef ref, Tile s) {
+void Tile_write(Alloc a, TileRef ref, Tile s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = s.tile.offset;
-    memory[ix + 1] = uint(s.backdrop);
+    write_mem(a, ix + 0, s.tile.offset);
+    write_mem(a, ix + 1, uint(s.backdrop));
 }
 
-TileSeg TileSeg_read(TileSegRef ref) {
+TileSeg TileSeg_read(Alloc a, TileSegRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = memory[ix + 0];
-    uint raw1 = memory[ix + 1];
-    uint raw2 = memory[ix + 2];
-    uint raw3 = memory[ix + 3];
-    uint raw4 = memory[ix + 4];
-    uint raw5 = memory[ix + 5];
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
     TileSeg s;
     s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -99,13 +99,13 @@ TileSeg TileSeg_read(TileSegRef ref) {
     return s;
 }
 
-void TileSeg_write(TileSegRef ref, TileSeg s) {
+void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) {
     uint ix = ref.offset >> 2;
-    memory[ix + 0] = floatBitsToUint(s.origin.x);
-    memory[ix + 1] = floatBitsToUint(s.origin.y);
-    memory[ix + 2] = floatBitsToUint(s.vector.x);
-    memory[ix + 3] = floatBitsToUint(s.vector.y);
-    memory[ix + 4] = floatBitsToUint(s.y_edge);
-    memory[ix + 5] = s.next.offset;
+    write_mem(a, ix + 0, floatBitsToUint(s.origin.x));
+    write_mem(a, ix + 1, floatBitsToUint(s.origin.y));
+    write_mem(a, ix + 2, floatBitsToUint(s.vector.x));
+    write_mem(a, ix + 3, floatBitsToUint(s.vector.y));
+    write_mem(a, ix + 4, floatBitsToUint(s.y_edge));
+    write_mem(a, ix + 5, s.next.offset);
 }
 
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
index 6588227..f0d42da 100644
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -5,8 +5,8 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
-#include "setup.h"
 #include "mem.h"
+#include "setup.h"
 
 #define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
 #define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
@@ -25,21 +25,21 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 
 shared uint sh_tile_count[TILE_ALLOC_WG];
-shared Alloc sh_tile_alloc;
+shared MallocResult sh_tile_alloc;
 
 void main() {
-    if (mem_overflow) {
+    if (mem_error != NO_ERROR) {
         return;
     }
 
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size);
-    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
+    PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
 
     uint tag = Annotated_Nop;
     if (element_ix < conf.n_elements) {
-        tag = Annotated_tag(ref);
+        tag = Annotated_tag(conf.anno_alloc, ref);
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
     switch (tag) {
@@ -49,7 +49,7 @@ void main() {
     case Annotated_EndClip:
         // Note: we take advantage of the fact that fills, strokes, and
         // clips have compatible layout.
-        AnnoFill fill = Annotated_Fill_read(ref);
+        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
         x0 = int(floor(fill.bbox.x * SX));
         y0 = int(floor(fill.bbox.y * SY));
         x1 = int(ceil(fill.bbox.z * SX));
@@ -71,36 +71,38 @@ void main() {
     }
 
     sh_tile_count[th_ix] = tile_count;
+    uint total_tile_count = tile_count;
     // Prefix sum of sh_tile_count
     for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
         barrier();
         if (th_ix >= (1 << i)) {
-            tile_count += sh_tile_count[th_ix - (1 << i)];
+            total_tile_count += sh_tile_count[th_ix - (1 << i)];
         }
         barrier();
-        sh_tile_count[th_ix] = tile_count;
+        sh_tile_count[th_ix] = total_tile_count;
     }
     if (th_ix == TILE_ALLOC_WG - 1) {
-        sh_tile_alloc = malloc(tile_count * Tile_size);
+        sh_tile_alloc = malloc(total_tile_count * Tile_size);
     }
     barrier();
-    Alloc alloc_start = sh_tile_alloc;
+    MallocResult alloc_start = sh_tile_alloc;
     if (alloc_start.failed) {
         return;
     }
 
     if (element_ix < conf.n_elements) {
         uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
-        path.tiles = TileRef(alloc_start.offset + Tile_size * tile_subix);
-        Path_write(path_ref, path);
+        Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
+        path.tiles = TileRef(tiles_alloc.offset);
+        Path_write(conf.tile_alloc, path_ref, path);
     }
 
     // Zero out allocated tiles efficiently
     uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
-    uint start_ix = alloc_start.offset >> 2;
+    uint start_ix = alloc_start.alloc.offset >> 2;
     for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
         // Note: this interleaving is faster than using Tile_write
         // by a significant amount.
-        memory[start_ix + i] = 0;
+        write_mem(alloc_start.alloc, start_ix + i, 0);
     }
 }
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv
index 7a80ad7..b256392 100644
Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/tile_alloc.spv differ