diff --git a/piet-gpu-types/src/annotated.rs b/piet-gpu-types/src/annotated.rs
index 04f2111..cedbd3f 100644
--- a/piet-gpu-types/src/annotated.rs
+++ b/piet-gpu-types/src/annotated.rs
@@ -4,16 +4,18 @@ piet_gpu! {
     #[gpu_write]
     mod annotated {
         struct AnnoFill {
-            rgba_color: u32,
+            // The bbox is always first, as we take advantage of common
+            // layout when binning.
             bbox: [f32; 4],
+            rgba_color: u32,
         }
         struct AnnoFillMask {
-            mask: f32,
             bbox: [f32; 4],
+            mask: f32,
         }
         struct AnnoStroke {
-            rgba_color: u32,
             bbox: [f32; 4],
+            rgba_color: u32,
             // For the nonuniform scale case, this needs to be a 2x2 matrix.
             // That's expected to be uncommon, so we could special-case it.
             linewidth: f32,
diff --git a/piet-gpu-types/src/bins.rs b/piet-gpu-types/src/bins.rs
index 1ac2413..88f16f1 100644
--- a/piet-gpu-types/src/bins.rs
+++ b/piet-gpu-types/src/bins.rs
@@ -7,9 +7,6 @@ piet_gpu! {
     mod bins {
         struct BinInstance {
             element_ix: u32,
-            // Right edge of the bounding box of the associated fill
-            // element; used in backdrop computation.
-            right_edge: f32,
         }
 
         struct BinChunk {
diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index d05218b..86e4572 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -34,6 +34,11 @@ piet_gpu! {
             tile_ref: u32,
             backdrop: i32,
         }
+        // This is mostly here for expedience and can always be optimized
+        // out for pure clips, but will be useful for blend groups.
+        struct CmdBeginSolidClip {
+            alpha: f32,
+        }
         struct CmdEndClip {
             // This will be 1.0 for clips, but we can imagine blend groups.
             alpha: f32,
@@ -55,6 +60,7 @@ piet_gpu! {
             FillMask(CmdFillMask),
             FillMaskInv(CmdFillMask),
             BeginClip(CmdBeginClip),
+            BeginSolidClip(CmdBeginSolidClip),
             EndClip(CmdEndClip),
             Stroke(CmdStroke),
             Solid(CmdSolid),
diff --git a/piet-gpu-types/src/scene.rs b/piet-gpu-types/src/scene.rs
index 7e2fb43..855b500 100644
--- a/piet-gpu-types/src/scene.rs
+++ b/piet-gpu-types/src/scene.rs
@@ -1,7 +1,7 @@
 use piet_gpu_derive::piet_gpu;
 
 pub use self::scene::{
-    BeginClip, CubicSeg, Element, EndClip, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform,
+    Clip, CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform,
 };
 
 piet_gpu! {
@@ -38,15 +38,10 @@ piet_gpu! {
             mat: [f32; 4],
             translate: [f32; 2],
         }
-        struct BeginClip {
+        struct Clip {
             bbox: [f32; 4],
             // TODO: add alpha?
         }
-        struct EndClip {
-            // The delta between the BeginClip and EndClip element indices.
-            // It is stored as a delta to facilitate binary string concatenation.
-            delta: u32,
-        }
         enum Element {
             Nop,
             // Another approach to encoding would be to use a single
@@ -66,8 +61,8 @@ piet_gpu! {
             Transform(Transform),
             FillMask(FillMask),
             FillMaskInv(FillMask),
-            BeginClip(BeginClip),
-            EndClip(EndClip),
+            BeginClip(Clip),
+            EndClip(Clip),
         }
     }
 }
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 7876f9e..dfed520 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -165,6 +165,38 @@ fn trace_ptcl(buf: &[u32]) {
                             }
                         }
                     }
+                    6 => {
+                        let backdrop = buf[tile_offset / 4 + 2];
+                        println!("  {:x}: begin_clip {}", tile_offset, backdrop);
+                        // Dump every chunk in the list, not just the first one: word 0 is
+                        // the segment count, word 1 the next chunk (0 ends), word 2 the segs.
+                        let mut seg_chunk = buf[tile_offset / 4 + 1] as usize;
+                        while seg_chunk != 0 {
+                            let n = buf[seg_chunk / 4] as usize;
+                            let segs = buf[seg_chunk / 4 + 2] as usize;
+                            println!("    chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
+                            for i in 0..n {
+                                let base = segs / 4 + i * 5;
+                                let x0 = f32::from_bits(buf[base]);
+                                let y0 = f32::from_bits(buf[base + 1]);
+                                let x1 = f32::from_bits(buf[base + 2]);
+                                let y1 = f32::from_bits(buf[base + 3]);
+                                let y_edge = f32::from_bits(buf[base + 4]);
+                                println!(
+                                    "      ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}",
+                                    x0, y0, x1, y1, y_edge
+                                );
+                            }
+                            seg_chunk = buf[seg_chunk / 4 + 1] as usize;
+                        }
+                    }
+                    7 => {
+                        let alpha = f32::from_bits(buf[tile_offset / 4 + 1]); // f32 alpha payload
+                        println!("{:x}: solid_clip {}", tile_offset, alpha);
+                    }
+                    8 => {
+                        println!("{:x}: end_clip", tile_offset);
+                    }
                     _ => {
                         println!("{:x}: {}", tile_offset, tag);
                     }
@@ -246,9 +278,9 @@ fn main() -> Result<(), Error> {
 
         /*
         let mut data: Vec<u32> = Default::default();
-        device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
+        renderer.tile_buf.read(&mut data).unwrap();
         piet_gpu::dump_k1_data(&data);
-        //trace_ptcl(&data);
+        trace_ptcl(&data);
         */
 
         let mut img_data: Vec<u8> = Default::default();
diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h
index 986eff2..677e473 100644
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@@ -21,8 +21,8 @@ struct AnnotatedRef {
 };
 
 struct AnnoFill {
-    uint rgba_color;
     vec4 bbox;
+    uint rgba_color;
 };
 
 #define AnnoFill_size 20
@@ -32,8 +32,8 @@ AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) {
 }
 
 struct AnnoFillMask {
-    float mask;
     vec4 bbox;
+    float mask;
 };
 
 #define AnnoFillMask_size 20
@@ -43,8 +43,8 @@ AnnoFillMaskRef AnnoFillMask_index(AnnoFillMaskRef ref, uint index) {
 }
 
 struct AnnoStroke {
-    uint rgba_color;
     vec4 bbox;
+    uint rgba_color;
     float linewidth;
 };
 
@@ -85,18 +85,18 @@ AnnoFill AnnoFill_read(AnnoFillRef ref) {
     uint raw3 = annotated[ix + 3];
     uint raw4 = annotated[ix + 4];
     AnnoFill s;
-    s.rgba_color = raw0;
-    s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
+    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.rgba_color = raw4;
     return s;
 }
 
 void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
     uint ix = ref.offset >> 2;
-    annotated[ix + 0] = s.rgba_color;
-    annotated[ix + 1] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 4] = floatBitsToUint(s.bbox.w);
+    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
+    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
+    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
+    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
+    annotated[ix + 4] = s.rgba_color;
 }
 
 AnnoFillMask AnnoFillMask_read(AnnoFillMaskRef ref) {
@@ -107,18 +107,18 @@ AnnoFillMask AnnoFillMask_read(AnnoFillMaskRef ref) {
     uint raw3 = annotated[ix + 3];
     uint raw4 = annotated[ix + 4];
     AnnoFillMask s;
-    s.mask = uintBitsToFloat(raw0);
-    s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
+    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.mask = uintBitsToFloat(raw4);
     return s;
 }
 
 void AnnoFillMask_write(AnnoFillMaskRef ref, AnnoFillMask s) {
     uint ix = ref.offset >> 2;
-    annotated[ix + 0] = floatBitsToUint(s.mask);
-    annotated[ix + 1] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 4] = floatBitsToUint(s.bbox.w);
+    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
+    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
+    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
+    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
+    annotated[ix + 4] = floatBitsToUint(s.mask);
 }
 
 AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
@@ -130,19 +130,19 @@ AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
     uint raw4 = annotated[ix + 4];
     uint raw5 = annotated[ix + 5];
     AnnoStroke s;
-    s.rgba_color = raw0;
-    s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
+    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.rgba_color = raw4;
     s.linewidth = uintBitsToFloat(raw5);
     return s;
 }
 
 void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
     uint ix = ref.offset >> 2;
-    annotated[ix + 0] = s.rgba_color;
-    annotated[ix + 1] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 4] = floatBitsToUint(s.bbox.w);
+    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
+    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
+    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
+    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
+    annotated[ix + 4] = s.rgba_color;
     annotated[ix + 5] = floatBitsToUint(s.linewidth);
 }
 
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 20c6ce9..6fa9f9f 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -57,6 +57,7 @@ void main() {
         case Annotated_Fill:
         case Annotated_FillMask:
         case Annotated_FillMaskInv:
+        case Annotated_BeginClip:
             PathRef path_ref = PathRef(element_ix * Path_size);
             Path path = Path_read(path_ref);
             sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 7f0852d..ee4dda8 100644
Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index ee3301b..5dce813 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -57,22 +57,20 @@ void main() {
         tag = Annotated_tag(ref);
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    float my_right_edge = INFINITY;
     switch (tag) {
     case Annotated_Fill:
     case Annotated_FillMask:
     case Annotated_FillMaskInv:
     case Annotated_Stroke:
-        // Note: we take advantage of the fact that fills and strokes
-        // have compatible layout.
+    case Annotated_BeginClip:
+    case Annotated_EndClip:
+        // Note: we take advantage of the fact that these drawing elements
+        // have the bbox at the same place in their layout.
         AnnoFill fill = Annotated_Fill_read(ref);
         x0 = int(floor(fill.bbox.x * SX));
         y0 = int(floor(fill.bbox.y * SY));
         x1 = int(ceil(fill.bbox.z * SX));
         y1 = int(ceil(fill.bbox.w * SY));
-        // It probably makes more sense to track x1, to avoid having to redo
-        // the rounding to tile coords.
-        my_right_edge = fill.bbox.z;
         break;
     }
 
@@ -131,7 +129,7 @@ void main() {
                 idx += count[my_slice - 1][bin_ix];
             }
             uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
-            BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge));
+            BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix));
         }
         x++;
         if (x == x1) {
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 8f44c89..2c923c4 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h
index 85f7536..3ce06e0 100644
--- a/piet-gpu/shader/bins.h
+++ b/piet-gpu/shader/bins.h
@@ -10,10 +10,9 @@ struct BinChunkRef {
 
 struct BinInstance {
     uint element_ix;
-    float right_edge;
 };
 
-#define BinInstance_size 8
+#define BinInstance_size 4
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
     return BinInstanceRef(ref.offset + index * BinInstance_size);
@@ -33,17 +32,14 @@ BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
 BinInstance BinInstance_read(BinInstanceRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = bins[ix + 0];
-    uint raw1 = bins[ix + 1];
     BinInstance s;
     s.element_ix = raw0;
-    s.right_edge = uintBitsToFloat(raw1);
     return s;
 }
 
 void BinInstance_write(BinInstanceRef ref, BinInstance s) {
     uint ix = ref.offset >> 2;
     bins[ix + 0] = s.element_ix;
-    bins[ix + 1] = floatBitsToUint(s.right_edge);
 }
 
 BinChunk BinChunk_read(BinChunkRef ref) {
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 16573e8..12ec348 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -45,7 +45,6 @@ layout(set = 0, binding = 4) buffer PtclBuf {
 #define N_PART_READ (1 << LG_N_PART_READ)
 
 shared uint sh_elements[N_TILE];
-shared float sh_right_edge[N_TILE];
 
 // Number of elements in the partition; prefix sum.
 shared uint sh_part_count[N_PART_READ];
@@ -148,7 +147,6 @@ void main() {
                 BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
                 BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
                 sh_elements[th_ix] = inst.element_ix;
-                sh_right_edge[th_ix] = inst.right_edge;
             }
             barrier();
 
@@ -161,10 +159,8 @@ void main() {
         uint tag = Annotated_Nop;
         uint element_ix;
         AnnotatedRef ref;
-        float right_edge = 0.0;
         if (th_ix + rd_ix < wr_ix) {
             element_ix = sh_elements[th_ix];
-            right_edge = sh_right_edge[th_ix];
             ref = AnnotatedRef(element_ix * Annotated_size);
             tag = Annotated_tag(ref);
         }
@@ -173,13 +169,11 @@ void main() {
         uint tile_count;
         switch (tag) {
         case Annotated_Fill:
-        case Annotated_FillMask:
-        case Annotated_FillMaskInv:
         case Annotated_Stroke:
-            // Because the only elements we're processing right now are
-            // paths, we can just use the element index as the path index.
-            // In future, when we're doing a bunch of stuff, the path index
-            // should probably be stored in the annotated element.
+        case Annotated_BeginClip:
+        case Annotated_EndClip:
+            // We have one "path" for each element, even if the element isn't
+            // actually a path (currently EndClip, but images etc in the future).
             uint path_ix = element_ix;
             Path path = Path_read(PathRef(path_ix * Path_size));
             uint stride = path.bbox.z - path.bbox.x;
@@ -224,20 +218,23 @@ void main() {
                     el_ix = probe;
                 }
             }
-            AnnotatedRef ref = AnnotatedRef(el_ix * Annotated_size);
+            AnnotatedRef ref = AnnotatedRef(sh_elements[el_ix] * Annotated_size);
             uint tag = Annotated_tag(ref);
             uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + seq_ix % width;
             uint y = sh_tile_y0[el_ix] + seq_ix / width;
-            Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
-            // Include the path in the tile if
-            // - the tile contains at least a segment (tile offset non-zero)
-            // - the tile is completely covered (backdrop non-zero)
-            // - the tile is not covered and we're filling everything outside the path (backdrop zero, inverse fills).
-            bool inside = tile.backdrop != 0;
-            bool fill = tag != Annotated_FillMaskInv;
-            if (tile.tile.offset != 0 || inside == fill) {
+            bool include_tile;
+            if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
+                include_tile = true;
+            } else {
+                Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+                // Include the path in the tile if
+                // - the tile contains at least a segment (tile offset non-zero)
+                // - the tile is completely covered (backdrop non-zero)
+                include_tile = tile.backdrop != 0 || tile.tile.offset != 0;
+            }
+            if (include_tile) {
                 uint el_slice = el_ix / 32;
                 uint el_mask = 1 << (el_ix & 31);
                 atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
@@ -247,8 +244,7 @@ void main() {
         barrier();
 
         // Output non-segment elements for this tile. The thread does a sequential walk
-        // through the non-segment elements, and for segments, count and backdrop are
-        // aggregated using bit counting.
+        // through the non-segment elements.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
         while (true) {
@@ -291,27 +287,27 @@ void main() {
                 }
                 cmd_ref.offset += Cmd_size;
                 break;
-            case Annotated_FillMask:
-            case Annotated_FillMaskInv:
+            case Annotated_BeginClip:
                 tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
                     + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                AnnoFillMask fill_mask = Annotated_FillMask_read(ref);
                 alloc_cmd(cmd_ref, cmd_limit);
                 if (tile.tile.offset != 0) {
-                    CmdFillMask cmd_fill;
-                    cmd_fill.tile_ref = tile.tile.offset;
-                    cmd_fill.backdrop = tile.backdrop;
-                    cmd_fill.mask = fill_mask.mask;
-                    if (tag == Annotated_FillMask) {
-                        Cmd_FillMask_write(cmd_ref, cmd_fill);
-                    } else {
-                        Cmd_FillMaskInv_write(cmd_ref, cmd_fill);
-                    }
+                    CmdBeginClip cmd_begin_clip;
+                    cmd_begin_clip.tile_ref = tile.tile.offset;
+                    cmd_begin_clip.backdrop = tile.backdrop;
+                    Cmd_BeginClip_write(cmd_ref, cmd_begin_clip);
                 } else {
-                    Cmd_SolidMask_write(cmd_ref, CmdSolidMask(fill_mask.mask));
+                    // TODO: here is where a bunch of optimization magic should happen
+                    float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
+                    Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha));
                 }
                 cmd_ref.offset += Cmd_size;
                 break;
+            case Annotated_EndClip:
+                alloc_cmd(cmd_ref, cmd_limit);
+                Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0));
+                cmd_ref.offset += Cmd_size;
+                break;
             case Annotated_Stroke:
                 tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
                     + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index 17bae64..436ac1a 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 8bb9a4d..7606554 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -132,9 +132,13 @@ State map_element(ElementRef ref) {
     case Element_FillMask:
     case Element_FillMaskInv:
     case Element_Stroke:
+    case Element_BeginClip:
         c.flags = FLAG_RESET_BBOX;
         c.path_count = 1;
         break;
+    case Element_EndClip:
+        c.path_count = 1;
+        break;
     case Element_SetLineWidth:
         SetLineWidth lw = Element_SetLineWidth_read(ref);
         c.linewidth = lw.width;
@@ -421,6 +425,21 @@ void main() {
             out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
             Annotated_FillMaskInv_write(out_ref, anno_fill_mask);
             break;
+        case Element_BeginClip:
+            Clip begin_clip = Element_BeginClip_read(this_ref);
+            // This is the absolute bbox, it's been transformed during encoding,
+            // so it is passed through to the annotated element unchanged.
+            AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
+            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            Annotated_BeginClip_write(out_ref, anno_begin_clip);
+            break;
+        case Element_EndClip:
+            Clip end_clip = Element_EndClip_read(this_ref);
+            // This bbox is expected to be the same as the begin one.
+            AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
+            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            Annotated_EndClip_write(out_ref, anno_end_clip);
+            break;
         }
     }
 }
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index fed60fb..73ff3f4 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 919d120..a7d5e92 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -149,12 +149,20 @@ void main() {
             }
             blend_sp++;
             break;
+        case Cmd_BeginSolidClip:
+            CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
+            float solid_alpha = begin_solid_clip.alpha;
+            for (uint k = 0; k < CHUNK; k++) {
+                blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
+            }
+            blend_sp++;
+            break;
         case Cmd_EndClip:
             CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
             blend_sp--;
             for (uint k = 0; k < CHUNK; k++) {
                 vec4 rgba = unpackUnorm4x8(blend_stack[blend_sp][k]);
-                rgb[k] = mix(rgb[k], rgba.rgb, end_clip.alpha * rgba.a);
+                rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
             }
             break;
         case Cmd_Solid:
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 0d38581..a02387a 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 33ab664..db8d47b 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -24,6 +24,10 @@ struct CmdBeginClipRef {
     uint offset;
 };
 
+struct CmdBeginSolidClipRef {
+    uint offset;
+};
+
 struct CmdEndClipRef {
     uint offset;
 };
@@ -122,6 +126,16 @@ CmdBeginClipRef CmdBeginClip_index(CmdBeginClipRef ref, uint index) {
     return CmdBeginClipRef(ref.offset + index * CmdBeginClip_size);
 }
 
+struct CmdBeginSolidClip {
+    float alpha;
+};
+
+#define CmdBeginSolidClip_size 4
+
+CmdBeginSolidClipRef CmdBeginSolidClip_index(CmdBeginSolidClipRef ref, uint index) {
+    return CmdBeginSolidClipRef(ref.offset + index * CmdBeginSolidClip_size);
+}
+
 struct CmdEndClip {
     float alpha;
 };
@@ -169,11 +183,12 @@ CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
 #define Cmd_FillMask 4
 #define Cmd_FillMaskInv 5
 #define Cmd_BeginClip 6
-#define Cmd_EndClip 7
-#define Cmd_Stroke 8
-#define Cmd_Solid 9
-#define Cmd_SolidMask 10
-#define Cmd_Jump 11
+#define Cmd_BeginSolidClip 7
+#define Cmd_EndClip 8
+#define Cmd_Stroke 9
+#define Cmd_Solid 10
+#define Cmd_SolidMask 11
+#define Cmd_Jump 12
 #define Cmd_size 20
 
 CmdRef Cmd_index(CmdRef ref, uint index) {
@@ -318,6 +333,19 @@ void CmdBeginClip_write(CmdBeginClipRef ref, CmdBeginClip s) {
     ptcl[ix + 1] = uint(s.backdrop);
 }
 
+CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = ptcl[ix + 0];
+    CmdBeginSolidClip s;
+    s.alpha = uintBitsToFloat(raw0);
+    return s;
+}
+
+void CmdBeginSolidClip_write(CmdBeginSolidClipRef ref, CmdBeginSolidClip s) {
+    uint ix = ref.offset >> 2;
+    ptcl[ix + 0] = floatBitsToUint(s.alpha);
+}
+
 CmdEndClip CmdEndClip_read(CmdEndClipRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = ptcl[ix + 0];
@@ -398,6 +426,10 @@ CmdBeginClip Cmd_BeginClip_read(CmdRef ref) {
     return CmdBeginClip_read(CmdBeginClipRef(ref.offset + 4));
 }
 
+CmdBeginSolidClip Cmd_BeginSolidClip_read(CmdRef ref) {
+    return CmdBeginSolidClip_read(CmdBeginSolidClipRef(ref.offset + 4));
+}
+
 CmdEndClip Cmd_EndClip_read(CmdRef ref) {
     return CmdEndClip_read(CmdEndClipRef(ref.offset + 4));
 }
@@ -452,6 +484,11 @@ void Cmd_BeginClip_write(CmdRef ref, CmdBeginClip s) {
     CmdBeginClip_write(CmdBeginClipRef(ref.offset + 4), s);
 }
 
+void Cmd_BeginSolidClip_write(CmdRef ref, CmdBeginSolidClip s) {
+    ptcl[ref.offset >> 2] = Cmd_BeginSolidClip;
+    CmdBeginSolidClip_write(CmdBeginSolidClipRef(ref.offset + 4), s);
+}
+
 void Cmd_EndClip_write(CmdRef ref, CmdEndClip s) {
     ptcl[ref.offset >> 2] = Cmd_EndClip;
     CmdEndClip_write(CmdEndClipRef(ref.offset + 4), s);
diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h
index 0a4a2ce..6558ad3 100644
--- a/piet-gpu/shader/scene.h
+++ b/piet-gpu/shader/scene.h
@@ -32,11 +32,7 @@ struct TransformRef {
     uint offset;
 };
 
-struct BeginClipRef {
-    uint offset;
-};
-
-struct EndClipRef {
+struct ClipRef {
     uint offset;
 };
 
@@ -131,24 +127,14 @@ TransformRef Transform_index(TransformRef ref, uint index) {
     return TransformRef(ref.offset + index * Transform_size);
 }
 
-struct BeginClip {
+struct Clip {
     vec4 bbox;
 };
 
-#define BeginClip_size 16
+#define Clip_size 16
 
-BeginClipRef BeginClip_index(BeginClipRef ref, uint index) {
-    return BeginClipRef(ref.offset + index * BeginClip_size);
-}
-
-struct EndClip {
-    uint clip_size;
-};
-
-#define EndClip_size 4
-
-EndClipRef EndClip_index(EndClipRef ref, uint index) {
-    return EndClipRef(ref.offset + index * EndClip_size);
+ClipRef Clip_index(ClipRef ref, uint index) {
+    return ClipRef(ref.offset + index * Clip_size);
 }
 
 #define Element_Nop 0
@@ -263,25 +249,17 @@ Transform Transform_read(TransformRef ref) {
     return s;
 }
 
-BeginClip BeginClip_read(BeginClipRef ref) {
+Clip Clip_read(ClipRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = scene[ix + 0];
     uint raw1 = scene[ix + 1];
     uint raw2 = scene[ix + 2];
     uint raw3 = scene[ix + 3];
-    BeginClip s;
+    Clip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     return s;
 }
 
-EndClip EndClip_read(EndClipRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    EndClip s;
-    s.clip_size = raw0;
-    return s;
-}
-
 uint Element_tag(ElementRef ref) {
     return scene[ref.offset >> 2];
 }
@@ -334,11 +312,11 @@ FillMask Element_FillMaskInv_read(ElementRef ref) {
     return FillMask_read(FillMaskRef(ref.offset + 4));
 }
 
-BeginClip Element_BeginClip_read(ElementRef ref) {
-    return BeginClip_read(BeginClipRef(ref.offset + 4));
+Clip Element_BeginClip_read(ElementRef ref) {
+    return Clip_read(ClipRef(ref.offset + 4));
 }
 
-EndClip Element_EndClip_read(ElementRef ref) {
-    return EndClip_read(EndClipRef(ref.offset + 4));
+Clip Element_EndClip_read(ElementRef ref) {
+    return Clip_read(ClipRef(ref.offset + 4));
 }
 
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
index 3e1e52f..de6e827 100644
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -50,8 +50,10 @@ void main() {
     case Annotated_FillMask:
     case Annotated_FillMaskInv:
     case Annotated_Stroke:
-        // Note: we take advantage of the fact that fills and strokes
-        // have compatible layout.
+    case Annotated_BeginClip:
+    case Annotated_EndClip:
+        // Note: we take advantage of the fact that fills, strokes, and
+        // clips have compatible layout.
         AnnoFill fill = Annotated_Fill_read(ref);
         x0 = int(floor(fill.bbox.x * SX));
         y0 = int(floor(fill.bbox.y * SY));
@@ -67,6 +69,11 @@ void main() {
     Path path;
     path.bbox = uvec4(x0, y0, x1, y1);
     uint tile_count = (x1 - x0) * (y1 - y0);
+    if (tag == Annotated_EndClip) {
+        // Don't actually allocate tiles for an end clip, but we do want
+        // the path structure (especially bbox) allocated for it.
+        tile_count = 0;
+    }
 
     sh_tile_count[th_ix] = tile_count;
     // Prefix sum of sh_tile_count
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv
index 449f4d6..93cdea6 100644
Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/tile_alloc.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index dc3c2c8..6eff190 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -7,7 +7,7 @@ pub use render_ctx::PietGpuRenderContext;
 
 use rand::{Rng, RngCore};
 
-use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
+use piet::kurbo::{BezPath, Circle, Point, Vec2};
 use piet::{Color, ImageFormat, RenderContext};
 
 use piet_gpu_types::encoder::Encode;
@@ -52,16 +52,23 @@ pub fn render_scene(rc: &mut impl RenderContext) {
         let circle = Circle::new(center, radius);
         rc.fill(circle, &color);
     }
-    /*
+    let _ = rc.save();
     let mut path = BezPath::new();
-    path.move_to((100.0, 1150.0));
-    path.line_to((200.0, 1200.0));
-    path.line_to((150.0, 1250.0));
+    path.move_to((200.0, 150.0));
+    path.line_to((100.0, 200.0));
+    path.line_to((150.0, 250.0));
+    path.close_path();
+    rc.clip(path);
+
+    let mut path = BezPath::new();
+    path.move_to((100.0, 150.0));
+    path.line_to((200.0, 200.0));
+    path.line_to((150.0, 250.0));
     path.close_path();
     rc.fill(path, &Color::rgb8(128, 0, 128));
-    */
+    let _ = rc.restore();
     rc.stroke(
-        Line::new((100.0, 100.0), (200.0, 150.0)),
+        piet::kurbo::Line::new((100.0, 100.0), (200.0, 150.0)),
         &Color::WHITE,
         5.0,
     );
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index b3aa895..73d1f27 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -1,9 +1,9 @@
-use std::{borrow::Cow, convert::TryInto, ops::RangeBounds};
+use std::{borrow::Cow, ops::RangeBounds};
 
 use piet_gpu_types::encoder::{Encode, Encoder};
 
 use piet_gpu_types::scene::{
-    BeginClip, CubicSeg, Element, EndClip, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform,
+    Clip, CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform,
 };
 
 use piet::{
@@ -53,6 +53,10 @@ pub enum PietGpuBrush {
 #[derive(Default)]
 struct State {
     /// The transform relative to the parent state.
+    rel_transform: Affine,
+    /// The transform at the parent state.
+    ///
+    /// This invariant should hold: transform * rel_transform = cur_transform
     transform: Affine,
     n_clip: usize,
 }
@@ -61,8 +65,6 @@ struct ClipElement {
     /// Index of BeginClip element in element vec, for bbox fixup.
     begin_ix: usize,
     bbox: Option<Rect>,
-    /// The transform relative to the next clip element on the stack.
-    transform: Affine,
 }
 
 const TOLERANCE: f64 = 0.25;
@@ -168,18 +170,20 @@ impl RenderContext for PietGpuRenderContext {
     fn fill_even_odd(&mut self, _shape: impl Shape, _brush: &impl IntoBrush<Self>) {}
 
     fn clip(&mut self, shape: impl Shape) {
-        let begin_ix = self.elements.len();
         let path = shape.path_elements(TOLERANCE);
         self.encode_path(path, true);
-        self.elements.push(Element::BeginClip(BeginClip {
+        let begin_ix = self.elements.len();
+        self.elements.push(Element::BeginClip(Clip {
             bbox: Default::default(),
         }));
         self.clip_stack.push(ClipElement {
             bbox: None,
             begin_ix,
-            transform: Affine::default(),
         });
         self.path_count += 1;
+        if let Some(tos) = self.state_stack.last_mut() {
+            tos.n_clip += 1;
+        }
     }
 
     fn text(&mut self) -> &mut Self::Text {
@@ -189,18 +193,22 @@ impl RenderContext for PietGpuRenderContext {
     fn draw_text(&mut self, _layout: &Self::TextLayout, _pos: impl Into<Point>) {}
 
     fn save(&mut self) -> Result<(), Error> {
-        self.state_stack.push(Default::default());
+        self.state_stack.push(State {
+            rel_transform: Affine::default(),
+            transform: self.cur_transform,
+            n_clip: 0,
+        });
         Ok(())
     }
 
     fn restore(&mut self) -> Result<(), Error> {
         if let Some(state) = self.state_stack.pop() {
-            if state.transform != Affine::default() {
-                let a_inv = state.transform.inverse();
+            if state.rel_transform != Affine::default() {
+                let a_inv = state.rel_transform.inverse();
                 self.elements
                     .push(Element::Transform(to_scene_transform(a_inv)));
-                self.cur_transform *= a_inv;
             }
+            self.cur_transform = state.transform;
             for _ in 0..state.n_clip {
                 self.pop_clip();
             }
@@ -211,6 +219,9 @@ impl RenderContext for PietGpuRenderContext {
     }
 
     fn finish(&mut self) -> Result<(), Error> {
+        for _ in 0..self.clip_stack.len() {
+            self.pop_clip();
+        }
         Ok(())
     }
 
@@ -218,10 +229,7 @@ impl RenderContext for PietGpuRenderContext {
         self.elements
             .push(Element::Transform(to_scene_transform(transform)));
         if let Some(tos) = self.state_stack.last_mut() {
-            tos.transform *= transform;
-        }
-        if let Some(tos) = self.clip_stack.last_mut() {
-            tos.transform *= transform;
+            tos.rel_transform *= transform;
         }
         self.cur_transform *= transform;
     }
@@ -392,23 +400,38 @@ impl PietGpuRenderContext {
 
     fn pop_clip(&mut self) {
         let tos = self.clip_stack.pop().unwrap();
-        let delta = (self.elements.len() - tos.begin_ix).try_into().unwrap();
-        self.elements.push(Element::EndClip(EndClip { delta }));
+        let bbox = tos.bbox.unwrap_or_default();
+        let bbox_f32_4 = rect_to_f32_4(bbox);
+        self.elements
+            .push(Element::EndClip(Clip { bbox: bbox_f32_4 }));
         self.path_count += 1;
+        if let Element::BeginClip(begin_clip) = &mut self.elements[tos.begin_ix] {
+            begin_clip.bbox = bbox_f32_4;
+        } else {
+            unreachable!("expected BeginClip, not found");
+        }
         if let Some(bbox) = tos.bbox {
-            if let Element::BeginClip(begin_clip) = &mut self.elements[tos.begin_ix] {
-                begin_clip.bbox = rect_to_f32_4(bbox);
-            } else {
-                unreachable!("expected BeginClip, not found");
-            }
-            self.accumulate_bbox(|| bbox);
+            self.union_bbox(bbox);
         }
     }
 
+    /// Accumulate a bbox.
+    ///
+    /// The bbox is given lazily as a closure, relative to the current transform.
+    /// It's lazy because we don't need to compute it unless we're inside a clip.
     fn accumulate_bbox(&mut self, f: impl FnOnce() -> Rect) {
-        if let Some(tos) = self.clip_stack.last_mut() {
+        if !self.clip_stack.is_empty() {
             let bbox = f();
-            let bbox = tos.transform.transform_rect_bbox(bbox);
+            let bbox = self.cur_transform.transform_rect_bbox(bbox);
+            self.union_bbox(bbox);
+        }
+    }
+
+    /// Accumulate an absolute bbox.
+    ///
+    /// The bbox is given already transformed into surface coordinates.
+    fn union_bbox(&mut self, bbox: Rect) {
+        if let Some(tos) = self.clip_stack.last_mut() {
             tos.bbox = if let Some(old_bbox) = tos.bbox {
                 Some(old_bbox.union(bbox))
             } else {
@@ -512,7 +535,12 @@ fn to_f32_2(point: Point) -> [f32; 2] {
 }
 
 fn rect_to_f32_4(rect: Rect) -> [f32; 4] {
-    [rect.x0 as f32, rect.y0 as f32, rect.x1 as f32, rect.y1 as f32]
+    [
+        rect.x0 as f32,
+        rect.y0 as f32,
+        rect.x1 as f32,
+        rect.y1 as f32,
+    ]
 }
 
 fn to_scene_transform(transform: Affine) -> Transform {