diff --git a/Cargo.lock b/Cargo.lock
index 737c033..78b6326 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -949,6 +949,14 @@ dependencies = [
  "piet-gpu-derive",
 ]
 
+[[package]]
+name = "piet-scene"
+version = "0.1.0"
+dependencies = [
+ "bytemuck",
+ "smallvec",
+]
+
 [[package]]
 name = "pkg-config"
 version = "0.3.22"
@@ -1139,9 +1147,9 @@ checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
 
 [[package]]
 name = "smallvec"
-version = "1.7.0"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309"
+checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
 
 [[package]]
 name = "smithay-client-toolkit"
diff --git a/Cargo.toml b/Cargo.toml
index bfa0030..b94b82b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,5 +5,6 @@ members = [
     "piet-gpu-derive",
     "piet-gpu-hal",
     "piet-gpu-types",
+    "piet-scene",
     "tests"
 ]
diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
index e8c29c3..14831ca 100644
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@@ -24,6 +24,14 @@ piet_gpu! {
             line_y: f32,
             line_c: f32,
         }
+        struct CmdRadGrad {
+            index: u32,
+            mat: [f32; 4],
+            xlat: [f32; 2],
+            c1: [f32; 2],
+            ra: f32,
+            roff: f32,
+        }
         struct CmdImage {
             index: u32,
             offset: [i16; 2],
@@ -31,6 +39,9 @@ piet_gpu! {
         struct CmdAlpha {
             alpha: f32,
         }
+        struct CmdEndClip {
+            blend: u32,
+        }
         struct CmdJump {
             new_ref: u32,
         }
@@ -42,9 +53,10 @@ piet_gpu! {
             Alpha(CmdAlpha),
             Color(CmdColor),
             LinGrad(CmdLinGrad),
+            RadGrad(CmdRadGrad),
             Image(CmdImage),
             BeginClip,
-            EndClip,
+            EndClip(CmdEndClip),
             Jump(CmdJump),
         }
     }
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index abe6ae1..70023af 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -6,7 +6,7 @@ use clap::{App, Arg};
 
 use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
 
-use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
 
 const WIDTH: usize = 2048;
 const HEIGHT: usize = 1536;
@@ -243,11 +243,7 @@ fn main() -> Result<(), Error> {
             if matches.is_present("flip") {
                 scale = -scale;
             }
-            let xml_str = std::fs::read_to_string(input).unwrap();
-            let start = std::time::Instant::now();
-            let svg = PicoSvg::load(&xml_str, scale).unwrap();
-            println!("parsing time: {:?}", start.elapsed());
-            test_scenes::render_svg(&mut ctx, &svg);
+            test_scenes::render_svg(&mut ctx, input, scale);
         } else {
             test_scenes::render_scene(&mut ctx);
         }
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index b1db5e0..3ca0742 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -2,7 +2,7 @@ use piet::kurbo::Point;
 use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
 use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
 
-use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
 
 use clap::{App, Arg};
 
@@ -29,25 +29,6 @@ fn main() -> Result<(), Error> {
         )
         .get_matches();
 
-    // Collect SVG if input
-    let svg = match matches.value_of("INPUT") {
-        Some(file) => {
-            let mut scale = matches
-                .value_of("scale")
-                .map(|scale| scale.parse().unwrap())
-                .unwrap_or(8.0);
-            if matches.is_present("flip") {
-                scale = -scale;
-            }
-            let xml_str = std::fs::read_to_string(file).unwrap();
-            let start = std::time::Instant::now();
-            let svg = PicoSvg::load(&xml_str, scale).unwrap();
-            println!("parsing time: {:?}", start.elapsed());
-            Some(svg)
-        }
-        None => None,
-    };
-
     let event_loop = EventLoop::new();
     let window = WindowBuilder::new()
         .with_inner_size(winit::dpi::LogicalSize {
@@ -125,8 +106,15 @@ fn main() -> Result<(), Error> {
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
-                    if let Some(svg) = &svg {
-                        test_scenes::render_svg(&mut ctx, svg);
+                    if let Some(input) = matches.value_of("INPUT") {
+                        let mut scale = matches
+                            .value_of("scale")
+                            .map(|scale| scale.parse().unwrap())
+                            .unwrap_or(8.0);
+                        if matches.is_present("flip") {
+                            scale = -scale;
+                        }
+                        test_scenes::render_svg(&mut ctx, input, scale);
                     } else {
                         use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
                         let blends = [
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index 454371c..adbedfd 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -229,6 +229,7 @@ void main() {
         case Drawtag_FillColor:
         case Drawtag_FillImage:
         case Drawtag_FillLinGradient:
+        case Drawtag_FillRadGradient:
         case Drawtag_BeginClip:
         case Drawtag_EndClip:
             uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@@ -305,7 +306,7 @@ void main() {
                     is_blend = (blend != BlendComp_default);
                 }
                 include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
-                    || is_blend;
+                    || (is_clip && is_blend);
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
@@ -373,6 +374,25 @@ void main() {
                     Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
                     cmd_ref.offset += 4 + CmdLinGrad_size;
                     break;
+                case Drawtag_FillRadGradient:
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        break;
+                    }
+                    linewidth = uintBitsToFloat(memory[di]);
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
+                    CmdRadGrad cmd_rad;
+                    cmd_rad.index = scene[dd];
+                    // Given that this is basically a memcpy, we might consider
+                    // letting the fine raster read the info itself.
+                    cmd_rad.mat = uintBitsToFloat(uvec4(memory[di + 1], memory[di + 2],
+                        memory[di + 3], memory[di + 4]));
+                    cmd_rad.xlat = uintBitsToFloat(uvec2(memory[di + 5], memory[di + 6]));
+                    cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
+                    cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
+                    cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
+                    Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
+                    cmd_ref.offset += 4 + CmdRadGrad_size;
+                    break;
                 case Drawtag_FillImage:
                     linewidth = uintBitsToFloat(memory[di]);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
index 1cee0ef..ef369c9 100644
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -94,8 +94,8 @@ void main() {
         // pipeline. However, going forward we'll get rid of that, and have
         // later stages read scene + bbox etc.
         tag_word = scene[drawtag_base + ix + i];
-        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillImage ||
-            tag_word == Drawtag_BeginClip) {
+        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient ||
+            tag_word == Drawtag_FillImage || tag_word == Drawtag_BeginClip) {
             uint bbox_offset = (conf.path_bbox_alloc.offset >> 2) + 6 * m.path_ix;
             float bbox_l = float(memory[bbox_offset]) - 32768.0;
             float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
@@ -106,11 +106,11 @@ void main() {
             uint fill_mode = uint(linewidth >= 0.0);
             vec4 mat;
             vec2 translate;
-            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient) {
+            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                 uint trans_ix = memory[bbox_offset + 5];
                 uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
                 mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
-                if (tag_word == Drawtag_FillLinGradient) {
+                if (tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                     translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
                 }
             }
@@ -125,7 +125,6 @@ void main() {
                 break;
             case Drawtag_FillLinGradient:
                 memory[di] = floatBitsToUint(linewidth);
-                uint index = scene[dd];
                 vec2 p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
                 vec2 p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
                 p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
@@ -139,6 +138,33 @@ void main() {
                 memory[di + 2] = floatBitsToUint(line_y);
                 memory[di + 3] = floatBitsToUint(line_c);
                 break;
+            case Drawtag_FillRadGradient:
+                p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
+                p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
+                float r0 = uintBitsToFloat(scene[dd + 5]);
+                float r1 = uintBitsToFloat(scene[dd + 6]);
+                float inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
+                vec4 inv_mat = inv_det * vec4(mat.w, -mat.y, -mat.z, mat.x);
+                vec2 inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
+                inv_tr += p0;
+                vec2 center1 = p1 - p0;
+                float rr = r1 / (r1 - r0);
+                float rainv = rr / (r1 * r1 - dot(center1, center1));
+                vec2 c1 = center1 * rainv;
+                float ra = rr * rainv;
+                float roff = rr - 1.0;
+                memory[di] = floatBitsToUint(linewidth);
+                memory[di + 1] = floatBitsToUint(inv_mat.x);
+                memory[di + 2] = floatBitsToUint(inv_mat.y);
+                memory[di + 3] = floatBitsToUint(inv_mat.z);
+                memory[di + 4] = floatBitsToUint(inv_mat.w);
+                memory[di + 5] = floatBitsToUint(inv_tr.x);
+                memory[di + 6] = floatBitsToUint(inv_tr.y);
+                memory[di + 7] = floatBitsToUint(c1.x);
+                memory[di + 8] = floatBitsToUint(c1.y);
+                memory[di + 9] = floatBitsToUint(ra);
+                memory[di + 10] = floatBitsToUint(roff);
+                break;
             case Drawtag_BeginClip:
                 break;
             }
diff --git a/piet-gpu/shader/drawtag.h b/piet-gpu/shader/drawtag.h
index 7f73546..1e35318 100644
--- a/piet-gpu/shader/drawtag.h
+++ b/piet-gpu/shader/drawtag.h
@@ -4,11 +4,12 @@
 
 // Design of draw tag: & 0x1c gives scene size in bytes
 // & 1 gives clip
-// (tag >> 4) & 0x1c is info size in bytes
+// (tag >> 4) & 0x3c is info size in bytes
 
 #define Drawtag_Nop 0
 #define Drawtag_FillColor 0x44
 #define Drawtag_FillLinGradient 0x114
+#define Drawtag_FillRadGradient 0x2dc
 #define Drawtag_FillImage 0x48
 #define Drawtag_BeginClip 0x05
 #define Drawtag_EndClip 0x25
@@ -36,5 +37,5 @@ DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) {
 DrawMonoid map_tag(uint tag_word) {
     // TODO: at some point, EndClip should not generate a path
     uint has_path = uint(tag_word != Drawtag_Nop);
-    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x1c);
+    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x3c);
 }
diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil
index 3050aa8..4a4f073 100644
Binary files a/piet-gpu/shader/gen/binning.dxil and b/piet-gpu/shader/gen/binning.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil
index 12e88dd..879b7c8 100644
Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ
diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl
index a702df5..1e610ec 100644
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@@ -91,6 +91,21 @@ struct CmdLinGrad
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -160,9 +175,9 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-RWByteAddressBuffer _242 : register(u0, space0);
-ByteAddressBuffer _854 : register(t1, space0);
-ByteAddressBuffer _1222 : register(t2, space0);
+RWByteAddressBuffer _260 : register(u0, space0);
+ByteAddressBuffer _1005 : register(t1, space0);
+ByteAddressBuffer _1372 : register(t2, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -185,8 +200,8 @@ groupshared uint sh_tile_count[256];
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _319 = { a.offset + offset };
-    return _319;
+    Alloc _337 = { a.offset + offset };
+    return _337;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -202,7 +217,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _242.Load(offset * 4 + 8);
+    uint v = _260.Load(offset * 4 + 8);
     return v;
 }
 
@@ -215,8 +230,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok)
 
 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _328 = { ref.offset + (index * 4u) };
-    return _328;
+    BinInstanceRef _346 = { ref.offset + (index * 4u) };
+    return _346;
 }
 
 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@@ -244,8 +259,8 @@ Path Path_read(Alloc a, PathRef ref)
     uint raw2 = read_mem(param_4, param_5);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _391 = { raw2 };
-    s.tiles = _391;
+    TileRef _409 = { raw2 };
+    s.tiles = _409;
     return s;
 }
 
@@ -255,11 +270,11 @@ void write_tile_alloc(uint el_ix, Alloc a)
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _741;
-    _242.GetDimensions(_741);
-    _741 = (_741 - 8) / 4;
+    uint _892;
+    _260.GetDimensions(_892);
+    _892 = (_892 - 8) / 4;
     uint param = 0u;
-    uint param_1 = uint(int(_741) * 4);
+    uint param_1 = uint(int(_892) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
@@ -273,31 +288,31 @@ Tile Tile_read(Alloc a, TileRef ref)
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
     uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _416 = { raw0 };
+    TileSegRef _434 = { raw0 };
     Tile s;
-    s.tile = _416;
+    s.tile = _434;
     s.backdrop = int(raw1);
     return s;
 }
 
 MallocResult malloc(uint size)
 {
-    uint _248;
-    _242.InterlockedAdd(0, size, _248);
-    uint offset = _248;
-    uint _255;
-    _242.GetDimensions(_255);
-    _255 = (_255 - 8) / 4;
+    uint _266;
+    _260.InterlockedAdd(0, size, _266);
+    uint offset = _266;
+    uint _273;
+    _260.GetDimensions(_273);
+    _273 = (_273 - 8) / 4;
     MallocResult r;
-    r.failed = (offset + size) > uint(int(_255) * 4);
+    r.failed = (offset + size) > uint(int(_273) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _277;
-        _242.InterlockedMax(4, 1u, _277);
+        uint _295;
+        _260.InterlockedMax(4, 1u, _295);
         return r;
     }
     return r;
@@ -311,7 +326,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
     {
         return;
     }
-    _242.Store(offset * 4 + 8, val);
+    _260.Store(offset * 4 + 8, val);
 }
 
 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@@ -327,11 +342,11 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 10u;
+    uint param_2 = 11u;
     write_mem(param, param_1, param_2);
-    CmdJumpRef _734 = { ref.offset + 4u };
+    CmdJumpRef _885 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdJumpRef param_4 = _734;
+    CmdJumpRef param_4 = _885;
     CmdJump param_5 = s;
     CmdJump_write(param_3, param_4, param_5);
 }
@@ -343,22 +358,22 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
         return true;
     }
     uint param = 1024u;
-    MallocResult _762 = malloc(param);
-    MallocResult new_cmd = _762;
+    MallocResult _913 = malloc(param);
+    MallocResult new_cmd = _913;
     if (new_cmd.failed)
     {
         return false;
     }
-    CmdJump _772 = { new_cmd.alloc.offset };
-    CmdJump jump = _772;
+    CmdJump _923 = { new_cmd.alloc.offset };
+    CmdJump jump = _923;
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
     Cmd_Jump_write(param_1, param_2, param_3);
     cmd_alloc = new_cmd.alloc;
-    CmdRef _784 = { cmd_alloc.offset };
-    cmd_ref = _784;
-    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
+    CmdRef _935 = { cmd_alloc.offset };
+    cmd_ref = _935;
+    cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
     return true;
 }
 
@@ -381,9 +396,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
     write_mem(param, param_1, param_2);
-    CmdFillRef _604 = { ref.offset + 4u };
+    CmdFillRef _742 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdFillRef param_4 = _604;
+    CmdFillRef param_4 = _742;
     CmdFill param_5 = s;
     CmdFill_write(param_3, param_4, param_5);
 }
@@ -415,9 +430,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
     write_mem(param, param_1, param_2);
-    CmdStrokeRef _622 = { ref.offset + 4u };
+    CmdStrokeRef _760 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdStrokeRef param_4 = _622;
+    CmdStrokeRef param_4 = _760;
     CmdStroke param_5 = s;
     CmdStroke_write(param_3, param_4, param_5);
 }
@@ -428,8 +443,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
     {
         if (tile.tile.offset != 0u)
         {
-            CmdFill _807 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _807;
+            CmdFill _958 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _958;
             Alloc param = alloc;
             CmdRef param_1 = cmd_ref;
             CmdFill param_2 = cmd_fill;
@@ -446,8 +461,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
     }
     else
     {
-        CmdStroke _837 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _837;
+        CmdStroke _988 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _988;
         Alloc param_5 = alloc;
         CmdRef param_6 = cmd_ref;
         CmdStroke param_7 = cmd_stroke;
@@ -471,9 +486,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
     write_mem(param, param_1, param_2);
-    CmdColorRef _649 = { ref.offset + 4u };
+    CmdColorRef _786 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdColorRef param_4 = _649;
+    CmdColorRef param_4 = _786;
     CmdColor param_5 = s;
     CmdColor_write(param_3, param_4, param_5);
 }
@@ -505,13 +520,75 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s)
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
     write_mem(param, param_1, param_2);
-    CmdLinGradRef _668 = { ref.offset + 4u };
+    CmdLinGradRef _804 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdLinGradRef param_4 = _668;
+    CmdLinGradRef param_4 = _804;
     CmdLinGrad param_5 = s;
     CmdLinGrad_write(param_3, param_4, param_5);
 }
 
+void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.mat.x);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.mat.y);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.mat.z);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.mat.w);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = asuint(s.xlat.x);
+    write_mem(param_15, param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = asuint(s.xlat.y);
+    write_mem(param_18, param_19, param_20);
+    Alloc param_21 = a;
+    uint param_22 = ix + 7u;
+    uint param_23 = asuint(s.c1.x);
+    write_mem(param_21, param_22, param_23);
+    Alloc param_24 = a;
+    uint param_25 = ix + 8u;
+    uint param_26 = asuint(s.c1.y);
+    write_mem(param_24, param_25, param_26);
+    Alloc param_27 = a;
+    uint param_28 = ix + 9u;
+    uint param_29 = asuint(s.ra);
+    write_mem(param_27, param_28, param_29);
+    Alloc param_30 = a;
+    uint param_31 = ix + 10u;
+    uint param_32 = asuint(s.roff);
+    write_mem(param_30, param_31, param_32);
+}
+
+void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 7u;
+    write_mem(param, param_1, param_2);
+    CmdRadGradRef _822 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdRadGradRef param_4 = _822;
+    CmdRadGrad param_5 = s;
+    CmdRadGrad_write(param_3, param_4, param_5);
+}
+
 void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s)
 {
     uint ix = ref.offset >> uint(2);
@@ -529,11 +606,11 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 7u;
+    uint param_2 = 8u;
     write_mem(param, param_1, param_2);
-    CmdImageRef _687 = { ref.offset + 4u };
+    CmdImageRef _840 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdImageRef param_4 = _687;
+    CmdImageRef param_4 = _840;
     CmdImage param_5 = s;
     CmdImage_write(param_3, param_4, param_5);
 }
@@ -542,7 +619,7 @@ void Cmd_BeginClip_write(Alloc a, CmdRef ref)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 8u;
+    uint param_2 = 9u;
     write_mem(param, param_1, param_2);
 }
 
@@ -559,11 +636,11 @@ void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 9u;
+    uint param_2 = 10u;
     write_mem(param, param_1, param_2);
-    CmdEndClipRef _715 = { ref.offset + 4u };
+    CmdEndClipRef _866 = { ref.offset + 4u };
     Alloc param_3 = a;
-    CmdEndClipRef param_4 = _715;
+    CmdEndClipRef param_4 = _866;
     CmdEndClip param_5 = s;
     CmdEndClip_write(param_3, param_4, param_5);
 }
@@ -578,80 +655,81 @@ void Cmd_End_write(Alloc a, CmdRef ref)
 
 void comp_main()
 {
-    uint width_in_bins = ((_854.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_1005.Load(8) + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_854.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1005.Load(0) + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _919;
-    _919.offset = _854.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _1070;
+    _1070.offset = _1005.Load(24);
     Alloc param;
-    param.offset = _919.offset;
+    param.offset = _1070.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _928 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _928;
-    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
+    CmdRef _1079 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1079;
+    uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
     uint rd_ix = 0u;
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    uint drawmonoid_start = _854.Load(44) >> uint(2);
-    uint drawtag_start = _854.Load(100) >> uint(2);
-    uint drawdata_start = _854.Load(104) >> uint(2);
-    uint drawinfo_start = _854.Load(68) >> uint(2);
-    bool mem_ok = _242.Load(4) == 0u;
+    uint drawmonoid_start = _1005.Load(44) >> uint(2);
+    uint drawtag_start = _1005.Load(100) >> uint(2);
+    uint drawdata_start = _1005.Load(104) >> uint(2);
+    uint drawinfo_start = _1005.Load(68) >> uint(2);
+    bool mem_ok = _260.Load(4) == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1154;
+    uint _1304;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1455;
+    uint _1605;
     float linewidth;
     CmdLinGrad cmd_lin;
+    CmdRadGrad cmd_rad;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1206;
+        bool _1356;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1003 = th_ix < 256u;
-                bool _1011;
-                if (_1003)
+                bool _1154 = th_ix < 256u;
+                bool _1162;
+                if (_1154)
                 {
-                    _1011 = (partition_ix + th_ix) < n_partitions;
+                    _1162 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1011 = _1003;
+                    _1162 = _1154;
                 }
-                if (_1011)
+                if (_1162)
                 {
-                    uint in_ix = (_854.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1029;
-                    _1029.offset = _854.Load(20);
-                    param_3.offset = _1029.offset;
+                    uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1179;
+                    _1179.offset = _1005.Load(20);
+                    param_3.offset = _1179.offset;
                     uint param_4 = in_ix;
                     count = read_mem(param_3, param_4);
-                    Alloc _1040;
-                    _1040.offset = _854.Load(20);
-                    param_5.offset = _1040.offset;
+                    Alloc _1190;
+                    _1190.offset = _1005.Load(20);
+                    param_5.offset = _1190.offset;
                     uint param_6 = in_ix + 1u;
                     uint offset = read_mem(param_5, param_6);
                     uint param_7 = offset;
@@ -697,16 +775,16 @@ void comp_main()
                 }
                 if (part_ix > 0u)
                 {
-                    _1154 = sh_part_count[part_ix - 1u];
+                    _1304 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1154 = part_start_ix;
+                    _1304 = part_start_ix;
                 }
-                ix -= _1154;
+                ix -= _1304;
                 Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1173 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1173;
+                BinInstanceRef _1323 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1323;
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
@@ -716,16 +794,16 @@ void comp_main()
             }
             GroupMemoryBarrierWithGroupSync();
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1196 = (wr_ix - rd_ix) < 256u;
-            if (_1196)
+            bool _1346 = (wr_ix - rd_ix) < 256u;
+            if (_1346)
             {
-                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1206 = _1196;
+                _1356 = _1346;
             }
-            if (_1206)
+            if (_1356)
             {
                 continue;
             }
@@ -738,23 +816,24 @@ void comp_main()
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1222.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
         }
         switch (tag)
         {
             case 68u:
             case 72u:
             case 276u:
+            case 732u:
             case 5u:
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = _242.Load(drawmonoid_base * 4 + 8);
-                PathRef _1247 = { _854.Load(16) + (path_ix * 12u) };
-                Alloc _1250;
-                _1250.offset = _854.Load(16);
-                param_14.offset = _1250.offset;
-                PathRef param_15 = _1247;
+                uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
+                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
+                Alloc _1400;
+                _1400.offset = _1005.Load(16);
+                param_14.offset = _1400.offset;
+                PathRef param_15 = _1397;
                 Path path = Path_read(param_14, param_15);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
@@ -810,16 +889,16 @@ void comp_main()
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1222.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
             if (el_ix > 0u)
             {
-                _1455 = sh_tile_count[el_ix - 1u];
+                _1605 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1455 = 0u;
+                _1605 = 0u;
             }
-            uint seq_ix = ix_1 - _1455;
+            uint seq_ix = ix_1 - _1605;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -828,38 +907,47 @@ void comp_main()
             {
                 uint param_21 = el_ix;
                 bool param_22 = mem_ok;
-                TileRef _1507 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                 Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1507;
+                TileRef param_24 = _1657;
                 Tile tile = Tile_read(param_23, param_24);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
                 if (is_clip)
                 {
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = _242.Load((drawmonoid_base_1 + 2u) * 4 + 8);
+                    uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1222.Load(dd * 4 + 0);
+                    uint blend = _1372.Load(dd * 4 + 0);
                     is_blend = blend != 3u;
                 }
-                bool _1542 = tile.tile.offset != 0u;
-                bool _1551;
-                if (!_1542)
+                bool _1692 = tile.tile.offset != 0u;
+                bool _1701;
+                if (!_1692)
                 {
-                    _1551 = (tile.backdrop == 0) == is_clip;
+                    _1701 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1551 = _1542;
+                    _1701 = _1692;
                 }
-                include_tile = _1551 || is_blend;
+                bool _1708;
+                if (!_1701)
+                {
+                    _1708 = is_clip && is_blend;
+                }
+                else
+                {
+                    _1708 = _1701;
+                }
+                include_tile = _1708;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1573;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1573);
+                uint _1728;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1728);
             }
         }
         GroupMemoryBarrierWithGroupSync();
@@ -883,33 +971,33 @@ void comp_main()
             uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1222.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                TileRef _1650 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1805 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                 Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1650;
+                TileRef param_28 = _1805;
                 Tile tile_1 = Tile_read(param_27, param_28);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = _242.Load((drawmonoid_base_2 + 2u) * 4 + 8);
-                uint info_offset = _242.Load((drawmonoid_base_2 + 3u) * 4 + 8);
+                uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
+                uint info_offset = _260.Load((drawmonoid_base_2 + 3u) * 4 + 8);
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1697 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1853 = alloc_cmd(param_29, param_30, param_31);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1697)
+                        if (!_1853)
                         {
                             break;
                         }
@@ -919,11 +1007,11 @@ void comp_main()
                         float param_35 = linewidth;
                         write_fill(param_32, param_33, param_34, param_35);
                         cmd_ref = param_33;
-                        uint rgba = _1222.Load(dd_1 * 4 + 0);
-                        CmdColor _1720 = { rgba };
+                        uint rgba = _1372.Load(dd_1 * 4 + 0);
+                        CmdColor _1876 = { rgba };
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1720;
+                        CmdColor param_38 = _1876;
                         Cmd_Color_write(param_36, param_37, param_38);
                         cmd_ref.offset += 8u;
                         break;
@@ -933,25 +1021,25 @@ void comp_main()
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1738 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1894 = alloc_cmd(param_39, param_40, param_41);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1738)
+                        if (!_1894)
                         {
                             break;
                         }
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                         Alloc param_42 = cmd_alloc;
                         CmdRef param_43 = cmd_ref;
                         Tile param_44 = tile_1;
                         float param_45 = linewidth;
                         write_fill(param_42, param_43, param_44, param_45);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1222.Load(dd_1 * 4 + 0);
-                        cmd_lin.line_x = asfloat(_242.Load((di + 1u) * 4 + 8));
-                        cmd_lin.line_y = asfloat(_242.Load((di + 2u) * 4 + 8));
-                        cmd_lin.line_c = asfloat(_242.Load((di + 3u) * 4 + 8));
+                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
+                        cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
+                        cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
                         Alloc param_46 = cmd_alloc;
                         CmdRef param_47 = cmd_ref;
                         CmdLinGrad param_48 = cmd_lin;
@@ -959,69 +1047,102 @@ void comp_main()
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 72u:
+                    case 732u:
                     {
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1806 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1958 = alloc_cmd(param_49, param_50, param_51);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1806)
+                        if (!_1958)
                         {
                             break;
                         }
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                         Alloc param_52 = cmd_alloc;
                         CmdRef param_53 = cmd_ref;
                         Tile param_54 = tile_1;
                         float param_55 = linewidth;
                         write_fill(param_52, param_53, param_54, param_55);
                         cmd_ref = param_53;
-                        uint index = _1222.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1222.Load((dd_1 + 1u) * 4 + 0);
-                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _1845 = { index, offset_1 };
+                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
+                        cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
+                        cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
+                        cmd_rad.ra = asfloat(_260.Load((di + 9u) * 4 + 8));
+                        cmd_rad.roff = asfloat(_260.Load((di + 10u) * 4 + 8));
                         Alloc param_56 = cmd_alloc;
                         CmdRef param_57 = cmd_ref;
-                        CmdImage param_58 = _1845;
-                        Cmd_Image_write(param_56, param_57, param_58);
+                        CmdRadGrad param_58 = cmd_rad;
+                        Cmd_RadGrad_write(param_56, param_57, param_58);
+                        cmd_ref.offset += 48u;
+                        break;
+                    }
+                    case 72u:
+                    {
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
+                        Alloc param_59 = cmd_alloc;
+                        CmdRef param_60 = cmd_ref;
+                        uint param_61 = cmd_limit;
+                        bool _2064 = alloc_cmd(param_59, param_60, param_61);
+                        cmd_alloc = param_59;
+                        cmd_ref = param_60;
+                        cmd_limit = param_61;
+                        if (!_2064)
+                        {
+                            break;
+                        }
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        Tile param_64 = tile_1;
+                        float param_65 = linewidth;
+                        write_fill(param_62, param_63, param_64, param_65);
+                        cmd_ref = param_63;
+                        uint index = _1372.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        CmdImage _2103 = { index, offset_1 };
+                        Alloc param_66 = cmd_alloc;
+                        CmdRef param_67 = cmd_ref;
+                        CmdImage param_68 = _2103;
+                        Cmd_Image_write(param_66, param_67, param_68);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _1859 = tile_1.tile.offset == 0u;
-                        bool _1865;
-                        if (_1859)
+                        bool _2117 = tile_1.tile.offset == 0u;
+                        bool _2123;
+                        if (_2117)
                         {
-                            _1865 = tile_1.backdrop == 0;
+                            _2123 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _1865 = _1859;
+                            _2123 = _2117;
                         }
-                        if (_1865)
+                        if (_2123)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_59 = cmd_alloc;
-                            CmdRef param_60 = cmd_ref;
-                            uint param_61 = cmd_limit;
-                            bool _1877 = alloc_cmd(param_59, param_60, param_61);
-                            cmd_alloc = param_59;
-                            cmd_ref = param_60;
-                            cmd_limit = param_61;
-                            if (!_1877)
+                            Alloc param_69 = cmd_alloc;
+                            CmdRef param_70 = cmd_ref;
+                            uint param_71 = cmd_limit;
+                            bool _2135 = alloc_cmd(param_69, param_70, param_71);
+                            cmd_alloc = param_69;
+                            cmd_ref = param_70;
+                            cmd_limit = param_71;
+                            if (!_2135)
                             {
                                 break;
                             }
-                            Alloc param_62 = cmd_alloc;
-                            CmdRef param_63 = cmd_ref;
-                            Cmd_BeginClip_write(param_62, param_63);
+                            Alloc param_72 = cmd_alloc;
+                            CmdRef param_73 = cmd_ref;
+                            Cmd_BeginClip_write(param_72, param_73);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
@@ -1030,29 +1151,29 @@ void comp_main()
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_64 = cmd_alloc;
-                        CmdRef param_65 = cmd_ref;
-                        uint param_66 = cmd_limit;
-                        bool _1905 = alloc_cmd(param_64, param_65, param_66);
-                        cmd_alloc = param_64;
-                        cmd_ref = param_65;
-                        cmd_limit = param_66;
-                        if (!_1905)
+                        Alloc param_74 = cmd_alloc;
+                        CmdRef param_75 = cmd_ref;
+                        uint param_76 = cmd_limit;
+                        bool _2163 = alloc_cmd(param_74, param_75, param_76);
+                        cmd_alloc = param_74;
+                        cmd_ref = param_75;
+                        cmd_limit = param_76;
+                        if (!_2163)
                         {
                             break;
                         }
-                        Alloc param_67 = cmd_alloc;
-                        CmdRef param_68 = cmd_ref;
-                        Tile param_69 = tile_1;
-                        float param_70 = -1.0f;
-                        write_fill(param_67, param_68, param_69, param_70);
-                        cmd_ref = param_68;
-                        uint blend_1 = _1222.Load(dd_1 * 4 + 0);
-                        CmdEndClip _1928 = { blend_1 };
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        CmdEndClip param_73 = _1928;
-                        Cmd_EndClip_write(param_71, param_72, param_73);
+                        Alloc param_77 = cmd_alloc;
+                        CmdRef param_78 = cmd_ref;
+                        Tile param_79 = tile_1;
+                        float param_80 = -1.0f;
+                        write_fill(param_77, param_78, param_79, param_80);
+                        cmd_ref = param_78;
+                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2186 = { blend_1 };
+                        Alloc param_81 = cmd_alloc;
+                        CmdRef param_82 = cmd_ref;
+                        CmdEndClip param_83 = _2186;
+                        Cmd_EndClip_write(param_81, param_82, param_83);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1086,21 +1207,21 @@ void comp_main()
             break;
         }
     }
-    bool _1975 = (bin_tile_x + tile_x) < _854.Load(8);
-    bool _1984;
-    if (_1975)
+    bool _2233 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2242;
+    if (_2233)
     {
-        _1984 = (bin_tile_y + tile_y) < _854.Load(12);
+        _2242 = (bin_tile_y + tile_y) < _1005.Load(12);
     }
     else
     {
-        _1984 = _1975;
+        _2242 = _2233;
     }
-    if (_1984)
+    if (_2242)
     {
-        Alloc param_74 = cmd_alloc;
-        CmdRef param_75 = cmd_ref;
-        Cmd_End_write(param_74, param_75);
+        Alloc param_84 = cmd_alloc;
+        CmdRef param_85 = cmd_ref;
+        Cmd_End_write(param_84, param_85);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl
index 4226352..abd636b 100644
--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
@@ -107,6 +107,21 @@ struct CmdLinGrad
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -211,7 +226,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_242, constant uint& v_242BufferSize)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -219,7 +234,7 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_242.memory[offset];
+    uint v = v_260.memory[offset];
     return v;
 }
 
@@ -238,30 +253,30 @@ BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const
 }
 
 static inline __attribute__((always_inline))
-BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
+    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
     BinInstance s;
     s.element_ix = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
+    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_242, v_242BufferSize);
+    uint raw2 = read_mem(param_4, param_5, v_260, v_260BufferSize);
     Path s;
     s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
     s.tiles = TileRef{ raw2 };
@@ -274,24 +289,24 @@ void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a)
 }
 
 static inline __attribute__((always_inline))
-Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_242, constant uint& v_242BufferSize)
+Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint param = 0u;
-    uint param_1 = uint(int((v_242BufferSize - 8) / 4) * 4);
+    uint param_1 = uint(int((v_260BufferSize - 8) / 4) * 4);
     bool param_2 = mem_ok;
     return new_alloc(param, param_1, param_2);
 }
 
 static inline __attribute__((always_inline))
-Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_242, v_242BufferSize);
+    uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_242, v_242BufferSize);
+    uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize);
     Tile s;
     s.tile = TileSegRef{ raw0 };
     s.backdrop = int(raw1);
@@ -299,26 +314,26 @@ Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory&
 }
 
 static inline __attribute__((always_inline))
-MallocResult malloc(thread const uint& size, device Memory& v_242, constant uint& v_242BufferSize)
+MallocResult malloc(thread const uint& size, device Memory& v_260, constant uint& v_260BufferSize)
 {
-    uint _248 = atomic_fetch_add_explicit((device atomic_uint*)&v_242.mem_offset, size, memory_order_relaxed);
-    uint offset = _248;
+    uint _266 = atomic_fetch_add_explicit((device atomic_uint*)&v_260.mem_offset, size, memory_order_relaxed);
+    uint offset = _266;
     MallocResult r;
-    r.failed = (offset + size) > uint(int((v_242BufferSize - 8) / 4) * 4);
+    r.failed = (offset + size) > uint(int((v_260BufferSize - 8) / 4) * 4);
     uint param = offset;
     uint param_1 = size;
     bool param_2 = !r.failed;
     r.alloc = new_alloc(param, param_1, param_2);
     if (r.failed)
     {
-        uint _277 = atomic_fetch_max_explicit((device atomic_uint*)&v_242.mem_error, 1u, memory_order_relaxed);
+        uint _295 = atomic_fetch_max_explicit((device atomic_uint*)&v_260.mem_error, 1u, memory_order_relaxed);
         return r;
     }
     return r;
 }
 
 static inline __attribute__((always_inline))
-void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_242, constant uint& v_242BufferSize)
+void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -326,42 +341,42 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons
     {
         return;
     }
-    v_242.memory[offset] = val;
+    v_260.memory[offset] = val;
 }
 
 static inline __attribute__((always_inline))
-void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.new_ref;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 10u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    uint param_2 = 11u;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
     CmdJump param_5 = s;
-    CmdJump_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdJump_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_242, constant uint& v_242BufferSize)
+bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_260, constant uint& v_260BufferSize)
 {
     if (cmd_ref.offset < cmd_limit)
     {
         return true;
     }
     uint param = 1024u;
-    MallocResult _762 = malloc(param, v_242, v_242BufferSize);
-    MallocResult new_cmd = _762;
+    MallocResult _913 = malloc(param, v_260, v_260BufferSize);
+    MallocResult new_cmd = _913;
     if (new_cmd.failed)
     {
         return false;
@@ -370,78 +385,78 @@ bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd
     Alloc param_1 = cmd_alloc;
     CmdRef param_2 = cmd_ref;
     CmdJump param_3 = jump;
-    Cmd_Jump_write(param_1, param_2, param_3, v_242, v_242BufferSize);
+    Cmd_Jump_write(param_1, param_2, param_3, v_260, v_260BufferSize);
     cmd_alloc = new_cmd.alloc;
     cmd_ref = CmdRef{ cmd_alloc.offset };
-    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
+    cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
     return true;
 }
 
 static inline __attribute__((always_inline))
-void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = uint(s.backdrop);
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 1u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
     CmdFill param_5 = s;
-    CmdFill_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdFill_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 3u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.tile_ref;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.half_width);
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 2u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
     CmdStroke param_5 = s;
-    CmdStroke_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdStroke_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_242, constant uint& v_242BufferSize)
+void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_260, constant uint& v_260BufferSize)
 {
     if (linewidth < 0.0)
     {
@@ -451,14 +466,14 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const
             Alloc param = alloc;
             CmdRef param_1 = cmd_ref;
             CmdFill param_2 = cmd_fill;
-            Cmd_Fill_write(param, param_1, param_2, v_242, v_242BufferSize);
+            Cmd_Fill_write(param, param_1, param_2, v_260, v_260BufferSize);
             cmd_ref.offset += 12u;
         }
         else
         {
             Alloc param_3 = alloc;
             CmdRef param_4 = cmd_ref;
-            Cmd_Solid_write(param_3, param_4, v_242, v_242BufferSize);
+            Cmd_Solid_write(param_3, param_4, v_260, v_260BufferSize);
             cmd_ref.offset += 4u;
         }
     }
@@ -468,138 +483,201 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const
         Alloc param_5 = alloc;
         CmdRef param_6 = cmd_ref;
         CmdStroke param_7 = cmd_stroke;
-        Cmd_Stroke_write(param_5, param_6, param_7, v_242, v_242BufferSize);
+        Cmd_Stroke_write(param_5, param_6, param_7, v_260, v_260BufferSize);
         cmd_ref.offset += 12u;
     }
 }
 
 static inline __attribute__((always_inline))
-void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.rgba_color;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 5u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
     CmdColor param_5 = s;
-    CmdColor_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdColor_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
     uint param_5 = as_type<uint>(s.line_x);
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
     Alloc param_6 = a;
     uint param_7 = ix + 2u;
     uint param_8 = as_type<uint>(s.line_y);
-    write_mem(param_6, param_7, param_8, v_242, v_242BufferSize);
+    write_mem(param_6, param_7, param_8, v_260, v_260BufferSize);
     Alloc param_9 = a;
     uint param_10 = ix + 3u;
     uint param_11 = as_type<uint>(s.line_c);
-    write_mem(param_9, param_10, param_11, v_242, v_242BufferSize);
+    write_mem(param_9, param_10, param_11, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 6u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
     CmdLinGrad param_5 = s;
-    CmdLinGrad_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdLinGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.index;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     uint param_4 = ix + 1u;
-    uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
-    write_mem(param_3, param_4, param_5, v_242, v_242BufferSize);
+    uint param_5 = as_type<uint>(s.mat.x);
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = as_type<uint>(s.mat.y);
+    write_mem(param_6, param_7, param_8, v_260, v_260BufferSize);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = as_type<uint>(s.mat.z);
+    write_mem(param_9, param_10, param_11, v_260, v_260BufferSize);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = as_type<uint>(s.mat.w);
+    write_mem(param_12, param_13, param_14, v_260, v_260BufferSize);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = as_type<uint>(s.xlat.x);
+    write_mem(param_15, param_16, param_17, v_260, v_260BufferSize);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = as_type<uint>(s.xlat.y);
+    write_mem(param_18, param_19, param_20, v_260, v_260BufferSize);
+    Alloc param_21 = a;
+    uint param_22 = ix + 7u;
+    uint param_23 = as_type<uint>(s.c1.x);
+    write_mem(param_21, param_22, param_23, v_260, v_260BufferSize);
+    Alloc param_24 = a;
+    uint param_25 = ix + 8u;
+    uint param_26 = as_type<uint>(s.c1.y);
+    write_mem(param_24, param_25, param_26, v_260, v_260BufferSize);
+    Alloc param_27 = a;
+    uint param_28 = ix + 9u;
+    uint param_29 = as_type<uint>(s.ra);
+    write_mem(param_27, param_28, param_29, v_260, v_260BufferSize);
+    Alloc param_30 = a;
+    uint param_31 = ix + 10u;
+    uint param_32 = as_type<uint>(s.roff);
+    write_mem(param_30, param_31, param_32, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 7u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
-    CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
-    CmdImage param_5 = s;
-    CmdImage_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u };
+    CmdRadGrad param_5 = s;
+    CmdRadGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
+    write_mem(param_3, param_4, param_5, v_260, v_260BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 8u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+    Alloc param_3 = a;
+    CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
+    CmdImage param_5 = s;
+    CmdImage_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 9u;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
+}
+
+static inline __attribute__((always_inline))
+void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
     uint param_2 = s.blend;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 9u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    uint param_2 = 10u;
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
     Alloc param_3 = a;
     CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u };
     CmdEndClip param_5 = s;
-    CmdEndClip_write(param_3, param_4, param_5, v_242, v_242BufferSize);
+    CmdEndClip_write(param_3, param_4, param_5, v_260, v_260BufferSize);
 }
 
 static inline __attribute__((always_inline))
-void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_242, constant uint& v_242BufferSize)
+void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint param_2 = 0u;
-    write_mem(param, param_1, param_2, v_242, v_242BufferSize);
+    write_mem(param, param_1, param_2, v_260, v_260BufferSize);
 }
 
-kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_242 [[buffer(0)]], const device ConfigBuf& _854 [[buffer(1)]], const device SceneBuf& _1222 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     threadgroup uint sh_bitmaps[8][256];
     threadgroup Alloc sh_part_elements[256];
@@ -611,76 +689,77 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
     threadgroup uint sh_tile_y0[256];
     threadgroup uint sh_tile_base[256];
     threadgroup uint sh_tile_count[256];
-    constant uint& v_242BufferSize = spvBufferSizeConstants[0];
-    uint width_in_bins = ((_854.conf.width_in_tiles + 16u) - 1u) / 16u;
+    constant uint& v_260BufferSize = spvBufferSizeConstants[0];
+    uint width_in_bins = ((_1005.conf.width_in_tiles + 16u) - 1u) / 16u;
     uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
     uint partition_ix = 0u;
-    uint n_partitions = ((_854.conf.n_elements + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1005.conf.n_elements + 256u) - 1u) / 256u;
     uint th_ix = gl_LocalInvocationID.x;
     uint bin_tile_x = 16u * gl_WorkGroupID.x;
     uint bin_tile_y = 16u * gl_WorkGroupID.y;
     uint tile_x = gl_LocalInvocationID.x % 16u;
     uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.conf.width_in_tiles) + bin_tile_x) + tile_x;
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.conf.width_in_tiles) + bin_tile_x) + tile_x;
     Alloc param;
-    param.offset = _854.conf.ptcl_alloc.offset;
+    param.offset = _1005.conf.ptcl_alloc.offset;
     uint param_1 = this_tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
     CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
-    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
+    uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
     uint clip_depth = 0u;
     uint clip_zero_depth = 0u;
     uint rd_ix = 0u;
     uint wr_ix = 0u;
     uint part_start_ix = 0u;
     uint ready_ix = 0u;
-    uint drawmonoid_start = _854.conf.drawmonoid_alloc.offset >> uint(2);
-    uint drawtag_start = _854.conf.drawtag_offset >> uint(2);
-    uint drawdata_start = _854.conf.drawdata_offset >> uint(2);
-    uint drawinfo_start = _854.conf.drawinfo_alloc.offset >> uint(2);
-    bool mem_ok = v_242.mem_error == 0u;
+    uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
+    uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
+    uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
+    uint drawinfo_start = _1005.conf.drawinfo_alloc.offset >> uint(2);
+    bool mem_ok = v_260.mem_error == 0u;
     Alloc param_3;
     Alloc param_5;
-    uint _1154;
+    uint _1304;
     uint element_ix;
     Alloc param_14;
     uint tile_count;
-    uint _1455;
+    uint _1605;
     float linewidth;
     CmdLinGrad cmd_lin;
+    CmdRadGrad cmd_rad;
     while (true)
     {
         for (uint i = 0u; i < 8u; i++)
         {
             sh_bitmaps[i][th_ix] = 0u;
         }
-        bool _1206;
+        bool _1356;
         for (;;)
         {
             if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
             {
                 part_start_ix = ready_ix;
                 uint count = 0u;
-                bool _1003 = th_ix < 256u;
-                bool _1011;
-                if (_1003)
+                bool _1154 = th_ix < 256u;
+                bool _1162;
+                if (_1154)
                 {
-                    _1011 = (partition_ix + th_ix) < n_partitions;
+                    _1162 = (partition_ix + th_ix) < n_partitions;
                 }
                 else
                 {
-                    _1011 = _1003;
+                    _1162 = _1154;
                 }
-                if (_1011)
+                if (_1162)
                 {
-                    uint in_ix = (_854.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    param_3.offset = _854.conf.bin_alloc.offset;
+                    uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    param_3.offset = _1005.conf.bin_alloc.offset;
                     uint param_4 = in_ix;
-                    count = read_mem(param_3, param_4, v_242, v_242BufferSize);
-                    param_5.offset = _854.conf.bin_alloc.offset;
+                    count = read_mem(param_3, param_4, v_260, v_260BufferSize);
+                    param_5.offset = _1005.conf.bin_alloc.offset;
                     uint param_6 = in_ix + 1u;
-                    uint offset = read_mem(param_5, param_6, v_242, v_242BufferSize);
+                    uint offset = read_mem(param_5, param_6, v_260, v_260BufferSize);
                     uint param_7 = offset;
                     uint param_8 = count * 4u;
                     bool param_9 = mem_ok;
@@ -724,34 +803,34 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
                 if (part_ix > 0u)
                 {
-                    _1154 = sh_part_count[part_ix - 1u];
+                    _1304 = sh_part_count[part_ix - 1u];
                 }
                 else
                 {
-                    _1154 = part_start_ix;
+                    _1304 = part_start_ix;
                 }
-                ix -= _1154;
+                ix -= _1304;
                 Alloc bin_alloc = sh_part_elements[part_ix];
                 BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                 BinInstanceRef param_10 = inst_ref;
                 uint param_11 = ix;
                 Alloc param_12 = bin_alloc;
                 BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
-                BinInstance inst = BinInstance_read(param_12, param_13, v_242, v_242BufferSize);
+                BinInstance inst = BinInstance_read(param_12, param_13, v_260, v_260BufferSize);
                 sh_elements[th_ix] = inst.element_ix;
             }
             threadgroup_barrier(mem_flags::mem_threadgroup);
             wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1196 = (wr_ix - rd_ix) < 256u;
-            if (_1196)
+            bool _1346 = (wr_ix - rd_ix) < 256u;
+            if (_1346)
             {
-                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
             }
             else
             {
-                _1206 = _1196;
+                _1356 = _1346;
             }
-            if (_1206)
+            if (_1356)
             {
                 continue;
             }
@@ -764,21 +843,22 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
         if ((th_ix + rd_ix) < wr_ix)
         {
             element_ix = sh_elements[th_ix];
-            tag = _1222.scene[drawtag_start + element_ix];
+            tag = _1372.scene[drawtag_start + element_ix];
         }
         switch (tag)
         {
             case 68u:
             case 72u:
             case 276u:
+            case 732u:
             case 5u:
             case 37u:
             {
                 uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = v_242.memory[drawmonoid_base];
-                param_14.offset = _854.conf.tile_alloc.offset;
-                PathRef param_15 = PathRef{ _854.conf.tile_alloc.offset + (path_ix * 12u) };
-                Path path = Path_read(param_14, param_15, v_242, v_242BufferSize);
+                uint path_ix = v_260.memory[drawmonoid_base];
+                param_14.offset = _1005.conf.tile_alloc.offset;
+                PathRef param_15 = PathRef{ _1005.conf.tile_alloc.offset + (path_ix * 12u) };
+                Path path = Path_read(param_14, param_15, v_260, v_260BufferSize);
                 uint stride = path.bbox.z - path.bbox.x;
                 sh_tile_stride[th_ix] = stride;
                 int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -833,16 +913,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                 }
             }
             uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1222.scene[drawtag_start + element_ix_1];
+            uint tag_1 = _1372.scene[drawtag_start + element_ix_1];
             if (el_ix > 0u)
             {
-                _1455 = sh_tile_count[el_ix - 1u];
+                _1605 = sh_tile_count[el_ix - 1u];
             }
             else
             {
-                _1455 = 0u;
+                _1605 = 0u;
             }
-            uint seq_ix = ix_1 - _1455;
+            uint seq_ix = ix_1 - _1605;
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + (seq_ix % width);
             uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@@ -851,36 +931,45 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             {
                 uint param_21 = el_ix;
                 bool param_22 = mem_ok;
-                Alloc param_23 = read_tile_alloc(param_21, param_22, v_242, v_242BufferSize);
+                Alloc param_23 = read_tile_alloc(param_21, param_22, v_260, v_260BufferSize);
                 TileRef param_24 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
-                Tile tile = Tile_read(param_23, param_24, v_242, v_242BufferSize);
+                Tile tile = Tile_read(param_23, param_24, v_260, v_260BufferSize);
                 bool is_clip = (tag_1 & 1u) != 0u;
                 bool is_blend = false;
                 if (is_clip)
                 {
                     uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = v_242.memory[drawmonoid_base_1 + 2u];
+                    uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                     uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1222.scene[dd];
+                    uint blend = _1372.scene[dd];
                     is_blend = blend != 3u;
                 }
-                bool _1542 = tile.tile.offset != 0u;
-                bool _1551;
-                if (!_1542)
+                bool _1692 = tile.tile.offset != 0u;
+                bool _1701;
+                if (!_1692)
                 {
-                    _1551 = (tile.backdrop == 0) == is_clip;
+                    _1701 = (tile.backdrop == 0) == is_clip;
                 }
                 else
                 {
-                    _1551 = _1542;
+                    _1701 = _1692;
                 }
-                include_tile = _1551 || is_blend;
+                bool _1708;
+                if (!_1701)
+                {
+                    _1708 = is_clip && is_blend;
+                }
+                else
+                {
+                    _1708 = _1701;
+                }
+                include_tile = _1708;
             }
             if (include_tile)
             {
                 uint el_slice = el_ix / 32u;
                 uint el_mask = 1u << (el_ix & 31u);
-                uint _1573 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
+                uint _1728 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
             }
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -904,32 +993,32 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
             uint element_ix_2 = sh_elements[element_ref_ix];
             bitmap &= (bitmap - 1u);
-            uint drawtag = _1222.scene[drawtag_start + element_ix_2];
+            uint drawtag = _1372.scene[drawtag_start + element_ix_2];
             if (clip_zero_depth == 0u)
             {
                 uint param_25 = element_ref_ix;
                 bool param_26 = mem_ok;
-                Alloc param_27 = read_tile_alloc(param_25, param_26, v_242, v_242BufferSize);
+                Alloc param_27 = read_tile_alloc(param_25, param_26, v_260, v_260BufferSize);
                 TileRef param_28 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
-                Tile tile_1 = Tile_read(param_27, param_28, v_242, v_242BufferSize);
+                Tile tile_1 = Tile_read(param_27, param_28, v_260, v_260BufferSize);
                 uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = v_242.memory[drawmonoid_base_2 + 2u];
-                uint info_offset = v_242.memory[drawmonoid_base_2 + 3u];
+                uint scene_offset_1 = v_260.memory[drawmonoid_base_2 + 2u];
+                uint info_offset = v_260.memory[drawmonoid_base_2 + 3u];
                 uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                 uint di = drawinfo_start + (info_offset >> uint(2));
                 switch (drawtag)
                 {
                     case 68u:
                     {
-                        linewidth = as_type<float>(v_242.memory[di]);
+                        linewidth = as_type<float>(v_260.memory[di]);
                         Alloc param_29 = cmd_alloc;
                         CmdRef param_30 = cmd_ref;
                         uint param_31 = cmd_limit;
-                        bool _1697 = alloc_cmd(param_29, param_30, param_31, v_242, v_242BufferSize);
+                        bool _1853 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                         cmd_alloc = param_29;
                         cmd_ref = param_30;
                         cmd_limit = param_31;
-                        if (!_1697)
+                        if (!_1853)
                         {
                             break;
                         }
@@ -937,13 +1026,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         CmdRef param_33 = cmd_ref;
                         Tile param_34 = tile_1;
                         float param_35 = linewidth;
-                        write_fill(param_32, param_33, param_34, param_35, v_242, v_242BufferSize);
+                        write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
                         cmd_ref = param_33;
-                        uint rgba = _1222.scene[dd_1];
+                        uint rgba = _1372.scene[dd_1];
                         Alloc param_36 = cmd_alloc;
                         CmdRef param_37 = cmd_ref;
                         CmdColor param_38 = CmdColor{ rgba };
-                        Cmd_Color_write(param_36, param_37, param_38, v_242, v_242BufferSize);
+                        Cmd_Color_write(param_36, param_37, param_38, v_260, v_260BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -952,94 +1041,127 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                         Alloc param_39 = cmd_alloc;
                         CmdRef param_40 = cmd_ref;
                         uint param_41 = cmd_limit;
-                        bool _1738 = alloc_cmd(param_39, param_40, param_41, v_242, v_242BufferSize);
+                        bool _1894 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                         cmd_alloc = param_39;
                         cmd_ref = param_40;
                         cmd_limit = param_41;
-                        if (!_1738)
+                        if (!_1894)
                         {
                             break;
                         }
-                        linewidth = as_type<float>(v_242.memory[di]);
+                        linewidth = as_type<float>(v_260.memory[di]);
                         Alloc param_42 = cmd_alloc;
                         CmdRef param_43 = cmd_ref;
                         Tile param_44 = tile_1;
                         float param_45 = linewidth;
-                        write_fill(param_42, param_43, param_44, param_45, v_242, v_242BufferSize);
+                        write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
                         cmd_ref = param_43;
-                        cmd_lin.index = _1222.scene[dd_1];
-                        cmd_lin.line_x = as_type<float>(v_242.memory[di + 1u]);
-                        cmd_lin.line_y = as_type<float>(v_242.memory[di + 2u]);
-                        cmd_lin.line_c = as_type<float>(v_242.memory[di + 3u]);
+                        cmd_lin.index = _1372.scene[dd_1];
+                        cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
+                        cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
+                        cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
                         Alloc param_46 = cmd_alloc;
                         CmdRef param_47 = cmd_ref;
                         CmdLinGrad param_48 = cmd_lin;
-                        Cmd_LinGrad_write(param_46, param_47, param_48, v_242, v_242BufferSize);
+                        Cmd_LinGrad_write(param_46, param_47, param_48, v_260, v_260BufferSize);
                         cmd_ref.offset += 20u;
                         break;
                     }
-                    case 72u:
+                    case 732u:
                     {
-                        linewidth = as_type<float>(v_242.memory[di]);
                         Alloc param_49 = cmd_alloc;
                         CmdRef param_50 = cmd_ref;
                         uint param_51 = cmd_limit;
-                        bool _1806 = alloc_cmd(param_49, param_50, param_51, v_242, v_242BufferSize);
+                        bool _1958 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                         cmd_alloc = param_49;
                         cmd_ref = param_50;
                         cmd_limit = param_51;
-                        if (!_1806)
+                        if (!_1958)
                         {
                             break;
                         }
+                        linewidth = as_type<float>(v_260.memory[di]);
                         Alloc param_52 = cmd_alloc;
                         CmdRef param_53 = cmd_ref;
                         Tile param_54 = tile_1;
                         float param_55 = linewidth;
-                        write_fill(param_52, param_53, param_54, param_55, v_242, v_242BufferSize);
+                        write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
                         cmd_ref = param_53;
-                        uint index = _1222.scene[dd_1];
-                        uint raw1 = _1222.scene[dd_1 + 1u];
-                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        cmd_rad.index = _1372.scene[dd_1];
+                        cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
+                        cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
+                        cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
+                        cmd_rad.ra = as_type<float>(v_260.memory[di + 9u]);
+                        cmd_rad.roff = as_type<float>(v_260.memory[di + 10u]);
                         Alloc param_56 = cmd_alloc;
                         CmdRef param_57 = cmd_ref;
-                        CmdImage param_58 = CmdImage{ index, offset_1 };
-                        Cmd_Image_write(param_56, param_57, param_58, v_242, v_242BufferSize);
+                        CmdRadGrad param_58 = cmd_rad;
+                        Cmd_RadGrad_write(param_56, param_57, param_58, v_260, v_260BufferSize);
+                        cmd_ref.offset += 48u;
+                        break;
+                    }
+                    case 72u:
+                    {
+                        linewidth = as_type<float>(v_260.memory[di]);
+                        Alloc param_59 = cmd_alloc;
+                        CmdRef param_60 = cmd_ref;
+                        uint param_61 = cmd_limit;
+                        bool _2064 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
+                        cmd_alloc = param_59;
+                        cmd_ref = param_60;
+                        cmd_limit = param_61;
+                        if (!_2064)
+                        {
+                            break;
+                        }
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        Tile param_64 = tile_1;
+                        float param_65 = linewidth;
+                        write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
+                        cmd_ref = param_63;
+                        uint index = _1372.scene[dd_1];
+                        uint raw1 = _1372.scene[dd_1 + 1u];
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        Alloc param_66 = cmd_alloc;
+                        CmdRef param_67 = cmd_ref;
+                        CmdImage param_68 = CmdImage{ index, offset_1 };
+                        Cmd_Image_write(param_66, param_67, param_68, v_260, v_260BufferSize);
                         cmd_ref.offset += 12u;
                         break;
                     }
                     case 5u:
                     {
-                        bool _1859 = tile_1.tile.offset == 0u;
-                        bool _1865;
-                        if (_1859)
+                        bool _2117 = tile_1.tile.offset == 0u;
+                        bool _2123;
+                        if (_2117)
                         {
-                            _1865 = tile_1.backdrop == 0;
+                            _2123 = tile_1.backdrop == 0;
                         }
                         else
                         {
-                            _1865 = _1859;
+                            _2123 = _2117;
                         }
-                        if (_1865)
+                        if (_2123)
                         {
                             clip_zero_depth = clip_depth + 1u;
                         }
                         else
                         {
-                            Alloc param_59 = cmd_alloc;
-                            CmdRef param_60 = cmd_ref;
-                            uint param_61 = cmd_limit;
-                            bool _1877 = alloc_cmd(param_59, param_60, param_61, v_242, v_242BufferSize);
-                            cmd_alloc = param_59;
-                            cmd_ref = param_60;
-                            cmd_limit = param_61;
-                            if (!_1877)
+                            Alloc param_69 = cmd_alloc;
+                            CmdRef param_70 = cmd_ref;
+                            uint param_71 = cmd_limit;
+                            bool _2135 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
+                            cmd_alloc = param_69;
+                            cmd_ref = param_70;
+                            cmd_limit = param_71;
+                            if (!_2135)
                             {
                                 break;
                             }
-                            Alloc param_62 = cmd_alloc;
-                            CmdRef param_63 = cmd_ref;
-                            Cmd_BeginClip_write(param_62, param_63, v_242, v_242BufferSize);
+                            Alloc param_72 = cmd_alloc;
+                            CmdRef param_73 = cmd_ref;
+                            Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
                             cmd_ref.offset += 4u;
                         }
                         clip_depth++;
@@ -1048,28 +1170,28 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
                     case 37u:
                     {
                         clip_depth--;
-                        Alloc param_64 = cmd_alloc;
-                        CmdRef param_65 = cmd_ref;
-                        uint param_66 = cmd_limit;
-                        bool _1905 = alloc_cmd(param_64, param_65, param_66, v_242, v_242BufferSize);
-                        cmd_alloc = param_64;
-                        cmd_ref = param_65;
-                        cmd_limit = param_66;
-                        if (!_1905)
+                        Alloc param_74 = cmd_alloc;
+                        CmdRef param_75 = cmd_ref;
+                        uint param_76 = cmd_limit;
+                        bool _2163 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
+                        cmd_alloc = param_74;
+                        cmd_ref = param_75;
+                        cmd_limit = param_76;
+                        if (!_2163)
                         {
                             break;
                         }
-                        Alloc param_67 = cmd_alloc;
-                        CmdRef param_68 = cmd_ref;
-                        Tile param_69 = tile_1;
-                        float param_70 = -1.0;
-                        write_fill(param_67, param_68, param_69, param_70, v_242, v_242BufferSize);
-                        cmd_ref = param_68;
-                        uint blend_1 = _1222.scene[dd_1];
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        CmdEndClip param_73 = CmdEndClip{ blend_1 };
-                        Cmd_EndClip_write(param_71, param_72, param_73, v_242, v_242BufferSize);
+                        Alloc param_77 = cmd_alloc;
+                        CmdRef param_78 = cmd_ref;
+                        Tile param_79 = tile_1;
+                        float param_80 = -1.0;
+                        write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
+                        cmd_ref = param_78;
+                        uint blend_1 = _1372.scene[dd_1];
+                        Alloc param_81 = cmd_alloc;
+                        CmdRef param_82 = cmd_ref;
+                        CmdEndClip param_83 = CmdEndClip{ blend_1 };
+                        Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
                         cmd_ref.offset += 8u;
                         break;
                     }
@@ -1103,21 +1225,21 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
             break;
         }
     }
-    bool _1975 = (bin_tile_x + tile_x) < _854.conf.width_in_tiles;
-    bool _1984;
-    if (_1975)
+    bool _2233 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
+    bool _2242;
+    if (_2233)
     {
-        _1984 = (bin_tile_y + tile_y) < _854.conf.height_in_tiles;
+        _2242 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
     }
     else
     {
-        _1984 = _1975;
+        _2242 = _2233;
     }
-    if (_1984)
+    if (_2242)
     {
-        Alloc param_74 = cmd_alloc;
-        CmdRef param_75 = cmd_ref;
-        Cmd_End_write(param_74, param_75, v_242, v_242BufferSize);
+        Alloc param_84 = cmd_alloc;
+        CmdRef param_85 = cmd_ref;
+        Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
     }
 }
 
diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv
index b85fd8c..fdc10a0 100644
Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
index 77396c1..6353f19 100644
Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
index f812f52..734d21e 100644
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -46,10 +46,10 @@ static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
 static const DrawMonoid _23 = { 0u, 0u, 0u, 0u };
 
-ByteAddressBuffer _92 : register(t1, space0);
-ByteAddressBuffer _102 : register(t2, space0);
-ByteAddressBuffer _202 : register(t3, space0);
-RWByteAddressBuffer _284 : register(u0, space0);
+ByteAddressBuffer _93 : register(t1, space0);
+ByteAddressBuffer _103 : register(t2, space0);
+ByteAddressBuffer _203 : register(t3, space0);
+RWByteAddressBuffer _285 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -66,8 +66,8 @@ groupshared DrawMonoid sh_scratch[256];
 DrawMonoid map_tag(uint tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    DrawMonoid _75 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
-    return _75;
+    DrawMonoid _76 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
+    return _76;
 }
 
 DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
@@ -88,15 +88,15 @@ DrawMonoid draw_monoid_identity()
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _92.Load(100) >> uint(2);
-    uint tag_word = _102.Load((drawtag_base + ix) * 4 + 0);
+    uint drawtag_base = _93.Load(100) >> uint(2);
+    uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0);
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     DrawMonoid local[8];
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        tag_word = _102.Load(((drawtag_base + ix) + i) * 4 + 0);
+        tag_word = _103.Load(((drawtag_base + ix) + i) * 4 + 0);
         uint param_1 = tag_word;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -121,15 +121,15 @@ void comp_main()
     DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        DrawMonoid _208;
-        _208.path_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
-        _208.clip_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
-        _208.scene_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
-        _208.info_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
-        row.path_ix = _208.path_ix;
-        row.clip_ix = _208.clip_ix;
-        row.scene_offset = _208.scene_offset;
-        row.info_offset = _208.info_offset;
+        DrawMonoid _209;
+        _209.path_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
+        _209.clip_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
+        _209.scene_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
+        _209.info_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
+        row.path_ix = _209.path_ix;
+        row.clip_ix = _209.clip_ix;
+        row.scene_offset = _209.scene_offset;
+        row.info_offset = _209.info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -137,13 +137,15 @@ void comp_main()
         DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
         row = combine_draw_monoid(param_6, param_7);
     }
-    uint drawdata_base = _92.Load(104) >> uint(2);
-    uint drawinfo_base = _92.Load(68) >> uint(2);
+    uint drawdata_base = _93.Load(104) >> uint(2);
+    uint drawinfo_base = _93.Load(68) >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_92.Load(44) >> uint(2)) + (out_ix * 4u);
-    uint clip_out_base = _92.Load(48) >> uint(2);
+    uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _93.Load(48) >> uint(2);
     float4 mat;
     float2 translate;
+    float2 p0;
+    float2 p1;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
@@ -153,31 +155,31 @@ void comp_main()
             DrawMonoid param_9 = local[i_2 - 1u];
             m = combine_draw_monoid(param_8, param_9);
         }
-        _284.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
-        _284.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
-        _284.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
-        _284.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
+        _285.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
+        _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
         uint dd = drawdata_base + (m.scene_offset >> uint(2));
         uint di = drawinfo_base + (m.info_offset >> uint(2));
-        tag_word = _102.Load(((drawtag_base + ix) + i_2) * 4 + 0);
-        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
+        tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0);
+        if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_92.Load(40) >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_284.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_284.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_284.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_284.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_285.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_284.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 8));
             uint fill_mode = uint(linewidth >= 0.0f);
-            if ((linewidth >= 0.0f) || (tag_word == 276u))
+            if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u))
             {
-                uint trans_ix = _284.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_92.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_284.Load(t * 4 + 8), _284.Load((t + 1u) * 4 + 8), _284.Load((t + 2u) * 4 + 8), _284.Load((t + 3u) * 4 + 8)));
-                if (tag_word == 276u)
+                uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_93.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_285.Load(t * 4 + 8), _285.Load((t + 1u) * 4 + 8), _285.Load((t + 2u) * 4 + 8), _285.Load((t + 3u) * 4 + 8)));
+                if ((tag_word == 276u) || (tag_word == 732u))
                 {
-                    translate = asfloat(uint2(_284.Load((t + 4u) * 4 + 8), _284.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 8), _285.Load((t + 5u) * 4 + 8)));
                 }
             }
             if (linewidth >= 0.0f)
@@ -189,15 +191,14 @@ void comp_main()
                 case 68u:
                 case 72u:
                 {
-                    _284.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store(di * 4 + 8, asuint(linewidth));
                     break;
                 }
                 case 276u:
                 {
-                    _284.Store(di * 4 + 8, asuint(linewidth));
-                    uint index = _102.Load(dd * 4 + 0);
-                    float2 p0 = asfloat(uint2(_102.Load((dd + 1u) * 4 + 0), _102.Load((dd + 2u) * 4 + 0)));
-                    float2 p1 = asfloat(uint2(_102.Load((dd + 3u) * 4 + 0), _102.Load((dd + 4u) * 4 + 0)));
+                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+                    p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
                     p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
                     p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
@@ -205,9 +206,38 @@ void comp_main()
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
                     float line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    _284.Store((di + 1u) * 4 + 8, asuint(line_x));
-                    _284.Store((di + 2u) * 4 + 8, asuint(line_y));
-                    _284.Store((di + 3u) * 4 + 8, asuint(line_c));
+                    _285.Store((di + 1u) * 4 + 8, asuint(line_x));
+                    _285.Store((di + 2u) * 4 + 8, asuint(line_y));
+                    _285.Store((di + 3u) * 4 + 8, asuint(line_c));
+                    break;
+                }
+                case 732u:
+                {
+                    p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+                    p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
+                    float r0 = asfloat(_103.Load((dd + 5u) * 4 + 0));
+                    float r1 = asfloat(_103.Load((dd + 6u) * 4 + 0));
+                    float inv_det = 1.0f / ((mat.x * mat.w) - (mat.y * mat.z));
+                    float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det;
+                    float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+                    inv_tr += p0;
+                    float2 center1 = p1 - p0;
+                    float rr = r1 / (r1 - r0);
+                    float rainv = rr / ((r1 * r1) - dot(center1, center1));
+                    float2 c1 = center1 * rainv;
+                    float ra = rr * rainv;
+                    float roff = rr - 1.0f;
+                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store((di + 1u) * 4 + 8, asuint(inv_mat.x));
+                    _285.Store((di + 2u) * 4 + 8, asuint(inv_mat.y));
+                    _285.Store((di + 3u) * 4 + 8, asuint(inv_mat.z));
+                    _285.Store((di + 4u) * 4 + 8, asuint(inv_mat.w));
+                    _285.Store((di + 5u) * 4 + 8, asuint(inv_tr.x));
+                    _285.Store((di + 6u) * 4 + 8, asuint(inv_tr.y));
+                    _285.Store((di + 7u) * 4 + 8, asuint(c1.x));
+                    _285.Store((di + 8u) * 4 + 8, asuint(c1.y));
+                    _285.Store((di + 9u) * 4 + 8, asuint(ra));
+                    _285.Store((di + 10u) * 4 + 8, asuint(roff));
                     break;
                 }
                 case 5u:
@@ -223,7 +253,7 @@ void comp_main()
             {
                 path_ix = m.path_ix;
             }
-            _284.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
+            _285.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
         }
     }
 }
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
index a8516ae..c11e21b 100644
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -124,7 +124,7 @@ static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
 }
 
 static inline __attribute__((always_inline))
@@ -144,19 +144,19 @@ DrawMonoid draw_monoid_identity()
     return DrawMonoid{ 0u, 0u, 0u, 0u };
 }
 
-kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92 [[buffer(1)]], const device SceneBuf& _102 [[buffer(2)]], const device ParentBuf& _202 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& _285 [[buffer(0)]], const device ConfigBuf& _93 [[buffer(1)]], const device SceneBuf& _103 [[buffer(2)]], const device ParentBuf& _203 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _92.conf.drawtag_offset >> uint(2);
-    uint tag_word = _102.scene[drawtag_base + ix];
+    uint drawtag_base = _93.conf.drawtag_offset >> uint(2);
+    uint tag_word = _103.scene[drawtag_base + ix];
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     spvUnsafeArray<DrawMonoid, 8> local;
     local[0] = agg;
     for (uint i = 1u; i < 8u; i++)
     {
-        tag_word = _102.scene[(drawtag_base + ix) + i];
+        tag_word = _103.scene[(drawtag_base + ix) + i];
         uint param_1 = tag_word;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -181,11 +181,11 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
     DrawMonoid row = draw_monoid_identity();
     if (gl_WorkGroupID.x > 0u)
     {
-        uint _205 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _202.parent[_205].path_ix;
-        row.clip_ix = _202.parent[_205].clip_ix;
-        row.scene_offset = _202.parent[_205].scene_offset;
-        row.info_offset = _202.parent[_205].info_offset;
+        uint _206 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _203.parent[_206].path_ix;
+        row.clip_ix = _203.parent[_206].clip_ix;
+        row.scene_offset = _203.parent[_206].scene_offset;
+        row.info_offset = _203.parent[_206].info_offset;
     }
     if (gl_LocalInvocationID.x > 0u)
     {
@@ -193,13 +193,15 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
         DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
         row = combine_draw_monoid(param_6, param_7);
     }
-    uint drawdata_base = _92.conf.drawdata_offset >> uint(2);
-    uint drawinfo_base = _92.conf.drawinfo_alloc.offset >> uint(2);
+    uint drawdata_base = _93.conf.drawdata_offset >> uint(2);
+    uint drawinfo_base = _93.conf.drawinfo_alloc.offset >> uint(2);
     uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_92.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
-    uint clip_out_base = _92.conf.clip_alloc.offset >> uint(2);
+    uint out_base = (_93.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _93.conf.clip_alloc.offset >> uint(2);
     float4 mat;
     float2 translate;
+    float2 p0;
+    float2 p1;
     for (uint i_2 = 0u; i_2 < 8u; i_2++)
     {
         DrawMonoid m = row;
@@ -209,31 +211,31 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
             DrawMonoid param_9 = local[i_2 - 1u];
             m = combine_draw_monoid(param_8, param_9);
         }
-        _284.memory[out_base + (i_2 * 4u)] = m.path_ix;
-        _284.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
-        _284.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
-        _284.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
+        _285.memory[out_base + (i_2 * 4u)] = m.path_ix;
+        _285.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
+        _285.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
+        _285.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
         uint dd = drawdata_base + (m.scene_offset >> uint(2));
         uint di = drawinfo_base + (m.info_offset >> uint(2));
-        tag_word = _102.scene[(drawtag_base + ix) + i_2];
-        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
+        tag_word = _103.scene[(drawtag_base + ix) + i_2];
+        if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u))
         {
-            uint bbox_offset = (_92.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_284.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(_284.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(_284.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(_284.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_93.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_285.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(_285.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(_285.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(_285.memory[bbox_offset + 3u]) - 32768.0;
             float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(_284.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(_285.memory[bbox_offset + 4u]);
             uint fill_mode = uint(linewidth >= 0.0);
-            if ((linewidth >= 0.0) || (tag_word == 276u))
+            if (((linewidth >= 0.0) || (tag_word == 276u)) || (tag_word == 732u))
             {
-                uint trans_ix = _284.memory[bbox_offset + 5u];
-                uint t = (_92.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(_284.memory[t], _284.memory[t + 1u], _284.memory[t + 2u], _284.memory[t + 3u]));
-                if (tag_word == 276u)
+                uint trans_ix = _285.memory[bbox_offset + 5u];
+                uint t = (_93.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(_285.memory[t], _285.memory[t + 1u], _285.memory[t + 2u], _285.memory[t + 3u]));
+                if ((tag_word == 276u) || (tag_word == 732u))
                 {
-                    translate = as_type<float2>(uint2(_284.memory[t + 4u], _284.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(_285.memory[t + 4u], _285.memory[t + 5u]));
                 }
             }
             if (linewidth >= 0.0)
@@ -245,15 +247,14 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
                 case 68u:
                 case 72u:
                 {
-                    _284.memory[di] = as_type<uint>(linewidth);
+                    _285.memory[di] = as_type<uint>(linewidth);
                     break;
                 }
                 case 276u:
                 {
-                    _284.memory[di] = as_type<uint>(linewidth);
-                    uint index = _102.scene[dd];
-                    float2 p0 = as_type<float2>(uint2(_102.scene[dd + 1u], _102.scene[dd + 2u]));
-                    float2 p1 = as_type<float2>(uint2(_102.scene[dd + 3u], _102.scene[dd + 4u]));
+                    _285.memory[di] = as_type<uint>(linewidth);
+                    p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+                    p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
                     p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
                     p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                     float2 dxy = p1 - p0;
@@ -261,9 +262,38 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
                     float line_x = dxy.x * scale;
                     float line_y = dxy.y * scale;
                     float line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    _284.memory[di + 1u] = as_type<uint>(line_x);
-                    _284.memory[di + 2u] = as_type<uint>(line_y);
-                    _284.memory[di + 3u] = as_type<uint>(line_c);
+                    _285.memory[di + 1u] = as_type<uint>(line_x);
+                    _285.memory[di + 2u] = as_type<uint>(line_y);
+                    _285.memory[di + 3u] = as_type<uint>(line_c);
+                    break;
+                }
+                case 732u:
+                {
+                    p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+                    p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
+                    float r0 = as_type<float>(_103.scene[dd + 5u]);
+                    float r1 = as_type<float>(_103.scene[dd + 6u]);
+                    float inv_det = 1.0 / ((mat.x * mat.w) - (mat.y * mat.z));
+                    float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det;
+                    float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+                    inv_tr += p0;
+                    float2 center1 = p1 - p0;
+                    float rr = r1 / (r1 - r0);
+                    float rainv = rr / ((r1 * r1) - dot(center1, center1));
+                    float2 c1 = center1 * rainv;
+                    float ra = rr * rainv;
+                    float roff = rr - 1.0;
+                    _285.memory[di] = as_type<uint>(linewidth);
+                    _285.memory[di + 1u] = as_type<uint>(inv_mat.x);
+                    _285.memory[di + 2u] = as_type<uint>(inv_mat.y);
+                    _285.memory[di + 3u] = as_type<uint>(inv_mat.z);
+                    _285.memory[di + 4u] = as_type<uint>(inv_mat.w);
+                    _285.memory[di + 5u] = as_type<uint>(inv_tr.x);
+                    _285.memory[di + 6u] = as_type<uint>(inv_tr.y);
+                    _285.memory[di + 7u] = as_type<uint>(c1.x);
+                    _285.memory[di + 8u] = as_type<uint>(c1.y);
+                    _285.memory[di + 9u] = as_type<uint>(ra);
+                    _285.memory[di + 10u] = as_type<uint>(roff);
                     break;
                 }
                 case 5u:
@@ -279,7 +309,7 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
             {
                 path_ix = m.path_ix;
             }
-            _284.memory[clip_out_base + m.clip_ix] = path_ix;
+            _285.memory[clip_out_base + m.clip_ix] = path_ix;
         }
     }
 }
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
index d18b287..58dde43 100644
Binary files a/piet-gpu/shader/gen/draw_leaf.spv and b/piet-gpu/shader/gen/draw_leaf.spv differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
index 4df0ec5..c101fc8 100644
Binary files a/piet-gpu/shader/gen/draw_reduce.dxil and b/piet-gpu/shader/gen/draw_reduce.dxil differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
index 7220b7e..8311155 100644
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -44,10 +44,10 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
 
-ByteAddressBuffer _86 : register(t1, space0);
-ByteAddressBuffer _96 : register(t2, space0);
-RWByteAddressBuffer _187 : register(u3, space0);
-RWByteAddressBuffer _205 : register(u0, space0);
+ByteAddressBuffer _87 : register(t1, space0);
+ByteAddressBuffer _97 : register(t2, space0);
+RWByteAddressBuffer _188 : register(u3, space0);
+RWByteAddressBuffer _206 : register(u0, space0);
 
 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@@ -64,8 +64,8 @@ groupshared DrawMonoid sh_scratch[256];
 DrawMonoid map_tag(uint tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    DrawMonoid _69 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
-    return _69;
+    DrawMonoid _70 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
+    return _70;
 }
 
 DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
@@ -81,13 +81,13 @@ DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _86.Load(100) >> uint(2);
-    uint tag_word = _96.Load((drawtag_base + ix) * 4 + 0);
+    uint drawtag_base = _87.Load(100) >> uint(2);
+    uint tag_word = _97.Load((drawtag_base + ix) * 4 + 0);
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        uint tag_word_1 = _96.Load(((drawtag_base + ix) + i) * 4 + 0);
+        uint tag_word_1 = _97.Load(((drawtag_base + ix) + i) * 4 + 0);
         uint param_1 = tag_word_1;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -109,10 +109,10 @@ void comp_main()
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _187.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
-        _187.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
-        _187.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
-        _187.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
+        _188.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
+        _188.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
+        _188.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
+        _188.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
index 8e409a8..759267c 100644
--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -85,7 +85,7 @@ static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
     uint has_path = uint(tag_word != 0u);
-    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
 }
 
 static inline __attribute__((always_inline))
@@ -99,17 +99,17 @@ DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMono
     return c;
 }
 
-kernel void main0(const device ConfigBuf& _86 [[buffer(1)]], const device SceneBuf& _96 [[buffer(2)]], device OutBuf& _187 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device ConfigBuf& _87 [[buffer(1)]], const device SceneBuf& _97 [[buffer(2)]], device OutBuf& _188 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
     threadgroup DrawMonoid sh_scratch[256];
     uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _86.conf.drawtag_offset >> uint(2);
-    uint tag_word = _96.scene[drawtag_base + ix];
+    uint drawtag_base = _87.conf.drawtag_offset >> uint(2);
+    uint tag_word = _97.scene[drawtag_base + ix];
     uint param = tag_word;
     DrawMonoid agg = map_tag(param);
     for (uint i = 1u; i < 8u; i++)
     {
-        uint tag_word_1 = _96.scene[(drawtag_base + ix) + i];
+        uint tag_word_1 = _97.scene[(drawtag_base + ix) + i];
         uint param_1 = tag_word_1;
         DrawMonoid param_2 = agg;
         DrawMonoid param_3 = map_tag(param_1);
@@ -131,10 +131,10 @@ kernel void main0(const device ConfigBuf& _86 [[buffer(1)]], const device SceneB
     }
     if (gl_LocalInvocationID.x == 0u)
     {
-        _187.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
-        _187.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
-        _187.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
-        _187.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
+        _188.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _188.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+        _188.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
+        _188.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
     }
 }
 
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
index 4daf43a..d6c6fb7 100644
Binary files a/piet-gpu/shader/gen/draw_reduce.spv and b/piet-gpu/shader/gen/draw_reduce.spv differ
diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil
index 4ea23f7..873fa29 100644
Binary files a/piet-gpu/shader/gen/draw_root.dxil and b/piet-gpu/shader/gen/draw_root.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil
index c0c27c9..9f8080b 100644
Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl
index f17b240..92fe05b 100644
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@@ -48,6 +48,21 @@ struct CmdLinGrad
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -146,8 +161,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _278 : register(u0, space0);
-ByteAddressBuffer _1521 : register(t1, space0);
+RWByteAddressBuffer _291 : register(u0, space0);
+ByteAddressBuffer _1666 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@@ -174,8 +189,8 @@ float4 spvUnpackUnorm4x8(uint value)
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _291 = { a.offset + offset };
-    return _291;
+    Alloc _304 = { a.offset + offset };
+    return _304;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -191,7 +206,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _278.Load(offset * 4 + 8);
+    uint v = _291.Load(offset * 4 + 8);
     return v;
 }
 
@@ -200,8 +215,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _525;
+    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _663;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -221,9 +236,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _542 = { ref.offset + 4u };
+    CmdStrokeRef _679 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _542;
+    CmdStrokeRef param_1 = _679;
     return CmdStroke_read(param, param_1);
 }
 
@@ -259,8 +274,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _675 = { raw5 };
-    s.next = _675;
+    TileSegRef _820 = { raw5 };
+    s.next = _820;
     return s;
 }
 
@@ -286,9 +301,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _532 = { ref.offset + 4u };
+    CmdFillRef _669 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _532;
+    CmdFillRef param_1 = _669;
     return CmdFill_read(param, param_1);
 }
 
@@ -305,9 +320,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _552 = { ref.offset + 4u };
+    CmdAlphaRef _689 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _552;
+    CmdAlphaRef param_1 = _689;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -324,9 +339,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _562 = { ref.offset + 4u };
+    CmdColorRef _699 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _562;
+    CmdColorRef param_1 = _699;
     return CmdColor_read(param, param_1);
 }
 
@@ -370,12 +385,66 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _572 = { ref.offset + 4u };
+    CmdLinGradRef _709 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _572;
+    CmdLinGradRef param_1 = _709;
     return CmdLinGrad_read(param, param_1);
 }
 
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4));
+    s.xlat = float2(asfloat(raw5), asfloat(raw6));
+    s.c1 = float2(asfloat(raw7), asfloat(raw8));
+    s.ra = asfloat(raw9);
+    s.roff = asfloat(raw10);
+    return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{
+    CmdRadGradRef _719 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdRadGradRef param_1 = _719;
+    return CmdRadGrad_read(param, param_1);
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -393,9 +462,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _582 = { ref.offset + 4u };
+    CmdImageRef _729 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _582;
+    CmdImageRef param_1 = _729;
     return CmdImage_read(param, param_1);
 }
 
@@ -408,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -445,9 +514,9 @@ CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _592 = { ref.offset + 4u };
+    CmdEndClipRef _739 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _592;
+    CmdEndClipRef param_1 = _739;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -637,8 +706,8 @@ float3 set_lum(float3 c, float l)
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -726,9 +795,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -738,9 +807,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -877,24 +946,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _602 = { ref.offset + 4u };
+    CmdJumpRef _749 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _602;
+    CmdJumpRef param_1 = _749;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1536;
-    _1536.offset = _1521.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1681;
+    _1681.offset = _1666.Load(24);
     Alloc param;
-    param.offset = _1536.offset;
+    param.offset = _1681.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1545 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1545;
+    CmdRef _1690 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1690;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -903,7 +972,7 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _278.Load(4) == 0u;
+    bool mem_ok = _291.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -928,8 +997,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1638 = { stroke.tile_ref };
-                tile_seg_ref = _1638;
+                TileSegRef _1784 = { stroke.tile_ref };
+                tile_seg_ref = _1784;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -965,8 +1034,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1758 = { fill.tile_ref };
-                tile_seg_ref = _1758;
+                TileSegRef _1904 = { fill.tile_ref };
+                tile_seg_ref = _1904;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1055,11 +1124,12 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1068,74 +1138,100 @@ void comp_main()
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                float4 _2121[8];
-                fillImage(_2121, param_32, param_33);
-                float4 img[8] = _2121;
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
+                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                float4 _2391[8];
+                fillImage(_2391, param_36, param_37);
+                float4 img[8] = _2391;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = 0.0f.xxxx;
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2299;
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
+                cmd_ref = _2569;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1143,9 +1239,9 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        float3 param_49 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
+        uint param_52 = i_1;
+        float3 param_53 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_52))] = float4(tosRGB(param_53), rgba[i_1].w);
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl
index c1f41af..6489563 100644
--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@@ -94,6 +94,21 @@ struct CmdLinGrad
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -222,7 +237,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -230,29 +245,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_278.memory[offset];
+    uint v = v_291.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_278);
+    uint tag_and_flags = read_mem(param, param_1, v_291);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -260,11 +275,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_278);
+    return CmdStroke_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -276,27 +291,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_278);
+    uint raw4 = read_mem(param_8, param_9, v_291);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_278);
+    uint raw5 = read_mem(param_10, param_11, v_291);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -312,15 +327,15 @@ uint2 chunk_offset(thread const uint& i)
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -328,51 +343,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_278);
+    return CmdFill_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_278);
+    return CmdAlpha_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_278);
+    return CmdColor_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -393,21 +408,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -417,23 +432,78 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_278);
+    return CmdLinGrad_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_291);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_291);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_291);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_291);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_291);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_291);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_291);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19, v_291);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21, v_291);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
+    s.xlat = float2(as_type<float>(raw5), as_type<float>(raw6));
+    s.c1 = float2(as_type<float>(raw7), as_type<float>(raw8));
+    s.ra = as_type<float>(raw9);
+    s.roff = as_type<float>(raw10);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+{
+    Alloc param = a;
+    CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
+    return CmdRadGrad_read(param, param_1, v_291);
+}
+
+static inline __attribute__((always_inline))
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_291);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -441,11 +511,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_278);
+    return CmdImage_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -458,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -485,23 +555,23 @@ uint packsRGB(thread float4& rgba)
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_278);
+    return CmdEndClip_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -701,8 +771,8 @@ float3 set_lum(thread const float3& c, thread const float& l)
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 static inline __attribute__((always_inline))
@@ -791,9 +861,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -803,9 +873,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -931,30 +1001,30 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_278);
+    return CmdJump_read(param, param_1, v_291);
 }
 
-kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1521.conf.ptcl_alloc.offset;
+    param.offset = _1666.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -967,7 +1037,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_278.mem_error == 0u;
+    bool mem_ok = v_291.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -976,7 +1046,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
         if (tag == 0u)
         {
             break;
@@ -987,7 +1057,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1000,7 +1070,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1023,7 +1093,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1036,7 +1106,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1080,7 +1150,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1092,7 +1162,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1107,7 +1177,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1117,11 +1187,12 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1130,72 +1201,98 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                spvUnsafeArray<float4, 8> img;
-                img = fillImage(param_32, param_33, image_atlas);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
+                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                spvUnsafeArray<float4, 8> img;
+                img = fillImage(param_36, param_37, image_atlas);
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = float4(0.0);
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1203,9 +1300,9 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        float3 param_49 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
+        uint param_52 = i_1;
+        float3 param_53 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_53), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv
index 91272da..7061263 100644
Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil
index 18c4b7e..a594d50 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ
diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl
index de95771..019a73c 100644
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@@ -48,6 +48,21 @@ struct CmdLinGrad
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -146,8 +161,8 @@ struct Config
 
 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
 
-RWByteAddressBuffer _278 : register(u0, space0);
-ByteAddressBuffer _1521 : register(t1, space0);
+RWByteAddressBuffer _291 : register(u0, space0);
+ByteAddressBuffer _1666 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@@ -174,8 +189,8 @@ float4 spvUnpackUnorm4x8(uint value)
 
 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _291 = { a.offset + offset };
-    return _291;
+    Alloc _304 = { a.offset + offset };
+    return _304;
 }
 
 bool touch_mem(Alloc alloc, uint offset)
@@ -191,7 +206,7 @@ uint read_mem(Alloc alloc, uint offset)
     {
         return 0u;
     }
-    uint v = _278.Load(offset * 4 + 8);
+    uint v = _291.Load(offset * 4 + 8);
     return v;
 }
 
@@ -200,8 +215,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
     uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _525;
+    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _663;
 }
 
 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@@ -221,9 +236,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
 
 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _542 = { ref.offset + 4u };
+    CmdStrokeRef _679 = { ref.offset + 4u };
     Alloc param = a;
-    CmdStrokeRef param_1 = _542;
+    CmdStrokeRef param_1 = _679;
     return CmdStroke_read(param, param_1);
 }
 
@@ -259,8 +274,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
     s.origin = float2(asfloat(raw0), asfloat(raw1));
     s._vector = float2(asfloat(raw2), asfloat(raw3));
     s.y_edge = asfloat(raw4);
-    TileSegRef _675 = { raw5 };
-    s.next = _675;
+    TileSegRef _820 = { raw5 };
+    s.next = _820;
     return s;
 }
 
@@ -286,9 +301,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
 
 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _532 = { ref.offset + 4u };
+    CmdFillRef _669 = { ref.offset + 4u };
     Alloc param = a;
-    CmdFillRef param_1 = _532;
+    CmdFillRef param_1 = _669;
     return CmdFill_read(param, param_1);
 }
 
@@ -305,9 +320,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
 
 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _552 = { ref.offset + 4u };
+    CmdAlphaRef _689 = { ref.offset + 4u };
     Alloc param = a;
-    CmdAlphaRef param_1 = _552;
+    CmdAlphaRef param_1 = _689;
     return CmdAlpha_read(param, param_1);
 }
 
@@ -324,9 +339,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
 
 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _562 = { ref.offset + 4u };
+    CmdColorRef _699 = { ref.offset + 4u };
     Alloc param = a;
-    CmdColorRef param_1 = _562;
+    CmdColorRef param_1 = _699;
     return CmdColor_read(param, param_1);
 }
 
@@ -370,12 +385,66 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
 
 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _572 = { ref.offset + 4u };
+    CmdLinGradRef _709 = { ref.offset + 4u };
     Alloc param = a;
-    CmdLinGradRef param_1 = _572;
+    CmdLinGradRef param_1 = _709;
     return CmdLinGrad_read(param, param_1);
 }
 
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4));
+    s.xlat = float2(asfloat(raw5), asfloat(raw6));
+    s.c1 = float2(asfloat(raw7), asfloat(raw8));
+    s.ra = asfloat(raw9);
+    s.roff = asfloat(raw10);
+    return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{
+    CmdRadGradRef _719 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdRadGradRef param_1 = _719;
+    return CmdRadGrad_read(param, param_1);
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 {
     uint ix = ref.offset >> uint(2);
@@ -393,9 +462,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _582 = { ref.offset + 4u };
+    CmdImageRef _729 = { ref.offset + 4u };
     Alloc param = a;
-    CmdImageRef param_1 = _582;
+    CmdImageRef param_1 = _729;
     return CmdImage_read(param, param_1);
 }
 
@@ -408,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas[uv];
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     spvReturnValue = rgba;
@@ -445,9 +514,9 @@ CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)
 
 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _592 = { ref.offset + 4u };
+    CmdEndClipRef _739 = { ref.offset + 4u };
     Alloc param = a;
-    CmdEndClipRef param_1 = _592;
+    CmdEndClipRef param_1 = _739;
     return CmdEndClip_read(param, param_1);
 }
 
@@ -637,8 +706,8 @@ float3 set_lum(float3 c, float l)
 {
     float3 param = c;
     float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 float3 mix_blend(float3 cb, float3 cs, uint mode)
@@ -726,9 +795,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -738,9 +807,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -877,24 +946,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
 
 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _602 = { ref.offset + 4u };
+    CmdJumpRef _749 = { ref.offset + 4u };
     Alloc param = a;
-    CmdJumpRef param_1 = _602;
+    CmdJumpRef param_1 = _749;
     return CmdJump_read(param, param_1);
 }
 
 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1536;
-    _1536.offset = _1521.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1681;
+    _1681.offset = _1666.Load(24);
     Alloc param;
-    param.offset = _1536.offset;
+    param.offset = _1681.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1545 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1545;
+    CmdRef _1690 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1690;
     uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
     float2 xy = float2(xy_uint);
     float4 rgba[8];
@@ -903,7 +972,7 @@ void comp_main()
         rgba[i] = 0.0f.xxxx;
     }
     uint clip_depth = 0u;
-    bool mem_ok = _278.Load(4) == 0u;
+    bool mem_ok = _291.Load(4) == 0u;
     float df[8];
     TileSegRef tile_seg_ref;
     float area[8];
@@ -928,8 +997,8 @@ void comp_main()
                 {
                     df[k] = 1000000000.0f;
                 }
-                TileSegRef _1638 = { stroke.tile_ref };
-                tile_seg_ref = _1638;
+                TileSegRef _1784 = { stroke.tile_ref };
+                tile_seg_ref = _1784;
                 do
                 {
                     uint param_7 = tile_seg_ref.offset;
@@ -965,8 +1034,8 @@ void comp_main()
                 {
                     area[k_3] = float(fill.backdrop);
                 }
-                TileSegRef _1758 = { fill.tile_ref };
-                tile_seg_ref = _1758;
+                TileSegRef _1904 = { fill.tile_ref };
+                tile_seg_ref = _1904;
                 do
                 {
                     uint param_15 = tile_seg_ref.offset;
@@ -1055,11 +1124,12 @@ void comp_main()
                     int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                     float4 fg_rgba = gradients[int2(x, int(lin.index))];
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1068,74 +1138,100 @@ void comp_main()
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                float4 _2121[8];
-                fillImage(_2121, param_32, param_33);
-                float4 img[8] = _2121;
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
+                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                float4 _2391[8];
+                fillImage(_2391, param_36, param_37);
+                float4 img[8] = _2391;
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = 0.0f.xxxx;
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = 0.0f.xxxx;
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2299;
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
+                cmd_ref = _2569;
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1143,8 +1239,8 @@ void comp_main()
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
+        uint param_52 = i_1;
+        image[int2(xy_uint + chunk_offset(param_52))] = rgba[i_1].w.x;
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl
index 5128e99..6402c6f 100644
--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@@ -94,6 +94,21 @@ struct CmdLinGrad
     float line_c;
 };
 
+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
     uint offset;
@@ -222,7 +237,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }
 
 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
 {
     Alloc param = alloc;
     uint param_1 = offset;
@@ -230,29 +245,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
     {
         return 0u;
     }
-    uint v = v_278.memory[offset];
+    uint v = v_291.memory[offset];
     return v;
 }
 
 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_278);
+    uint tag_and_flags = read_mem(param, param_1, v_291);
     return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }
 
 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = as_type<float>(raw1);
@@ -260,11 +275,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }
 
 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_278);
+    return CmdStroke_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -276,27 +291,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }
 
 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     Alloc param_8 = a;
     uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_278);
+    uint raw4 = read_mem(param_8, param_9, v_291);
     Alloc param_10 = a;
     uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_278);
+    uint raw5 = read_mem(param_10, param_11, v_291);
     TileSeg s;
     s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
     s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@@ -312,15 +327,15 @@ uint2 chunk_offset(thread const uint& i)
 }
 
 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -328,51 +343,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }
 
 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_278);
+    return CmdFill_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdAlpha s;
     s.alpha = as_type<float>(raw0);
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_278);
+    return CmdAlpha_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdColor s;
     s.rgba_color = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_278);
+    return CmdColor_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -393,21 +408,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
     Alloc param_4 = a;
     uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
     Alloc param_6 = a;
     uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
     CmdLinGrad s;
     s.index = raw0;
     s.line_x = as_type<float>(raw1);
@@ -417,23 +432,78 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }
 
 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_278);
+    return CmdLinGrad_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     Alloc param_2 = a;
     uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_291);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_291);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_291);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_291);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_291);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_291);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_291);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19, v_291);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21, v_291);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
+    s.xlat = float2(as_type<float>(raw5), as_type<float>(raw6));
+    s.c1 = float2(as_type<float>(raw7), as_type<float>(raw8));
+    s.ra = as_type<float>(raw9);
+    s.roff = as_type<float>(raw10);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+{
+    Alloc param = a;
+    CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
+    return CmdRadGrad_read(param, param_1, v_291);
+}
+
+static inline __attribute__((always_inline))
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_291);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_291);
     CmdImage s;
     s.index = raw0;
     s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@@ -441,11 +511,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }
 
 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_278);
+    return CmdImage_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -458,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
         int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
         float4 fg_rgba = image_atlas.read(uint2(uv));
         float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
         rgba[i] = fg_rgba;
     }
     return rgba;
@@ -485,23 +555,23 @@ uint packsRGB(thread float4& rgba)
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdEndClip s;
     s.blend = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_278);
+    return CmdEndClip_read(param, param_1, v_291);
 }
 
 static inline __attribute__((always_inline))
@@ -701,8 +771,8 @@ float3 set_lum(thread const float3& c, thread const float& l)
 {
     float3 param = c;
     float3 param_1 = c + float3(l - lum(param));
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }
 
 static inline __attribute__((always_inline))
@@ -791,9 +861,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_20 = cb;
             float3 param_21 = cs;
             float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
             float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
             float param_25 = lum(param_23);
             b = set_lum(param_24, param_25);
             break;
@@ -803,9 +873,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
             float3 param_26 = cs;
             float3 param_27 = cb;
             float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
             float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
             float param_31 = lum(param_29);
             b = set_lum(param_30, param_31);
             break;
@@ -931,30 +1001,30 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
 }
 
 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
 {
     uint ix = ref.offset >> uint(2);
     Alloc param = a;
     uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
     CmdJump s;
     s.new_ref = raw0;
     return s;
 }
 
 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
     Alloc param = a;
     CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_278);
+    return CmdJump_read(param, param_1, v_291);
 }
 
-kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
     Alloc param;
-    param.offset = _1521.conf.ptcl_alloc.offset;
+    param.offset = _1666.conf.ptcl_alloc.offset;
     uint param_1 = tile_ix * 1024u;
     uint param_2 = 1024u;
     Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@@ -967,7 +1037,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
         rgba[i] = float4(0.0);
     }
     uint clip_depth = 0u;
-    bool mem_ok = v_278.mem_error == 0u;
+    bool mem_ok = v_291.mem_error == 0u;
     spvUnsafeArray<float, 8> df;
     TileSegRef tile_seg_ref;
     spvUnsafeArray<float, 8> area;
@@ -976,7 +1046,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
     {
         Alloc param_3 = cmd_alloc;
         CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
         if (tag == 0u)
         {
             break;
@@ -987,7 +1057,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_5 = cmd_alloc;
                 CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
                 for (uint k = 0u; k < 8u; k++)
                 {
                     df[k] = 1000000000.0;
@@ -1000,7 +1070,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_9 = mem_ok;
                     Alloc param_10 = new_alloc(param_7, param_8, param_9);
                     TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
                     float2 line_vec = seg.vector;
                     for (uint k_1 = 0u; k_1 < 8u; k_1++)
                     {
@@ -1023,7 +1093,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_13 = cmd_alloc;
                 CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
                 for (uint k_3 = 0u; k_3 < 8u; k_3++)
                 {
                     area[k_3] = float(fill.backdrop);
@@ -1036,7 +1106,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                     bool param_17 = mem_ok;
                     Alloc param_18 = new_alloc(param_15, param_16, param_17);
                     TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
                     for (uint k_4 = 0u; k_4 < 8u; k_4++)
                     {
                         uint param_20 = k_4;
@@ -1080,7 +1150,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_21 = cmd_alloc;
                 CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
                 for (uint k_7 = 0u; k_7 < 8u; k_7++)
                 {
                     area[k_7] = alpha.alpha;
@@ -1092,7 +1162,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_23 = cmd_alloc;
                 CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
                 uint param_25 = color.rgba_color;
                 float4 fg = unpacksRGB(param_25);
                 for (uint k_8 = 0u; k_8 < 8u; k_8++)
@@ -1107,7 +1177,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_26 = cmd_alloc;
                 CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
                 float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                 for (uint k_9 = 0u; k_9 < 8u; k_9++)
                 {
@@ -1117,11 +1187,12 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                     int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                     float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                     float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                 }
                 cmd_ref.offset += 20u;
                 break;
@@ -1130,72 +1201,98 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
             {
                 Alloc param_30 = cmd_alloc;
                 CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                spvUnsafeArray<float4, 8> img;
-                img = fillImage(param_32, param_33, image_atlas);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
                 for (uint k_10 = 0u; k_10 < 8u; k_10++)
                 {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
+                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                 }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                 break;
             }
             case 8u:
             {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                spvUnsafeArray<float4, 8> img;
+                img = fillImage(param_36, param_37, image_atlas);
                 for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                 {
                     uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = float4(0.0);
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = float4(0.0);
                 }
                 clip_depth++;
                 cmd_ref.offset += 4u;
                 break;
             }
-            case 9u:
+            case 10u:
             {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
                 uint blend_mode = end_clip.blend >> uint(8);
                 uint comp_mode = end_clip.blend & 255u;
                 clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                 {
                     uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                 }
                 cmd_ref.offset += 8u;
                 break;
             }
-            case 10u:
+            case 11u:
             {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
                 cmd_alloc.offset = cmd_ref.offset;
                 break;
             }
@@ -1203,8 +1300,8 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
     }
     for (uint i_1 = 0u; i_1 < 8u; i_1++)
     {
-        uint param_48 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
+        uint param_52 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
     }
 }
 
diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv
index 791b76c..4633401 100644
Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ
diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil
index 7759910..7b130e0 100644
Binary files a/piet-gpu/shader/gen/tile_alloc.dxil and b/piet-gpu/shader/gen/tile_alloc.dxil differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index a97715a..c49e2fa 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -192,10 +192,27 @@ void main() {
                 int x = int(round(clamp(my_d, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
                 mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(lin.index)));
                 fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
-                rgba[k] = fg_rgba;
+                mediump vec4 fg_k = fg_rgba * area[k];
+                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
             }
             cmd_ref.offset += 4 + CmdLinGrad_size;
             break;
+        case Cmd_RadGrad:
+            CmdRadGrad rad = Cmd_RadGrad_read(cmd_alloc, cmd_ref);
+            for (uint k = 0; k < CHUNK; k++) {
+                vec2 my_xy = xy + vec2(chunk_offset(k));
+                my_xy = rad.mat.xz * my_xy.x + rad.mat.yw * my_xy.y - rad.xlat;
+                float ba = dot(my_xy, rad.c1);
+                float ca = rad.ra * dot(my_xy, my_xy);
+                float t = sqrt(ba * ba  + ca) - ba - rad.roff;
+                int x = int(round(clamp(t, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
+                mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(rad.index)));
+                fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
+                mediump vec4 fg_k = fg_rgba * area[k];
+                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
+            }
+            cmd_ref.offset += 4 + CmdRadGrad_size;
+            break;
         case Cmd_Image:
             CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
             mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 9b9b341..54dcc9e 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -18,6 +18,10 @@ struct CmdLinGradRef {
     uint offset;
 };
 
+struct CmdRadGradRef {
+    uint offset;
+};
+
 struct CmdImageRef {
     uint offset;
 };
@@ -83,6 +87,21 @@ CmdLinGradRef CmdLinGrad_index(CmdLinGradRef ref, uint index) {
     return CmdLinGradRef(ref.offset + index * CmdLinGrad_size);
 }
 
+struct CmdRadGrad {
+    uint index;
+    vec4 mat;
+    vec2 xlat;
+    vec2 c1;
+    float ra;
+    float roff;
+};
+
+#define CmdRadGrad_size 44
+
+CmdRadGradRef CmdRadGrad_index(CmdRadGradRef ref, uint index) {
+    return CmdRadGradRef(ref.offset + index * CmdRadGrad_size);
+}
+
 struct CmdImage {
     uint index;
     ivec2 offset;
@@ -131,11 +150,12 @@ CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
 #define Cmd_Alpha 4
 #define Cmd_Color 5
 #define Cmd_LinGrad 6
-#define Cmd_Image 7
-#define Cmd_BeginClip 8
-#define Cmd_EndClip 9
-#define Cmd_Jump 10
-#define Cmd_size 20
+#define Cmd_RadGrad 7
+#define Cmd_Image 8
+#define Cmd_BeginClip 9
+#define Cmd_EndClip 10
+#define Cmd_Jump 11
+#define Cmd_size 48
 
 CmdRef Cmd_index(CmdRef ref, uint index) {
     return CmdRef(ref.offset + index * Cmd_size);
@@ -213,6 +233,44 @@ void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) {
     write_mem(a, ix + 3, floatBitsToUint(s.line_c));
 }
 
+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
+    uint raw6 = read_mem(a, ix + 6);
+    uint raw7 = read_mem(a, ix + 7);
+    uint raw8 = read_mem(a, ix + 8);
+    uint raw9 = read_mem(a, ix + 9);
+    uint raw10 = read_mem(a, ix + 10);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
+    s.xlat = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
+    s.c1 = vec2(uintBitsToFloat(raw7), uintBitsToFloat(raw8));
+    s.ra = uintBitsToFloat(raw9);
+    s.roff = uintBitsToFloat(raw10);
+    return s;
+}
+
+void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s) {
+    uint ix = ref.offset >> 2;
+    write_mem(a, ix + 0, s.index);
+    write_mem(a, ix + 1, floatBitsToUint(s.mat.x));
+    write_mem(a, ix + 2, floatBitsToUint(s.mat.y));
+    write_mem(a, ix + 3, floatBitsToUint(s.mat.z));
+    write_mem(a, ix + 4, floatBitsToUint(s.mat.w));
+    write_mem(a, ix + 5, floatBitsToUint(s.xlat.x));
+    write_mem(a, ix + 6, floatBitsToUint(s.xlat.y));
+    write_mem(a, ix + 7, floatBitsToUint(s.c1.x));
+    write_mem(a, ix + 8, floatBitsToUint(s.c1.y));
+    write_mem(a, ix + 9, floatBitsToUint(s.ra));
+    write_mem(a, ix + 10, floatBitsToUint(s.roff));
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref) {
     uint ix = ref.offset >> 2;
     uint raw0 = read_mem(a, ix + 0);
@@ -293,6 +351,10 @@ CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) {
     return CmdLinGrad_read(a, CmdLinGradRef(ref.offset + 4));
 }
 
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) {
+    return CmdRadGrad_read(a, CmdRadGradRef(ref.offset + 4));
+}
+
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
     return CmdImage_read(a, CmdImageRef(ref.offset + 4));
 }
@@ -338,6 +400,11 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) {
     CmdLinGrad_write(a, CmdLinGradRef(ref.offset + 4), s);
 }
 
+void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) {
+    write_mem(a, ref.offset >> 2, Cmd_RadGrad);
+    CmdRadGrad_write(a, CmdRadGradRef(ref.offset + 4), s);
+}
+
 void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) {
     write_mem(a, ref.offset >> 2, Cmd_Image);
     CmdImage_write(a, CmdImageRef(ref.offset + 4), s);
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index 62c59c4..2f4b85e 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -62,6 +62,7 @@ const ANNOTATED_SIZE: usize = 40;
 // Tags for draw objects. See shader/drawtag.h for the authoritative source.
 const DRAWTAG_FILLCOLOR: u32 = 0x44;
 const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
+const DRAWTAG_FILLRADGRADIENT: u32 = 0x2dc;
 const DRAWTAG_BEGINCLIP: u32 = 0x05;
 const DRAWTAG_ENDCLIP: u32 = 0x25;
 
@@ -79,6 +80,16 @@ pub struct FillLinGradient {
     p1: [f32; 2],
 }
 
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillRadGradient {
+    index: u32,
+    p0: [f32; 2],
+    p1: [f32; 2],
+    r0: f32,
+    r1: f32,
+}
+
 #[allow(unused)]
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
@@ -123,6 +134,13 @@ impl Encoder {
         self.transform_stream.push(transform);
     }
 
+    // Swap the last two tags in the tag stream; used for transformed
+    // gradients.
+    pub fn swap_last_tags(&mut self) {
+        let len = self.tag_stream.len();
+        self.tag_stream.swap(len - 1, len - 2);
+    }
+
     // -1.0 means "fill"
     pub fn linewidth(&mut self, linewidth: f32) {
         self.tag_stream.push(0x40);
@@ -147,6 +165,16 @@ impl Encoder {
         self.drawdata_stream.extend(bytemuck::bytes_of(&element));
     }
 
+
+    /// Encode a fill radial gradient draw object.
+    ///
+    /// This should be encoded after a path.
+    pub fn fill_rad_gradient(&mut self, index: u32, p0: [f32; 2], p1: [f32; 2], r0: f32, r1: f32) {
+        self.drawtag_stream.push(DRAWTAG_FILLRADGRADIENT);
+        let element = FillRadGradient { index, p0, p1, r0, r1 };
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
+    }
+    
     /// Start a clip.
     pub fn begin_clip(&mut self, blend: Option<Blend>) {
         self.drawtag_stream.push(DRAWTAG_BEGINCLIP);
@@ -220,7 +248,7 @@ impl Encoder {
         alloc += n_drawobj * DRAW_BBOX_SIZE;
         let drawinfo_alloc = alloc;
         // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
-        const MAX_DRAWINFO_SIZE: usize = 16;
+        const MAX_DRAWINFO_SIZE: usize = 44;
         alloc += n_drawobj * MAX_DRAWINFO_SIZE;
 
         let config = Config {
diff --git a/piet-gpu/src/gradient.rs b/piet-gpu/src/gradient.rs
index 20982e9..e655908 100644
--- a/piet-gpu/src/gradient.rs
+++ b/piet-gpu/src/gradient.rs
@@ -18,15 +18,29 @@
 
 use std::collections::hash_map::{Entry, HashMap};
 
-use piet::{Color, FixedLinearGradient, GradientStop};
+use piet::kurbo::Point;
+use piet::{Color, FixedLinearGradient, GradientStop, FixedRadialGradient};
+
+/// Radial gradient compatible with COLRv1 spec
+#[derive(Debug, Clone)]
+pub struct Colrv1RadialGradient {
+    /// The center of the iner circle.
+    pub center0: Point,
+    /// The offset of the origin relative to the center.
+    pub center1: Point,
+    /// The radius of the inner circle.
+    pub radius0: f64,
+    /// The radius of the outer circle.
+    pub radius1: f64,
+    /// The stops.
+    pub stops: Vec<GradientStop>,
+}
 
 #[derive(Clone)]
 pub struct BakedGradient {
     ramp: Vec<u32>,
 }
 
-/// This is basically the same type as scene::FillLinGradient, so could
-/// potentially use that directly.
 #[derive(Clone)]
 pub struct LinearGradient {
     pub(crate) start: [f32; 2],
@@ -34,6 +48,15 @@ pub struct LinearGradient {
     pub(crate) ramp_id: u32,
 }
 
+#[derive(Clone)]
+pub struct RadialGradient {
+    pub(crate) start: [f32; 2],
+    pub(crate) end: [f32; 2],
+    pub(crate) r0: f32,
+    pub(crate) r1: f32,
+    pub(crate) ramp_id: u32,
+}
+
 #[derive(Default)]
 pub struct RampCache {
     ramps: Vec<GradientRamp>,
@@ -154,6 +177,28 @@ impl RampCache {
         }
     }
 
+    pub fn add_radial_gradient(&mut self, rad: &FixedRadialGradient) -> RadialGradient {
+        let ramp_id = self.add_ramp(&rad.stops);
+        RadialGradient {
+            ramp_id: ramp_id as u32,
+            start: crate::render_ctx::to_f32_2(rad.center + rad.origin_offset),
+            end: crate::render_ctx::to_f32_2(rad.center),
+            r0: 0.0,
+            r1: rad.radius as f32,
+        }
+    }
+
+    pub fn add_radial_gradient_colrv1(&mut self, rad: &Colrv1RadialGradient) -> RadialGradient {
+        let ramp_id = self.add_ramp(&rad.stops);
+        RadialGradient {
+            ramp_id: ramp_id as u32,
+            start: crate::render_ctx::to_f32_2(rad.center0),
+            end: crate::render_ctx::to_f32_2(rad.center1),
+            r0: rad.radius0 as f32,
+            r1: rad.radius1 as f32,
+        }
+    }
+
     /// Dump the contents of a gradient. This is for debugging.
     #[allow(unused)]
     pub(crate) fn dump_gradient(&self, lin: &LinearGradient) {
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 249735a..475d723 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -12,6 +12,7 @@ use std::convert::TryInto;
 
 pub use blend::{Blend, BlendMode, CompositionMode};
 pub use render_ctx::PietGpuRenderContext;
+pub use gradient::Colrv1RadialGradient;
 
 use piet::kurbo::Vec2;
 use piet::{ImageFormat, RenderContext};
@@ -21,7 +22,7 @@ use piet_gpu_hal::{
     ImageLayout, Pipeline, QueryPool, Session,
 };
 
-pub use pico_svg::PicoSvg;
+use pico_svg::PicoSvg;
 use stages::{ClipBinding, ElementBinding, ElementCode};
 
 use crate::stages::{ClipCode, Config, ElementStage};
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 024dd2b..dca03eb 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -13,7 +13,7 @@ use piet_gpu_hal::BufWrite;
 use piet_gpu_types::encoder::{Encode, Encoder};
 use piet_gpu_types::scene::Element;
 
-use crate::gradient::{LinearGradient, RampCache};
+use crate::gradient::{LinearGradient, RadialGradient, RampCache, Colrv1RadialGradient};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
 use crate::Blend;
@@ -50,6 +50,7 @@ pub struct PietGpuRenderContext {
 pub enum PietGpuBrush {
     Solid(u32),
     LinGradient(LinearGradient),
+    RadGradient(RadialGradient),
 }
 
 #[derive(Default)]
@@ -187,6 +188,10 @@ impl RenderContext for PietGpuRenderContext {
                 let lin = self.ramp_cache.add_linear_gradient(&lin);
                 Ok(PietGpuBrush::LinGradient(lin))
             }
+            FixedGradient::Radial(rad) => {
+                let rad = self.ramp_cache.add_radial_gradient(&rad);
+                Ok(PietGpuBrush::RadGradient(rad))
+            }
             _ => todo!("don't do radial gradients yet"),
         }
     }
@@ -338,6 +343,20 @@ impl PietGpuRenderContext {
         }
     }
 
+    pub fn radial_gradient_colrv1(&mut self, rad: &Colrv1RadialGradient) -> PietGpuBrush {
+        PietGpuBrush::RadGradient(self.ramp_cache.add_radial_gradient_colrv1(rad))
+    }
+
+    pub fn fill_transform(&mut self, shape: impl Shape, brush: &PietGpuBrush, transform: Affine) {
+        let path = shape.path_elements(TOLERANCE);
+        self.encode_linewidth(-1.0);
+        self.encode_path(path, true);
+        self.encode_transform(Transform::from_kurbo(transform));
+        self.new_encoder.swap_last_tags();
+        self.encode_brush(&brush);
+        self.encode_transform(Transform::from_kurbo(transform.inverse()));
+    }
+
     fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
         if is_fill {
             self.encode_path_inner(
@@ -420,6 +439,10 @@ impl PietGpuRenderContext {
                 self.new_encoder
                     .fill_lin_gradient(lin.ramp_id, lin.start, lin.end);
             }
+            PietGpuBrush::RadGradient(rad) => {
+                self.new_encoder
+                    .fill_rad_gradient(rad.ramp_id, rad.start, rad.end, rad.r0, rad.r1);
+            }
         }
     }
 }
diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs
index 350b9dd..cf5a50d 100644
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@@ -2,10 +2,10 @@
 
 use rand::{Rng, RngCore};
 
-use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext};
+use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext, Colrv1RadialGradient};
 use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
-    Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
+    Color, FixedGradient, FixedRadialGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
 };
 
 use crate::{PicoSvg, RenderContext, Vec2};
@@ -21,13 +21,18 @@ pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend)
     rc.restore().unwrap();
 }
 
-pub fn render_svg(rc: &mut impl RenderContext, svg: &PicoSvg) {
+pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
+    let xml_str = std::fs::read_to_string(filename).unwrap();
+    let start = std::time::Instant::now();
+    let svg = PicoSvg::load(&xml_str, scale).unwrap();
+    println!("parsing time: {:?}", start.elapsed());
+
     let start = std::time::Instant::now();
     svg.render(rc);
     println!("flattening and encoding time: {:?}", start.elapsed());
 }
 
-pub fn render_scene(rc: &mut impl RenderContext) {
+pub fn render_scene(rc: &mut PietGpuRenderContext) {
     const WIDTH: usize = 2048;
     const HEIGHT: usize = 1536;
     let mut rng = rand::thread_rng();
@@ -137,7 +142,7 @@ fn render_alpha_test(rc: &mut impl RenderContext) {
 }
 
 #[allow(unused)]
-fn render_gradient_test(rc: &mut impl RenderContext) {
+fn render_gradient_test(rc: &mut PietGpuRenderContext) {
     let stops = vec![
         GradientStop {
             color: Color::rgb8(0, 255, 0),
@@ -148,14 +153,18 @@ fn render_gradient_test(rc: &mut impl RenderContext) {
             pos: 1.0,
         },
     ];
-    let lin = FixedLinearGradient {
-        start: Point::new(0.0, 100.0),
-        end: Point::new(0.0, 300.0),
+    let rad = Colrv1RadialGradient {
+        center0: Point::new(200.0, 200.0),
+        center1: Point::new(250.0, 200.0),
+        radius0: 50.0,
+        radius1: 100.0,
         stops,
     };
-    let brush = FixedGradient::Linear(lin);
+    let brush = rc.radial_gradient_colrv1(&rad);
+    //let brush = FixedGradient::Radial(rad);
     //let brush = Color::rgb8(0, 128, 0);
-    rc.fill(Rect::new(100.0, 100.0, 300.0, 300.0), &brush);
+    let transform = Affine::new([1.0, 0.0, 0.0, 0.5, 0.0, 100.0]);
+    rc.fill_transform(Rect::new(100.0, 100.0, 300.0, 300.0), &brush, transform);
 }
 
 fn diamond(origin: Point) -> impl Shape {
diff --git a/piet-scene/Cargo.toml b/piet-scene/Cargo.toml
new file mode 100644
index 0000000..8706119
--- /dev/null
+++ b/piet-scene/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "piet-scene"
+version = "0.1.0"
+license = "MIT/Apache-2.0"
+edition = "2021"
+
+[dependencies]
+bytemuck = { version = "1.7.2", features = ["derive"] }
+smallvec = "1.8.0"
diff --git a/piet-scene/src/brush/color.rs b/piet-scene/src/brush/color.rs
new file mode 100644
index 0000000..d079210
--- /dev/null
+++ b/piet-scene/src/brush/color.rs
@@ -0,0 +1,62 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
+pub struct Color {
+    pub r: u8,
+    pub g: u8,
+    pub b: u8,
+    pub a: u8,
+}
+
+impl Color {
+    pub const fn rgb8(r: u8, g: u8, b: u8) -> Self {
+        Self { r, g, b, a: 255 }
+    }
+
+    pub const fn rgba8(r: u8, g: u8, b: u8, a: u8) -> Self {
+        Self { r, g, b, a }
+    }
+
+    pub fn rgb<C: Into<f64>>(r: C, g: C, b: C) -> Self {
+        Self::rgb8(
+            (r.into() / 255.0) as u8,
+            (g.into() / 255.0) as u8,
+            (b.into() / 255.0) as u8,
+        )
+    }
+
+    pub fn rgba<C: Into<f64>>(r: C, g: C, b: C, a: C) -> Self {
+        Self::rgba8(
+            (r.into() / 255.0) as u8,
+            (g.into() / 255.0) as u8,
+            (b.into() / 255.0) as u8,
+            (a.into() / 255.0) as u8,
+        )
+    }
+
+    pub fn pack(self) -> u32 {
+        (self.b as u32) << 24 | (self.g as u32) << 16 | (self.r as u32) << 8 | self.a as u32
+    }
+
+    pub fn to_premul_u32(self) -> u32 {
+        let a = self.a as f64 / 255.0;
+        let r = (self.r as f64 * a) as u32;
+        let g = (self.g as f64 * a) as u32;
+        let b = (self.b as f64 * a) as u32;
+        r | (g << 8) | (b << 16) | ((self.a as u32) << 24)
+    }
+}
diff --git a/piet-scene/src/brush/gradient.rs b/piet-scene/src/brush/gradient.rs
new file mode 100644
index 0000000..51d558e
--- /dev/null
+++ b/piet-scene/src/brush/gradient.rs
@@ -0,0 +1,87 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use super::color::Color;
+use crate::geometry::Point;
+use smallvec::SmallVec;
+use std::hash::{Hash, Hasher};
+
+#[derive(Copy, Clone, PartialOrd, Default, Debug)]
+pub struct Stop {
+    pub offset: f32,
+    pub color: Color,
+}
+
+impl Hash for Stop {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.offset.to_bits().hash(state);
+        self.color.hash(state);
+    }
+}
+
+// Override PartialEq to use to_bits for the offset to match with the Hash impl
+impl std::cmp::PartialEq for Stop {
+    fn eq(&self, other: &Self) -> bool {
+        self.offset.to_bits() == other.offset.to_bits() && self.color == other.color
+    }
+}
+
+impl std::cmp::Eq for Stop {}
+
+pub type StopVec = SmallVec<[Stop; 4]>;
+
+#[derive(Copy, Clone, PartialEq, Debug)]
+pub enum Extend {
+    Pad,
+    Repeat,
+    Reflect,
+}
+
+#[derive(Copy, Clone, PartialEq, Debug)]
+pub enum Space {
+    Object,
+    Global,
+}
+
+#[derive(Clone, Debug)]
+pub struct LinearGradient {
+    pub space: Space,
+    pub start: Point,
+    pub end: Point,
+    pub stops: StopVec,
+    pub extend: Extend,
+}
+
+#[derive(Clone, Debug)]
+pub struct RadialGradient {
+    pub space: Space,
+    pub center0: Point,
+    pub radius0: f32,
+    pub center1: Point,
+    pub radius1: f32,
+    pub stops: StopVec,
+    pub extend: Extend,
+}
+
+#[derive(Clone, Debug)]
+pub struct SweepGradient {
+    pub space: Space,
+    pub center: Point,
+    pub start_angle: f32,
+    pub end_angle: f32,
+    pub stops: StopVec,
+    pub extend: Extend,
+}
diff --git a/piet-scene/src/brush/image.rs b/piet-scene/src/brush/image.rs
new file mode 100644
index 0000000..07157e7
--- /dev/null
+++ b/piet-scene/src/brush/image.rs
@@ -0,0 +1,96 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use std::result::Result;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Arc;
+
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Format {
+    A8,
+    Rgba8,
+}
+
+impl Format {
+    pub fn data_size(self, width: u32, height: u32) -> Option<usize> {
+        (width as usize)
+            .checked_mul(height as usize)
+            .and_then(|size| {
+                size.checked_mul(match self {
+                    Self::A8 => 1,
+                    Self::Rgba8 => 4,
+                })
+            })
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Image(Arc<Inner>);
+
+#[derive(Clone, Debug)]
+struct Inner {
+    id: u64,
+    format: Format,
+    width: u32,
+    height: u32,
+    data: Vec<u8>,
+}
+
+impl Image {
+    pub fn new(
+        format: Format,
+        width: u32,
+        height: u32,
+        mut data: Vec<u8>,
+    ) -> Result<Self, DataSizeError> {
+        let data_size = format.data_size(width, height).ok_or(DataSizeError)?;
+        if data.len() < data_size {
+            return Err(DataSizeError);
+        }
+        data.truncate(data_size);
+        static ID: AtomicU64 = AtomicU64::new(1);
+        Ok(Self(Arc::new(Inner {
+            id: ID.fetch_add(1, Ordering::Relaxed),
+            format,
+            width,
+            height,
+            data,
+        })))
+    }
+
+    pub fn id(&self) -> u64 {
+        self.0.id
+    }
+
+    pub fn format(&self) -> Format {
+        self.0.format
+    }
+
+    pub fn width(&self) -> u32 {
+        self.0.width
+    }
+
+    pub fn height(&self) -> u32 {
+        self.0.height
+    }
+
+    pub fn data(&self) -> &[u8] {
+        &self.0.data
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct DataSizeError;
diff --git a/piet-scene/src/brush/mod.rs b/piet-scene/src/brush/mod.rs
new file mode 100644
index 0000000..9cde1fb
--- /dev/null
+++ b/piet-scene/src/brush/mod.rs
@@ -0,0 +1,35 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+mod color;
+mod gradient;
+mod image;
+
+pub use color::Color;
+pub use gradient::*;
+pub use image::*;
+
+use crate::resource::PersistentBrush;
+
+#[derive(Clone, Debug)]
+pub enum Brush {
+    Solid(Color),
+    LinearGradient(LinearGradient),
+    RadialGradient(RadialGradient),
+    SweepGradient(SweepGradient),
+    Image(Image),
+    Persistent(PersistentBrush),
+}
diff --git a/piet-scene/src/geometry.rs b/piet-scene/src/geometry.rs
new file mode 100644
index 0000000..a40cc40
--- /dev/null
+++ b/piet-scene/src/geometry.rs
@@ -0,0 +1,189 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use core::borrow::Borrow;
+use core::hash::{Hash, Hasher};
+
+/// Two dimensional point.
+#[derive(Copy, Clone, PartialEq, PartialOrd, Default, Debug)]
+#[repr(C)]
+pub struct Point {
+    pub x: f32,
+    pub y: f32,
+}
+
+impl Hash for Point {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.x.to_bits().hash(state);
+        self.y.to_bits().hash(state);
+    }
+}
+
+impl Point {
+    pub const fn new(x: f32, y: f32) -> Self {
+        Self { x, y }
+    }
+
+    pub fn transform(&self, affine: &Affine) -> Self {
+        Self {
+            x: self.x * affine.xx + self.y * affine.yx + affine.dx,
+            y: self.y * affine.yy + self.y * affine.xy + affine.dy,
+        }
+    }
+}
+
+impl From<[f32; 2]> for Point {
+    fn from(value: [f32; 2]) -> Self {
+        Self::new(value[0], value[1])
+    }
+}
+
+impl From<(f32, f32)> for Point {
+    fn from(value: (f32, f32)) -> Self {
+        Self::new(value.0, value.1)
+    }
+}
+
+/// Affine transformation matrix.
+#[derive(Copy, Clone, Debug)]
+#[repr(C)]
+pub struct Affine {
+    pub xx: f32,
+    pub yx: f32,
+    pub xy: f32,
+    pub yy: f32,
+    pub dx: f32,
+    pub dy: f32,
+}
+
+impl Affine {
+    pub const fn new(elements: &[f32; 6]) -> Self {
+        Self {
+            xx: elements[0],
+            yx: elements[1],
+            xy: elements[2],
+            yy: elements[3],
+            dx: elements[4],
+            dy: elements[5],
+        }
+    }
+
+    /// Creates a new affine transform representing the specified scale along the
+    /// x and y axes.
+    pub fn scale(x: f32, y: f32) -> Self {
+        Self::new(&[x, 0., 0., y, 0., 0.])
+    }
+
+    /// Creates a new affine transform representing the specified translation.
+    pub fn translate(x: f32, y: f32) -> Self {
+        Self::new(&[1., 0., 0., 1., x, y])
+    }
+
+    /// Creates a new affine transform representing a counter-clockwise
+    /// rotation for the specified angle in radians.
+    pub fn rotate(th: f32) -> Self {
+        let (s, c) = th.sin_cos();
+        Self::new(&[c, s, -s, c, 0., 0.])
+    }
+
+    /// Creates a new skew transform
+    pub fn skew(x: f32, y: f32) -> Self {
+        Self::new(&[1., x.tan(), y.tan(), 1., 0., 0.])
+    }
+
+    pub fn around_center(&self, x: f32, y: f32) -> Self {
+        Self::translate(x, y) * *self * Self::translate(-x, -y)
+    }
+
+    /// Transforms the specified point.
+    pub fn transform_point(&self, point: Point) -> Point {
+        Point {
+            x: point.x * self.xx + point.y * self.yx + self.dx,
+            y: point.y * self.yy + point.y * self.xy + self.dy,
+        }
+    }
+}
+
+impl std::ops::Mul for Affine {
+    type Output = Self;
+    fn mul(self, other: Self) -> Self {
+        Self::new(&[
+            self.xx * other.xx + self.xy * other.yx,
+            self.yx * other.xx + self.yy * other.yx,
+            self.xx * other.xy + self.xy * other.yy,
+            self.yx * other.xy + self.yy * other.yy,
+            self.xx * other.dx + self.xy * other.dy + self.dx,
+            self.yx * other.dx + self.yy * other.dy + self.dy,
+        ])
+    }
+}
+
+/// Axis-aligned rectangle represented as minimum and maximum points.
+#[derive(Copy, Clone, Default, Debug)]
+#[repr(C)]
+pub struct Rect {
+    pub min: Point,
+    pub max: Point,
+}
+
+impl Rect {
+    /// Creates a new rectangle that encloses the specified collection of
+    /// points.
+    pub fn from_points<I>(points: I) -> Self
+    where
+        I: IntoIterator,
+        I::Item: Borrow<Point>,
+    {
+        let mut rect = Self {
+            min: Point::new(f32::MAX, f32::MAX),
+            max: Point::new(f32::MIN, f32::MIN),
+        };
+        let mut count = 0;
+        for point in points {
+            rect.add(*point.borrow());
+            count += 1;
+        }
+        if count != 0 {
+            rect
+        } else {
+            Self::default()
+        }
+    }
+
+    /// Returns the width of the rectangle.
+    pub fn width(&self) -> f32 {
+        self.max.x - self.min.x
+    }
+
+    /// Returns the height of the rectangle.
+    pub fn height(&self) -> f32 {
+        self.max.y - self.min.y
+    }
+
+    /// Extends the rectangle to include the specified point.
+    pub fn add(&mut self, point: Point) {
+        self.min.x = self.min.x.min(point.x);
+        self.min.y = self.min.y.min(point.y);
+        self.max.x = self.max.x.max(point.x);
+        self.max.y = self.max.y.max(point.y);
+    }
+
+    /// Returns a new rectangle that encloses the minimum and maximum points
+    /// of this rectangle after applying the specified transform to each.
+    pub fn transform(&self, affine: &Affine) -> Self {
+        Self::from_points([self.min.transform(affine), self.max.transform(affine)])
+    }
+}
diff --git a/piet-scene/src/lib.rs b/piet-scene/src/lib.rs
new file mode 100644
index 0000000..6b23f6d
--- /dev/null
+++ b/piet-scene/src/lib.rs
@@ -0,0 +1,21 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+pub mod brush;
+pub mod geometry;
+pub mod path;
+pub mod resource;
+pub mod scene;
diff --git a/piet-scene/src/main.rs b/piet-scene/src/main.rs
new file mode 100644
index 0000000..8fd361a
--- /dev/null
+++ b/piet-scene/src/main.rs
@@ -0,0 +1,30 @@
+use piet_scene::geometry::*;
+use piet_scene::path::*;
+use piet_scene::scene::*;
+use piet_scene::{geometry::*, path::*, resource::ResourceContext, scene::*};
+
+fn main() {
+    let mut scene = Scene::default();
+    let mut rcx = ResourceContext::new();
+    let mut sb = build_scene(&mut scene, &mut rcx);
+
+    sb.push_layer(Blend::default(), Rect::default().elements());
+
+    let mut path = Path::new();
+    let mut b = PathBuilder::new(&mut path);
+    b.move_to(100., 100.);
+    b.line_to(200., 200.);
+    b.close_path();
+    b.move_to(50., 50.);
+    b.line_to(600., 150.);
+    b.move_to(4., 2.);
+    b.quad_to(8., 8., 9., 9.);
+    b.close_path();
+    println!("{:?}", path);
+    for el in path.elements() {
+        println!("{:?}", el);
+    }
+    //sb.push_layer(path.elements(), BlendMode::default());
+
+    sb.push_layer(Blend::default(), [Element::MoveTo((0., 0.).into())]);
+}
diff --git a/piet-scene/src/path.rs b/piet-scene/src/path.rs
new file mode 100644
index 0000000..ffbce3e
--- /dev/null
+++ b/piet-scene/src/path.rs
@@ -0,0 +1,311 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use super::geometry::{Point, Rect};
+
+/// Action of a path element.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Verb {
+    MoveTo,
+    LineTo,
+    QuadTo,
+    CurveTo,
+    Close,
+}
+
+/// Element of a path represented by a verb and its associated points.
+#[derive(Copy, Clone, PartialEq, Debug)]
+pub enum Element {
+    MoveTo(Point),
+    LineTo(Point),
+    QuadTo(Point, Point),
+    CurveTo(Point, Point, Point),
+    Close,
+}
+
+impl Element {
+    /// Returns the verb that describes the action of the path element.
+    pub fn verb(&self) -> Verb {
+        match self {
+            Self::MoveTo(..) => Verb::MoveTo,
+            Self::LineTo(..) => Verb::LineTo,
+            Self::QuadTo(..) => Verb::QuadTo,
+            Self::CurveTo(..) => Verb::CurveTo,
+            Self::Close => Verb::Close,
+        }
+    }
+}
+
+/// Encoded collection of path elements.
+#[derive(Clone, Default, Debug)]
+pub struct Path {
+    tag_stream: Vec<u8>,
+    pathseg_stream: Vec<u8>,
+    n_path: u32,
+    n_pathseg: u32,
+}
+
+impl Path {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn elements(&self) -> Elements {
+        Elements::new(&self)
+    }
+}
+
+#[derive(Clone)]
+pub struct Elements<'a> {
+    tag_stream: &'a [u8],
+    points: &'a [[f32; 2]],
+    tag_ix: usize,
+    point_ix: usize,
+    next_element: Option<Element>,
+    close: bool,
+}
+
+impl<'a> Elements<'a> {
+    fn new(path: &'a Path) -> Self {
+        let points: &'a [[f32; 2]] = bytemuck::cast_slice(&path.pathseg_stream);
+        let (point_ix, next_element) = match points.get(0) {
+            Some(&point) => (1, Some(Element::MoveTo(point.into()))),
+            None => (0, None),
+        };
+        Self {
+            tag_stream: &path.tag_stream,
+            points,
+            tag_ix: 0,
+            point_ix,
+            next_element,
+            close: false,
+        }
+    }
+}
+
+impl<'a> Iterator for Elements<'a> {
+    type Item = Element;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // println!("n_points: {}", self.points.len());
+        // println!("tag_ix: {}, point_ix: {}, el: {:?}, close: {}", self.tag_ix, self.point_ix, self.next_element, self.close);
+        if self.close {
+            self.close = false;
+            return Some(Element::Close);
+        }
+        if let Some(next_el) = self.next_element.take() {
+            return Some(next_el);
+        }
+        let tag = *self.tag_stream.get(self.tag_ix)?;
+        self.tag_ix += 1;
+        let end = tag & 4 != 0;
+        let el = match tag & 3 {
+            1 => {
+                let p0 = *self.points.get(self.point_ix)?;
+                self.point_ix += 1;
+                Element::LineTo(p0.into())
+            }
+            2 => {
+                let p0 = *self.points.get(self.point_ix)?;
+                let p1 = *self.points.get(self.point_ix + 1)?;
+                self.point_ix += 2;
+                Element::QuadTo(p0.into(), p1.into())
+            }
+            3 => {
+                let p0 = *self.points.get(self.point_ix)?;
+                let p1 = *self.points.get(self.point_ix + 1)?;
+                let p2 = *self.points.get(self.point_ix + 2)?;
+                self.point_ix += 3;
+                Element::CurveTo(p0.into(), p1.into(), p2.into())
+            }
+            _ => return None,
+        };
+        if end {
+            // println!("END!");
+            if let Some(&p0) = self.points.get(self.point_ix) {
+                self.point_ix += 1;
+                self.next_element = Some(Element::MoveTo(p0.into()));
+            }
+            self.close = tag & 0x80 != 0;
+        }
+        Some(el)
+    }
+}
+
+pub struct PathBuilder<'a> {
+    tag_stream: &'a mut Vec<u8>,
+    // If we're never going to use the i16 encoding, it might be
+    // slightly faster to store this as Vec<u32>, we'd get aligned
+    // stores on ARM etc.
+    pathseg_stream: &'a mut Vec<u8>,
+    first_pt: [f32; 2],
+    state: State,
+    n_pathseg: u32,
+}
+
+#[derive(PartialEq)]
+enum State {
+    Start,
+    MoveTo,
+    NonemptySubpath,
+}
+
+impl<'a> PathBuilder<'a> {
+    pub fn new(path: &'a mut Path) -> Self {
+        Self {
+            tag_stream: &mut path.tag_stream,
+            pathseg_stream: &mut path.pathseg_stream,
+            first_pt: [0.0, 0.0],
+            state: State::Start,
+            n_pathseg: 0,
+        }
+    }
+
+    fn new_inner(tags: &'a mut Vec<u8>, pathsegs: &'a mut Vec<u8>) -> PathBuilder<'a> {
+        PathBuilder {
+            tag_stream: tags,
+            pathseg_stream: pathsegs,
+            first_pt: [0.0, 0.0],
+            state: State::Start,
+            n_pathseg: 0,
+        }
+    }
+
+    pub fn move_to(&mut self, x: f32, y: f32) {
+        let buf = [x, y];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.first_pt = buf;
+        if self.state == State::MoveTo {
+            let new_len = self.pathseg_stream.len() - 8;
+            self.pathseg_stream.truncate(new_len);
+        }
+        if self.state == State::NonemptySubpath {
+            if let Some(tag) = self.tag_stream.last_mut() {
+                *tag |= 4;
+            }
+        }
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.state = State::MoveTo;
+    }
+
+    pub fn line_to(&mut self, x: f32, y: f32) {
+        if self.state == State::Start {
+            // should warn or error
+            return;
+        }
+        let buf = [x, y];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.tag_stream.push(9);
+        self.state = State::NonemptySubpath;
+        self.n_pathseg += 1;
+    }
+
+    pub fn quad_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32) {
+        if self.state == State::Start {
+            return;
+        }
+        let buf = [x1, y1, x2, y2];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.tag_stream.push(10);
+        self.state = State::NonemptySubpath;
+        self.n_pathseg += 1;
+    }
+
+    pub fn cubic_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32, x3: f32, y3: f32) {
+        if self.state == State::Start {
+            return;
+        }
+        let buf = [x1, y1, x2, y2, x3, y3];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.tag_stream.push(11);
+        self.state = State::NonemptySubpath;
+        self.n_pathseg += 1;
+    }
+
+    pub fn close_path(&mut self) {
+        match self.state {
+            State::Start => return,
+            State::MoveTo => {
+                let new_len = self.pathseg_stream.len() - 8;
+                self.pathseg_stream.truncate(new_len);
+                self.state = State::Start;
+                return;
+            }
+            State::NonemptySubpath => (),
+        }
+        let len = self.pathseg_stream.len();
+        if len < 8 {
+            // can't happen
+            return;
+        }
+        let first_bytes = bytemuck::bytes_of(&self.first_pt);
+        if &self.pathseg_stream[len - 8..len] != first_bytes {
+            self.pathseg_stream.extend_from_slice(first_bytes);
+            self.tag_stream.push(0x80 | 13);
+            self.n_pathseg += 1;
+        } else {
+            if let Some(tag) = self.tag_stream.last_mut() {
+                *tag |= 0x80 | 4;
+            }
+        }
+        self.state = State::Start;
+    }
+
+    fn finish(&mut self) {
+        if self.state == State::MoveTo {
+            let new_len = self.pathseg_stream.len() - 8;
+            self.pathseg_stream.truncate(new_len);
+        }
+        if let Some(tag) = self.tag_stream.last_mut() {
+            *tag |= 4;
+        }
+    }
+
+    /// Finish encoding a path.
+    ///
+    /// Encode this after encoding path segments.
+    pub fn path(&mut self) {
+        self.finish();
+        // maybe don't encode if path is empty? might throw off sync though
+        self.tag_stream.push(0x10);
+    }
+
+    /// Get the number of path segments.
+    ///
+    /// This is the number of path segments that will be written by the
+    /// path stage; use this for allocating the output buffer.
+    ///
+    /// Also note: it takes `self` for lifetime reasons.
+    pub fn n_pathseg(self) -> u32 {
+        self.n_pathseg
+    }
+}
+
+impl Rect {
+    pub fn elements(&self) -> impl Iterator<Item = Element> + Clone {
+        let elements = [
+            Element::MoveTo((self.min.x, self.min.y).into()),
+            Element::LineTo((self.max.x, self.min.y).into()),
+            Element::LineTo((self.max.x, self.max.y).into()),
+            Element::LineTo((self.min.x, self.max.y).into()),
+            Element::Close,
+        ];
+        (0..5).map(move |i| elements[i])
+    }
+}
diff --git a/piet-scene/src/resource/mod.rs b/piet-scene/src/resource/mod.rs
new file mode 100644
index 0000000..36e6419
--- /dev/null
+++ b/piet-scene/src/resource/mod.rs
@@ -0,0 +1,37 @@
+mod ramp_cache;
+
+use crate::brush::{Brush, Stop};
+use ramp_cache::RampCache;
+
+/// Context for caching resources across rendering operations.
+#[derive(Default)]
+pub struct ResourceContext {
+    ramp_cache: RampCache,
+}
+
+impl ResourceContext {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn advance(&mut self) {
+        self.ramp_cache.advance();
+    }
+
+    pub fn add_ramp(&mut self, stops: &[Stop]) -> u32 {
+        self.ramp_cache.add(stops)
+    }
+
+    pub fn create_brush(&mut self, brush: &Brush) -> PersistentBrush {
+        PersistentBrush { kind: 0, id: 0 }
+    }
+
+    pub fn destroy_brush(&mut self, brush: PersistentBrush) {}
+}
+
+/// Handle for a brush that is managed by the resource context.
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+pub struct PersistentBrush {
+    kind: u8,
+    id: u64,
+}
diff --git a/piet-scene/src/resource/ramp_cache.rs b/piet-scene/src/resource/ramp_cache.rs
new file mode 100644
index 0000000..0c509e4
--- /dev/null
+++ b/piet-scene/src/resource/ramp_cache.rs
@@ -0,0 +1,138 @@
+use crate::brush::{Color, Stop, StopVec};
+use std::collections::HashMap;
+
+const N_SAMPLES: usize = 512;
+const RETAINED_COUNT: usize = 64;
+
+#[derive(Default)]
+pub struct RampCache {
+    epoch: u64,
+    map: HashMap<StopVec, (u32, u64)>,
+    data: Vec<u32>,
+}
+
+impl RampCache {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn advance(&mut self) {
+        self.epoch += 1;
+        if self.map.len() > RETAINED_COUNT {
+            self.map
+                .retain(|_key, value| value.0 < RETAINED_COUNT as u32);
+            self.data.truncate(RETAINED_COUNT * N_SAMPLES);
+        }
+    }
+
+    pub fn clear(&mut self) {
+        self.epoch = 0;
+        self.map.clear();
+        self.data.clear();
+    }
+
+    pub fn add(&mut self, stops: &[Stop]) -> u32 {
+        if let Some(entry) = self.map.get_mut(stops) {
+            entry.1 = self.epoch;
+            entry.0
+        } else if self.map.len() < RETAINED_COUNT {
+            let id = (self.data.len() / N_SAMPLES) as u32;
+            self.data.extend(make_ramp(stops));
+            self.map.insert(stops.into(), (id, self.epoch));
+            id
+        } else {
+            let mut reuse = None;
+            for (stops, (id, epoch)) in &self.map {
+                if *epoch + 2 < self.epoch {
+                    reuse = Some((stops.to_owned(), *id));
+                    break;
+                }
+            }
+            if let Some((old_stops, id)) = reuse {
+                self.map.remove(&old_stops);
+                let start = id as usize * N_SAMPLES;
+                for (dst, src) in self.data[start..start + N_SAMPLES]
+                    .iter_mut()
+                    .zip(make_ramp(stops))
+                {
+                    *dst = src;
+                }
+                self.map.insert(stops.into(), (id, self.epoch));
+                id
+            } else {
+                let id = (self.data.len() / N_SAMPLES) as u32;
+                self.data.extend(make_ramp(stops));
+                self.map.insert(stops.into(), (id, self.epoch));
+                id
+            }
+        }
+    }
+
+    pub fn data(&self) -> &[u32] {
+        &self.data
+    }
+}
+
+fn make_ramp<'a>(stops: &'a [Stop]) -> impl Iterator<Item = u32> + 'a {
+    let mut last_u = 0.0;
+    let mut last_c = ColorF64::from_color(stops[0].color);
+    let mut this_u = last_u;
+    let mut this_c = last_c;
+    let mut j = 0;
+    (0..N_SAMPLES).map(move |i| {
+        let u = (i as f64) / (N_SAMPLES - 1) as f64;
+        while u > this_u {
+            last_u = this_u;
+            last_c = this_c;
+            if let Some(s) = stops.get(j + 1) {
+                this_u = s.offset as f64;
+                this_c = ColorF64::from_color(s.color);
+                j += 1;
+            } else {
+                break;
+            }
+        }
+        let du = this_u - last_u;
+        let c = if du < 1e-9 {
+            this_c
+        } else {
+            last_c.lerp(&this_c, (u - last_u) / du)
+        };
+        c.to_premul_u32()
+    })
+}
+
+#[derive(Copy, Clone)]
+struct ColorF64([f64; 4]);
+
+impl ColorF64 {
+    fn from_color(color: Color) -> Self {
+        Self([
+            color.r as f64 / 255.0,
+            color.g as f64 / 255.0,
+            color.b as f64 / 255.0,
+            color.a as f64 / 255.0,
+        ])
+    }
+
+    fn lerp(&self, other: &Self, a: f64) -> Self {
+        fn l(x: f64, y: f64, a: f64) -> f64 {
+            x + (y - x) * a
+        }
+        Self([
+            l(self.0[0], other.0[0], a),
+            l(self.0[1], other.0[1], a),
+            l(self.0[2], other.0[2], a),
+            l(self.0[3], other.0[3], a),
+        ])
+    }
+
+    fn to_premul_u32(&self) -> u32 {
+        let a = self.0[3].min(1.0).max(0.0);
+        let r = ((self.0[0] * a).min(1.0).max(0.0) / 255.0) as u32;
+        let g = ((self.0[1] * a).min(1.0).max(0.0) / 255.0) as u32;
+        let b = ((self.0[2] * a).min(1.0).max(0.0) / 255.0) as u32;
+        let a = (a / 255.0) as u32;
+        r | (g << 8) | (b << 16) | (a << 24)
+    }
+}
diff --git a/piet-scene/src/scene/blend.rs b/piet-scene/src/scene/blend.rs
new file mode 100644
index 0000000..7edc6cd
--- /dev/null
+++ b/piet-scene/src/scene/blend.rs
@@ -0,0 +1,102 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+/// Defines the color mixing function for a blend operation.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+#[repr(C)]
+pub enum Mix {
+    Normal = 0,
+    Multiply = 1,
+    Screen = 2,
+    Overlay = 3,
+    Darken = 4,
+    Lighten = 5,
+    ColorDodge = 6,
+    ColorBurn = 7,
+    HardLight = 8,
+    SoftLight = 9,
+    Difference = 10,
+    Exclusion = 11,
+    Hue = 12,
+    Saturation = 13,
+    Color = 14,
+    Luminosity = 15,
+}
+
+/// Defines the layer composition function for a blend operation.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+#[repr(C)]
+pub enum Compose {
+    Clear = 0,
+    Copy = 1,
+    Dest = 2,
+    SrcOver = 3,
+    DestOver = 4,
+    SrcIn = 5,
+    DestIn = 6,
+    SrcOut = 7,
+    DestOut = 8,
+    SrcAtop = 9,
+    DestAtop = 10,
+    Xor = 11,
+    Plus = 12,
+    PlusDarker = 13,
+    PlusLighter = 14,
+}
+
+/// Blend mode consisting of mixing and composition functions.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub struct Blend {
+    pub mix: Mix,
+    pub compose: Compose,
+}
+
+impl Blend {
+    pub fn new(mix: Mix, compose: Compose) -> Self {
+        Self { mix, compose }
+    }
+
+    pub fn pack(&self) -> u32 {
+        (self.mix as u32) << 8 | self.compose as u32
+    }
+}
+
+impl Default for Blend {
+    fn default() -> Self {
+        Self {
+            mix: Mix::Normal,
+            compose: Compose::SrcOver,
+        }
+    }
+}
+
+impl From<Mix> for Blend {
+    fn from(mix: Mix) -> Self {
+        Self {
+            mix,
+            compose: Compose::SrcOver,
+        }
+    }
+}
+
+impl From<Compose> for Blend {
+    fn from(compose: Compose) -> Self {
+        Self {
+            mix: Mix::Normal,
+            compose,
+        }
+    }
+}
diff --git a/piet-scene/src/scene/builder.rs b/piet-scene/src/scene/builder.rs
new file mode 100644
index 0000000..59e0a15
--- /dev/null
+++ b/piet-scene/src/scene/builder.rs
@@ -0,0 +1,433 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use super::style::{Fill, Stroke};
+use super::{Affine, Blend, Element, Fragment, FragmentResources, ResourcePatch, Scene, SceneData};
+use crate::brush::*;
+use crate::resource::ResourceContext;
+use bytemuck::{Pod, Zeroable};
+use core::borrow::Borrow;
+
+/// Creates a new builder for constructing a scene.
+pub fn build_scene<'a>(scene: &'a mut Scene, resources: &'a mut ResourceContext) -> Builder<'a> {
+    Builder::new(&mut scene.data, ResourceData::Scene(resources))
+}
+
+/// Creates a new builder for construction a scene fragment.
+pub fn build_fragment<'a>(fragment: &'a mut Fragment) -> Builder<'a> {
+    Builder::new(
+        &mut fragment.data,
+        ResourceData::Fragment(&mut fragment.resources),
+    )
+}
+
+/// Builder for constructing a scene or scene fragment.
+pub struct Builder<'a> {
+    scene: &'a mut SceneData,
+    resources: ResourceData<'a>,
+}
+
+impl<'a> Builder<'a> {
+    /// Creates a new builder for constructing a scene.
+    fn new(scene: &'a mut SceneData, resources: ResourceData<'a>) -> Self {
+        Self { scene, resources }
+    }
+
+    /// Pushes a transform matrix onto the stack.
+    pub fn push_transform(&mut self, transform: &Affine) {}
+
+    /// Pops the current transform matrix.
+    pub fn pop_transform(&mut self) {}
+
+    /// Pushes a new layer bound by the specifed shape and composed with
+    /// previous layers using the specified blend mode.
+    pub fn push_layer<'s, E>(&mut self, blend: Blend, elements: E)
+    where
+        E: IntoIterator,
+        E::IntoIter: Clone,
+        E::Item: Borrow<Element>,
+    {
+        let elements = elements.into_iter();
+        self.encode_path(elements, true);
+    }
+
+    /// Pops the current layer.
+    pub fn pop_layer(&mut self) {}
+
+    /// Fills a shape using the specified style and brush.
+    pub fn fill<'s, E>(
+        &mut self,
+        style: Fill,
+        brush: &Brush,
+        brush_transform: Option<Affine>,
+        elements: E,
+    ) where
+        E: IntoIterator,
+        E::IntoIter: Clone,
+        E::Item: Borrow<Element>,
+    {
+        let elements = elements.into_iter();
+        self.encode_path(elements, true);
+    }
+
+    /// Strokes a shape using the specified style and brush.
+    pub fn stroke<'s, D, E>(
+        &mut self,
+        style: &Stroke<D>,
+        brush: &Brush,
+        brush_transform: Option<Affine>,
+        elements: E,
+    ) where
+        D: Borrow<[f32]>,
+        E: IntoIterator,
+        E::IntoIter: Clone,
+        E::Item: Borrow<Element>,
+    {
+        let elements = elements.into_iter();
+        self.encode_path(elements, false);
+    }
+
+    /// Appends a fragment to the scene.
+    pub fn append(&mut self, fragment: &Fragment) {
+        let drawdata_base = self.scene.drawdata_stream.len();
+        self.scene.append(&fragment.data);
+        match &mut self.resources {
+            ResourceData::Scene(res) => {
+                for patch in &fragment.resources.patches {
+                    match patch {
+                        ResourcePatch::Ramp {
+                            drawdata_offset,
+                            stops,
+                        } => {
+                            let stops = &fragment.resources.stops[stops.clone()];
+                            let ramp_id = res.add_ramp(stops);
+                            let patch_base = *drawdata_offset + drawdata_base;
+                            (&mut self.scene.drawdata_stream[patch_base..patch_base + 4])
+                                .copy_from_slice(bytemuck::bytes_of(&ramp_id));
+                        }
+                    }
+                }
+            }
+            ResourceData::Fragment(res) => {
+                let stops_base = res.stops.len();
+                res.stops.extend_from_slice(&fragment.resources.stops);
+                res.patches.extend(fragment.resources.patches.iter().map(
+                    |pending| match pending {
+                        ResourcePatch::Ramp {
+                            drawdata_offset,
+                            stops,
+                        } => ResourcePatch::Ramp {
+                            drawdata_offset: drawdata_offset + drawdata_base,
+                            stops: stops.start + stops_base..stops.end + stops_base,
+                        },
+                    },
+                ));
+            }
+        }
+    }
+
+    /// Completes construction and finalizes the underlying scene.
+    pub fn finish(self) {}
+
+    fn encode_path<E>(&mut self, elements: E, is_fill: bool)
+    where
+        E: Iterator,
+        E::Item: Borrow<Element>,
+    {
+        if is_fill {
+            self.encode_path_inner(
+                elements
+                    .map(|el| *el.borrow())
+                    .flat_map(|el| {
+                        match el {
+                            Element::MoveTo(..) => Some(Element::Close),
+                            _ => None,
+                        }
+                        .into_iter()
+                        .chain(Some(el))
+                    })
+                    .chain(Some(Element::Close)),
+            )
+        } else {
+            self.encode_path_inner(elements.map(|el| *el.borrow()))
+        }
+    }
+
+    fn encode_path_inner(&mut self, elements: impl Iterator<Item = Element>) {
+        let mut b = PathBuilder::new(&mut self.scene.tag_stream, &mut self.scene.pathseg_stream);
+        for el in elements {
+            match el {
+                Element::MoveTo(p0) => b.move_to(p0.x, p0.y),
+                Element::LineTo(p0) => b.line_to(p0.x, p0.y),
+                Element::QuadTo(p0, p1) => b.quad_to(p0.x, p0.y, p1.x, p1.y),
+                Element::CurveTo(p0, p1, p2) => b.cubic_to(p0.x, p0.y, p1.x, p1.y, p2.x, p2.y),
+                Element::Close => b.close_path(),
+            }
+        }
+        b.path();
+        let n_pathseg = b.n_pathseg();
+        self.scene.n_path += 1;
+        self.scene.n_pathseg += n_pathseg;
+    }
+
+    fn encode_brush(&mut self, brush: &Brush) {
+        match brush {
+            Brush::Solid(color) => {
+                self.scene.drawtag_stream.push(DRAWTAG_FILLCOLOR);
+                let rgba_color = color.to_premul_u32();
+                self.scene
+                    .drawdata_stream
+                    .extend(bytemuck::bytes_of(&FillColor { rgba_color }));
+            }
+            Brush::LinearGradient(gradient) => {
+                let index = self.add_ramp(&gradient.stops);
+                self.scene.drawtag_stream.push(DRAWTAG_FILLLINGRADIENT);
+                self.scene
+                    .drawdata_stream
+                    .extend(bytemuck::bytes_of(&FillLinGradient {
+                        index,
+                        p0: [gradient.start.x, gradient.start.y],
+                        p1: [gradient.end.x, gradient.end.y],
+                    }));
+            }
+            Brush::RadialGradient(gradient) => {
+                let index = self.add_ramp(&gradient.stops);
+                self.scene.drawtag_stream.push(DRAWTAG_FILLRADGRADIENT);
+                self.scene
+                    .drawdata_stream
+                    .extend(bytemuck::bytes_of(&FillRadGradient {
+                        index,
+                        p0: [gradient.center0.x, gradient.center0.y],
+                        p1: [gradient.center1.x, gradient.center1.y],
+                        r0: gradient.radius0,
+                        r1: gradient.radius1,
+                    }));
+            }
+            Brush::SweepGradient(_gradient) => todo!("sweep gradients aren't done yet!"),
+            Brush::Image(_image) => todo!("images aren't done yet!"),
+            Brush::Persistent(_) => todo!("persistent brushes aren't done yet!"),
+        }
+    }
+
+    fn add_ramp(&mut self, stops: &[Stop]) -> u32 {
+        match &mut self.resources {
+            ResourceData::Scene(res) => res.add_ramp(stops),
+            ResourceData::Fragment(res) => {
+                let stops_start = res.stops.len();
+                res.stops.extend_from_slice(stops);
+                let id = res.patches.len() as u32;
+                res.patches.push(ResourcePatch::Ramp {
+                    drawdata_offset: self.scene.drawdata_stream.len(),
+                    stops: stops_start..stops_start + stops.len(),
+                });
+                id
+            }
+        }
+    }
+}
+
+enum ResourceData<'a> {
+    Fragment(&'a mut FragmentResources),
+    Scene(&'a mut ResourceContext),
+}
+
+// Tags for draw objects. See shader/drawtag.h for the authoritative source.
+const DRAWTAG_FILLCOLOR: u32 = 0x44;
+const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
+const DRAWTAG_FILLRADGRADIENT: u32 = 0x2dc;
+const DRAWTAG_BEGINCLIP: u32 = 0x05;
+const DRAWTAG_ENDCLIP: u32 = 0x25;
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillColor {
+    rgba_color: u32,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillLinGradient {
+    index: u32,
+    p0: [f32; 2],
+    p1: [f32; 2],
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillRadGradient {
+    index: u32,
+    p0: [f32; 2],
+    p1: [f32; 2],
+    r0: f32,
+    r1: f32,
+}
+
+#[allow(unused)]
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillImage {
+    index: u32,
+    // [i16; 2]
+    offset: u32,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct Clip {
+    blend: u32,
+}
+
+struct PathBuilder<'a> {
+    tag_stream: &'a mut Vec<u8>,
+    // If we're never going to use the i16 encoding, it might be
+    // slightly faster to store this as Vec<u32>, we'd get aligned
+    // stores on ARM etc.
+    pathseg_stream: &'a mut Vec<u8>,
+    first_pt: [f32; 2],
+    state: PathState,
+    n_pathseg: u32,
+}
+
+#[derive(PartialEq)]
+enum PathState {
+    Start,
+    MoveTo,
+    NonemptySubpath,
+}
+
+impl<'a> PathBuilder<'a> {
+    pub fn new(tags: &'a mut Vec<u8>, pathsegs: &'a mut Vec<u8>) -> PathBuilder<'a> {
+        PathBuilder {
+            tag_stream: tags,
+            pathseg_stream: pathsegs,
+            first_pt: [0.0, 0.0],
+            state: PathState::Start,
+            n_pathseg: 0,
+        }
+    }
+
+    pub fn move_to(&mut self, x: f32, y: f32) {
+        let buf = [x, y];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.first_pt = buf;
+        if self.state == PathState::MoveTo {
+            let new_len = self.pathseg_stream.len() - 8;
+            self.pathseg_stream.truncate(new_len);
+        }
+        if self.state == PathState::NonemptySubpath {
+            if let Some(tag) = self.tag_stream.last_mut() {
+                *tag |= 4;
+            }
+        }
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.state = PathState::MoveTo;
+    }
+
+    pub fn line_to(&mut self, x: f32, y: f32) {
+        if self.state == PathState::Start {
+            // should warn or error
+            return;
+        }
+        let buf = [x, y];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.tag_stream.push(9);
+        self.state = PathState::NonemptySubpath;
+        self.n_pathseg += 1;
+    }
+
+    pub fn quad_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32) {
+        if self.state == PathState::Start {
+            return;
+        }
+        let buf = [x1, y1, x2, y2];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.tag_stream.push(10);
+        self.state = PathState::NonemptySubpath;
+        self.n_pathseg += 1;
+    }
+
+    pub fn cubic_to(&mut self, x1: f32, y1: f32, x2: f32, y2: f32, x3: f32, y3: f32) {
+        if self.state == PathState::Start {
+            return;
+        }
+        let buf = [x1, y1, x2, y2, x3, y3];
+        let bytes = bytemuck::bytes_of(&buf);
+        self.pathseg_stream.extend_from_slice(bytes);
+        self.tag_stream.push(11);
+        self.state = PathState::NonemptySubpath;
+        self.n_pathseg += 1;
+    }
+
+    pub fn close_path(&mut self) {
+        match self.state {
+            PathState::Start => return,
+            PathState::MoveTo => {
+                let new_len = self.pathseg_stream.len() - 8;
+                self.pathseg_stream.truncate(new_len);
+                self.state = PathState::Start;
+                return;
+            }
+            PathState::NonemptySubpath => (),
+        }
+        let len = self.pathseg_stream.len();
+        if len < 8 {
+            // can't happen
+            return;
+        }
+        let first_bytes = bytemuck::bytes_of(&self.first_pt);
+        if &self.pathseg_stream[len - 8..len] != first_bytes {
+            self.pathseg_stream.extend_from_slice(first_bytes);
+            self.tag_stream.push(13);
+            self.n_pathseg += 1;
+        } else {
+            if let Some(tag) = self.tag_stream.last_mut() {
+                *tag |= 4;
+            }
+        }
+        self.state = PathState::Start;
+    }
+
+    fn finish(&mut self) {
+        if self.state == PathState::MoveTo {
+            let new_len = self.pathseg_stream.len() - 8;
+            self.pathseg_stream.truncate(new_len);
+        }
+        if let Some(tag) = self.tag_stream.last_mut() {
+            *tag |= 4;
+        }
+    }
+
+    /// Finish encoding a path.
+    ///
+    /// Encode this after encoding path segments.
+    pub fn path(&mut self) {
+        self.finish();
+        // maybe don't encode if path is empty? might throw off sync though
+        self.tag_stream.push(0x10);
+    }
+
+    /// Get the number of path segments.
+    ///
+    /// This is the number of path segments that will be written by the
+    /// path stage; use this for allocating the output buffer.
+    ///
+    /// Also note: it takes `self` for lifetime reasons.
+    pub fn n_pathseg(self) -> u32 {
+        self.n_pathseg
+    }
+}
diff --git a/piet-scene/src/scene/mod.rs b/piet-scene/src/scene/mod.rs
new file mode 100644
index 0000000..df9db90
--- /dev/null
+++ b/piet-scene/src/scene/mod.rs
@@ -0,0 +1,97 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+mod blend;
+mod builder;
+mod style;
+
+pub use blend::{Blend, Compose, Mix};
+pub use builder::{build_fragment, build_scene, Builder};
+pub use style::*;
+
+use super::brush::*;
+use super::geometry::{Affine, Rect};
+use super::path::Element;
+use core::ops::Range;
+
+#[derive(Default)]
+struct SceneData {
+    transform_stream: Vec<Affine>,
+    tag_stream: Vec<u8>,
+    pathseg_stream: Vec<u8>,
+    linewidth_stream: Vec<f32>,
+    drawtag_stream: Vec<u32>,
+    drawdata_stream: Vec<u8>,
+    n_path: u32,
+    n_pathseg: u32,
+    n_clip: u32,
+}
+
+impl SceneData {
+    fn clear(&mut self) {
+        self.transform_stream.clear();
+        self.tag_stream.clear();
+        self.pathseg_stream.clear();
+        self.linewidth_stream.clear();
+        self.drawtag_stream.clear();
+        self.drawdata_stream.clear();
+        self.n_path = 0;
+        self.n_pathseg = 0;
+        self.n_clip = 0;
+    }
+
+    fn append(&mut self, other: &SceneData) {
+        self.transform_stream
+            .extend_from_slice(&other.transform_stream);
+        self.tag_stream.extend_from_slice(&other.tag_stream);
+        self.pathseg_stream.extend_from_slice(&other.pathseg_stream);
+        self.linewidth_stream
+            .extend_from_slice(&other.linewidth_stream);
+        self.drawtag_stream.extend_from_slice(&other.drawtag_stream);
+        self.drawdata_stream
+            .extend_from_slice(&other.drawdata_stream);
+        self.n_path += other.n_path;
+        self.n_pathseg += other.n_pathseg;
+        self.n_clip += other.n_clip;
+    }
+}
+
+/// Encoded definition of a scene that is ready for rendering when paired with
+/// an associated resource context.
+#[derive(Default)]
+pub struct Scene {
+    data: SceneData,
+}
+
+/// Encoded definition of a scene fragment and associated resources.
+#[derive(Default)]
+pub struct Fragment {
+    data: SceneData,
+    resources: FragmentResources,
+}
+
+#[derive(Default)]
+struct FragmentResources {
+    patches: Vec<ResourcePatch>,
+    stops: Vec<Stop>,
+}
+
+enum ResourcePatch {
+    Ramp {
+        drawdata_offset: usize,
+        stops: Range<usize>,
+    },
+}
diff --git a/piet-scene/src/scene/style.rs b/piet-scene/src/scene/style.rs
new file mode 100644
index 0000000..0aded61
--- /dev/null
+++ b/piet-scene/src/scene/style.rs
@@ -0,0 +1,71 @@
+// Copyright 2022 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use core::borrow::Borrow;
+
+/// Describes the winding rule that determines the interior portion of a path.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Fill {
+    NonZero,
+    EvenOdd,
+}
+
+/// Defines the connection between two segments of a stroke.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Join {
+    /// A straight line connecting the segments.
+    Bevel,
+    /// The segments are extended to their natural intersection point.
+    Miter,
+    /// An arc between the segments.
+    Round,
+}
+
+/// Defines the shape to be drawn at the ends of a stroke.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Cap {
+    /// Flat cap.
+    Butt,
+    /// Square cap with dimensions equal to half the stroke width.
+    Square,
+    /// Rounded cap with radius equal to half the stroke width.
+    Round,
+}
+
+/// Describes the visual style of a stroke.
+#[derive(Copy, Clone, Debug)]
+pub struct Stroke<D>
+where
+    D: Borrow<[f32]>,
+{
+    /// Width of the stroke.
+    pub width: f32,
+    /// Style for connecting segments of the stroke.
+    pub join: Join,
+    /// Limit for miter joins.
+    pub miter_limit: f32,
+    /// Style for capping the beginning of an open subpath.
+    pub start_cap: Cap,
+    /// Style for capping the end of an open subpath.
+    pub end_cap: Cap,
+    /// Lengths of dashes in alternating on/off order.
+    pub dash_pattern: D,
+    /// Offset of the first dash.
+    pub dash_offset: f32,
+    /// True if the stroke width should be affected by the scale of a
+    /// transform.
+    pub scale: bool,
+}