Radial gradients

This patch adds radial gradients, including both the piet API and some new methods specifically to support COLRv1, including the ability to transform the gradient separately from the path.
2025-01-09 20:31:29 +11:00 · 2022-03-17 15:00:08 -07:00 · 2022-03-17 15:00:08 -07:00 · 0f91149b49
parent 7a8d41455a
commit 0f91149b49
31 changed files with 1886 additions and 971 deletions
--- a/piet-gpu-types/src/ptcl.rs
+++ b/piet-gpu-types/src/ptcl.rs
@ -24,6 +24,14 @@ piet_gpu! {
            line_y: f32,
            line_c: f32,
        }
+        struct CmdRadGrad {
+            index: u32,
+            mat: [f32; 4],
+            xlat: [f32; 2],
+            c1: [f32; 2],
+            ra: f32,
+            roff: f32,
+        }
        struct CmdImage {
            index: u32,
            offset: [i16; 2],
@ -31,6 +39,9 @@ piet_gpu! {
        struct CmdAlpha {
            alpha: f32,
        }
+        struct CmdEndClip {
+            blend: u32,
+        }
        struct CmdJump {
            new_ref: u32,
        }
@ -42,9 +53,10 @@ piet_gpu! {
            Alpha(CmdAlpha),
            Color(CmdColor),
            LinGrad(CmdLinGrad),
+            RadGrad(CmdRadGrad),
            Image(CmdImage),
            BeginClip,
-            EndClip,
+            EndClip(CmdEndClip),
            Jump(CmdJump),
        }
    }
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@ -229,6 +229,7 @@ void main() {
        case Drawtag_FillColor:
        case Drawtag_FillImage:
        case Drawtag_FillLinGradient:
+        case Drawtag_FillRadGradient:
        case Drawtag_BeginClip:
        case Drawtag_EndClip:
            uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@ -373,6 +374,25 @@ void main() {
                    Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
                    cmd_ref.offset += 4 + CmdLinGrad_size;
                    break;
+                case Drawtag_FillRadGradient:
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        break;
+                    }
+                    linewidth = uintBitsToFloat(memory[di]);
+                    write_fill(cmd_alloc, cmd_ref, tile, linewidth);
+                    CmdRadGrad cmd_rad;
+                    cmd_rad.index = scene[dd];
+                    // Given that this is basically a memcpy, we might consider
+                    // letting the fine raster read the info itself.
+                    cmd_rad.mat = uintBitsToFloat(uvec4(memory[di + 1], memory[di + 2],
+                        memory[di + 3], memory[di + 4]));
+                    cmd_rad.xlat = uintBitsToFloat(uvec2(memory[di + 5], memory[di + 6]));
+                    cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
+                    cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
+                    cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
+                    Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
+                    cmd_ref.offset += 4 + CmdRadGrad_size;
+                    break;
                case Drawtag_FillImage:
                    linewidth = uintBitsToFloat(memory[di]);
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
--- a/piet-gpu/shader/draw_leaf.comp
+++ b/piet-gpu/shader/draw_leaf.comp
@ -94,8 +94,8 @@ void main() {
        // pipeline. However, going forward we'll get rid of that, and have
        // later stages read scene + bbox etc.
        tag_word = scene[drawtag_base + ix + i];
-        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillImage ||
-            tag_word == Drawtag_BeginClip) {
+        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient ||
+            tag_word == Drawtag_FillImage || tag_word == Drawtag_BeginClip) {
            uint bbox_offset = (conf.path_bbox_alloc.offset >> 2) + 6 * m.path_ix;
            float bbox_l = float(memory[bbox_offset]) - 32768.0;
            float bbox_t = float(memory[bbox_offset + 1]) - 32768.0;
@ -106,11 +106,11 @@ void main() {
            uint fill_mode = uint(linewidth >= 0.0);
            vec4 mat;
            vec2 translate;
-            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient) {
+            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                uint trans_ix = memory[bbox_offset + 5];
                uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
                mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
-                if (tag_word == Drawtag_FillLinGradient) {
+                if (tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillRadGradient) {
                    translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
                }
            }
@ -125,7 +125,6 @@ void main() {
                break;
            case Drawtag_FillLinGradient:
                memory[di] = floatBitsToUint(linewidth);
-                uint index = scene[dd];
                vec2 p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
                vec2 p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
                p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
@ -139,6 +138,33 @@ void main() {
                memory[di + 2] = floatBitsToUint(line_y);
                memory[di + 3] = floatBitsToUint(line_c);
                break;
+            case Drawtag_FillRadGradient:
+                p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
+                p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
+                float r0 = uintBitsToFloat(scene[dd + 5]);
+                float r1 = uintBitsToFloat(scene[dd + 6]);
+                float inv_det = 1.0 / (mat.x * mat.w - mat.y * mat.z);
+                vec4 inv_mat = inv_det * vec4(mat.w, -mat.y, -mat.z, mat.x);
+                vec2 inv_tr = inv_mat.xz * translate.x + inv_mat.yw * translate.y;
+                inv_tr += p0;
+                vec2 center1 = p1 - p0;
+                float rr = r1 / (r1 - r0);
+                float rainv = rr / (r1 * r1 - dot(center1, center1));
+                vec2 c1 = center1 * rainv;
+                float ra = rr * rainv;
+                float roff = rr - 1.0;
+                memory[di] = floatBitsToUint(linewidth);
+                memory[di + 1] = floatBitsToUint(inv_mat.x);
+                memory[di + 2] = floatBitsToUint(inv_mat.y);
+                memory[di + 3] = floatBitsToUint(inv_mat.z);
+                memory[di + 4] = floatBitsToUint(inv_mat.w);
+                memory[di + 5] = floatBitsToUint(inv_tr.x);
+                memory[di + 6] = floatBitsToUint(inv_tr.y);
+                memory[di + 7] = floatBitsToUint(c1.x);
+                memory[di + 8] = floatBitsToUint(c1.y);
+                memory[di + 9] = floatBitsToUint(ra);
+                memory[di + 10] = floatBitsToUint(roff);
+                break;
            case Drawtag_BeginClip:
                break;
            }
--- a/piet-gpu/shader/drawtag.h
+++ b/piet-gpu/shader/drawtag.h
@ -4,11 +4,12 @@

 // Design of draw tag: & 0x1c gives scene size in bytes
 // & 1 gives clip
-// (tag >> 4) & 0x1c is info size in bytes
+// (tag >> 4) & 0x3c is info size in bytes

 #define Drawtag_Nop 0
 #define Drawtag_FillColor 0x44
 #define Drawtag_FillLinGradient 0x114
+#define Drawtag_FillRadGradient 0x2dc
 #define Drawtag_FillImage 0x48
 #define Drawtag_BeginClip 0x05
 #define Drawtag_EndClip 0x25
@ -36,5 +37,5 @@ DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) {
 DrawMonoid map_tag(uint tag_word) {
    // TODO: at some point, EndClip should not generate a path
    uint has_path = uint(tag_word != Drawtag_Nop);
-    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x1c);
+    return DrawMonoid(has_path, tag_word & 1, tag_word & 0x1c, (tag_word >> 4) & 0x3c);
 }
--- a/piet-gpu/shader/gen/coarse.dxil
+++ b/piet-gpu/shader/gen/coarse.dxil
--- a/piet-gpu/shader/gen/coarse.hlsl
+++ b/piet-gpu/shader/gen/coarse.hlsl
@ -91,6 +91,21 @@ struct CmdLinGrad
    float line_c;
 };

+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
    uint offset;
@ -160,9 +175,9 @@ struct Config

 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

-RWByteAddressBuffer _242 : register(u0, space0);
-ByteAddressBuffer _854 : register(t1, space0);
-ByteAddressBuffer _1222 : register(t2, space0);
+RWByteAddressBuffer _260 : register(u0, space0);
+ByteAddressBuffer _1005 : register(t1, space0);
+ByteAddressBuffer _1372 : register(t2, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -185,8 +200,8 @@ groupshared uint sh_tile_count[256];

 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _319 = { a.offset + offset };
-    return _319;
+    Alloc _337 = { a.offset + offset };
+    return _337;
 }

 bool touch_mem(Alloc alloc, uint offset)
@ -202,7 +217,7 @@ uint read_mem(Alloc alloc, uint offset)
    {
        return 0u;
    }
-    uint v = _242.Load(offset * 4 + 8);
+    uint v = _260.Load(offset * 4 + 8);
    return v;
 }

@ -215,8 +230,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok)

 BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index)
 {
-    BinInstanceRef _328 = { ref.offset + (index * 4u) };
-    return _328;
+    BinInstanceRef _346 = { ref.offset + (index * 4u) };
+    return _346;
 }

 BinInstance BinInstance_read(Alloc a, BinInstanceRef ref)
@ -244,8 +259,8 @@ Path Path_read(Alloc a, PathRef ref)
    uint raw2 = read_mem(param_4, param_5);
    Path s;
    s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
-    TileRef _391 = { raw2 };
-    s.tiles = _391;
+    TileRef _409 = { raw2 };
+    s.tiles = _409;
    return s;
 }

@ -255,11 +270,11 @@ void write_tile_alloc(uint el_ix, Alloc a)

 Alloc read_tile_alloc(uint el_ix, bool mem_ok)
 {
-    uint _741;
-    _242.GetDimensions(_741);
-    _741 = (_741 - 8) / 4;
+    uint _892;
+    _260.GetDimensions(_892);
+    _892 = (_892 - 8) / 4;
    uint param = 0u;
-    uint param_1 = uint(int(_741) * 4);
+    uint param_1 = uint(int(_892) * 4);
    bool param_2 = mem_ok;
    return new_alloc(param, param_1, param_2);
 }
@ -273,31 +288,31 @@ Tile Tile_read(Alloc a, TileRef ref)
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
    uint raw1 = read_mem(param_2, param_3);
-    TileSegRef _416 = { raw0 };
+    TileSegRef _434 = { raw0 };
    Tile s;
-    s.tile = _416;
+    s.tile = _434;
    s.backdrop = int(raw1);
    return s;
 }

 MallocResult malloc(uint size)
 {
-    uint _248;
-    _242.InterlockedAdd(0, size, _248);
-    uint offset = _248;
-    uint _255;
-    _242.GetDimensions(_255);
-    _255 = (_255 - 8) / 4;
+    uint _266;
+    _260.InterlockedAdd(0, size, _266);
+    uint offset = _266;
+    uint _273;
+    _260.GetDimensions(_273);
+    _273 = (_273 - 8) / 4;
    MallocResult r;
-    r.failed = (offset + size) > uint(int(_255) * 4);
+    r.failed = (offset + size) > uint(int(_273) * 4);
    uint param = offset;
    uint param_1 = size;
    bool param_2 = !r.failed;
    r.alloc = new_alloc(param, param_1, param_2);
    if (r.failed)
    {
-        uint _277;
-        _242.InterlockedMax(4, 1u, _277);
+        uint _295;
+        _260.InterlockedMax(4, 1u, _295);
        return r;
    }
    return r;
@ -311,7 +326,7 @@ void write_mem(Alloc alloc, uint offset, uint val)
    {
        return;
    }
-    _242.Store(offset * 4 + 8, val);
+    _260.Store(offset * 4 + 8, val);
 }

 void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s)
@ -327,11 +342,11 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 10u;
+    uint param_2 = 11u;
    write_mem(param, param_1, param_2);
-    CmdJumpRef _734 = { ref.offset + 4u };
+    CmdJumpRef _885 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdJumpRef param_4 = _734;
+    CmdJumpRef param_4 = _885;
    CmdJump param_5 = s;
    CmdJump_write(param_3, param_4, param_5);
 }
@ -343,22 +358,22 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
        return true;
    }
    uint param = 1024u;
-    MallocResult _762 = malloc(param);
-    MallocResult new_cmd = _762;
+    MallocResult _913 = malloc(param);
+    MallocResult new_cmd = _913;
    if (new_cmd.failed)
    {
        return false;
    }
-    CmdJump _772 = { new_cmd.alloc.offset };
-    CmdJump jump = _772;
+    CmdJump _923 = { new_cmd.alloc.offset };
+    CmdJump jump = _923;
    Alloc param_1 = cmd_alloc;
    CmdRef param_2 = cmd_ref;
    CmdJump param_3 = jump;
    Cmd_Jump_write(param_1, param_2, param_3);
    cmd_alloc = new_cmd.alloc;
-    CmdRef _784 = { cmd_alloc.offset };
-    cmd_ref = _784;
-    cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
+    CmdRef _935 = { cmd_alloc.offset };
+    cmd_ref = _935;
+    cmd_limit = (cmd_alloc.offset + 1024u) - 144u;
    return true;
 }

@ -381,9 +396,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 1u;
    write_mem(param, param_1, param_2);
-    CmdFillRef _604 = { ref.offset + 4u };
+    CmdFillRef _742 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdFillRef param_4 = _604;
+    CmdFillRef param_4 = _742;
    CmdFill param_5 = s;
    CmdFill_write(param_3, param_4, param_5);
 }
@ -415,9 +430,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 2u;
    write_mem(param, param_1, param_2);
-    CmdStrokeRef _622 = { ref.offset + 4u };
+    CmdStrokeRef _760 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdStrokeRef param_4 = _622;
+    CmdStrokeRef param_4 = _760;
    CmdStroke param_5 = s;
    CmdStroke_write(param_3, param_4, param_5);
 }
@ -428,8 +443,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
    {
        if (tile.tile.offset != 0u)
        {
-            CmdFill _807 = { tile.tile.offset, tile.backdrop };
-            CmdFill cmd_fill = _807;
+            CmdFill _958 = { tile.tile.offset, tile.backdrop };
+            CmdFill cmd_fill = _958;
            Alloc param = alloc;
            CmdRef param_1 = cmd_ref;
            CmdFill param_2 = cmd_fill;
@ -446,8 +461,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth)
    }
    else
    {
-        CmdStroke _837 = { tile.tile.offset, 0.5f * linewidth };
-        CmdStroke cmd_stroke = _837;
+        CmdStroke _988 = { tile.tile.offset, 0.5f * linewidth };
+        CmdStroke cmd_stroke = _988;
        Alloc param_5 = alloc;
        CmdRef param_6 = cmd_ref;
        CmdStroke param_7 = cmd_stroke;
@ -471,9 +486,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 5u;
    write_mem(param, param_1, param_2);
-    CmdColorRef _649 = { ref.offset + 4u };
+    CmdColorRef _786 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdColorRef param_4 = _649;
+    CmdColorRef param_4 = _786;
    CmdColor param_5 = s;
    CmdColor_write(param_3, param_4, param_5);
 }
@ -505,13 +520,75 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s)
    uint param_1 = ref.offset >> uint(2);
    uint param_2 = 6u;
    write_mem(param, param_1, param_2);
-    CmdLinGradRef _668 = { ref.offset + 4u };
+    CmdLinGradRef _804 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdLinGradRef param_4 = _668;
+    CmdLinGradRef param_4 = _804;
    CmdLinGrad param_5 = s;
    CmdLinGrad_write(param_3, param_4, param_5);
 }

+void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint param_2 = s.index;
+    write_mem(param, param_1, param_2);
+    Alloc param_3 = a;
+    uint param_4 = ix + 1u;
+    uint param_5 = asuint(s.mat.x);
+    write_mem(param_3, param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 2u;
+    uint param_8 = asuint(s.mat.y);
+    write_mem(param_6, param_7, param_8);
+    Alloc param_9 = a;
+    uint param_10 = ix + 3u;
+    uint param_11 = asuint(s.mat.z);
+    write_mem(param_9, param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 4u;
+    uint param_14 = asuint(s.mat.w);
+    write_mem(param_12, param_13, param_14);
+    Alloc param_15 = a;
+    uint param_16 = ix + 5u;
+    uint param_17 = asuint(s.xlat.x);
+    write_mem(param_15, param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 6u;
+    uint param_20 = asuint(s.xlat.y);
+    write_mem(param_18, param_19, param_20);
+    Alloc param_21 = a;
+    uint param_22 = ix + 7u;
+    uint param_23 = asuint(s.c1.x);
+    write_mem(param_21, param_22, param_23);
+    Alloc param_24 = a;
+    uint param_25 = ix + 8u;
+    uint param_26 = asuint(s.c1.y);
+    write_mem(param_24, param_25, param_26);
+    Alloc param_27 = a;
+    uint param_28 = ix + 9u;
+    uint param_29 = asuint(s.ra);
+    write_mem(param_27, param_28, param_29);
+    Alloc param_30 = a;
+    uint param_31 = ix + 10u;
+    uint param_32 = asuint(s.roff);
+    write_mem(param_30, param_31, param_32);
+}
+
+void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s)
+{
+    Alloc param = a;
+    uint param_1 = ref.offset >> uint(2);
+    uint param_2 = 7u;
+    write_mem(param, param_1, param_2);
+    CmdRadGradRef _822 = { ref.offset + 4u };
+    Alloc param_3 = a;
+    CmdRadGradRef param_4 = _822;
+    CmdRadGrad param_5 = s;
+    CmdRadGrad_write(param_3, param_4, param_5);
+}
+
 void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s)
 {
    uint ix = ref.offset >> uint(2);
@ -529,11 +606,11 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 7u;
+    uint param_2 = 8u;
    write_mem(param, param_1, param_2);
-    CmdImageRef _687 = { ref.offset + 4u };
+    CmdImageRef _840 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdImageRef param_4 = _687;
+    CmdImageRef param_4 = _840;
    CmdImage param_5 = s;
    CmdImage_write(param_3, param_4, param_5);
 }
@ -542,7 +619,7 @@ void Cmd_BeginClip_write(Alloc a, CmdRef ref)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 8u;
+    uint param_2 = 9u;
    write_mem(param, param_1, param_2);
 }

@ -559,11 +636,11 @@ void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint param_2 = 9u;
+    uint param_2 = 10u;
    write_mem(param, param_1, param_2);
-    CmdEndClipRef _715 = { ref.offset + 4u };
+    CmdEndClipRef _866 = { ref.offset + 4u };
    Alloc param_3 = a;
-    CmdEndClipRef param_4 = _715;
+    CmdEndClipRef param_4 = _866;
    CmdEndClip param_5 = s;
    CmdEndClip_write(param_3, param_4, param_5);
 }
@ -578,80 +655,81 @@ void Cmd_End_write(Alloc a, CmdRef ref)

 void comp_main()
 {
-    uint width_in_bins = ((_854.Load(8) + 16u) - 1u) / 16u;
+    uint width_in_bins = ((_1005.Load(8) + 16u) - 1u) / 16u;
    uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
    uint partition_ix = 0u;
-    uint n_partitions = ((_854.Load(0) + 256u) - 1u) / 256u;
+    uint n_partitions = ((_1005.Load(0) + 256u) - 1u) / 256u;
    uint th_ix = gl_LocalInvocationID.x;
    uint bin_tile_x = 16u * gl_WorkGroupID.x;
    uint bin_tile_y = 16u * gl_WorkGroupID.y;
    uint tile_x = gl_LocalInvocationID.x % 16u;
    uint tile_y = gl_LocalInvocationID.x / 16u;
-    uint this_tile_ix = (((bin_tile_y + tile_y) * _854.Load(8)) + bin_tile_x) + tile_x;
-    Alloc _919;
-    _919.offset = _854.Load(24);
+    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.Load(8)) + bin_tile_x) + tile_x;
+    Alloc _1070;
+    _1070.offset = _1005.Load(24);
    Alloc param;
-    param.offset = _919.offset;
+    param.offset = _1070.offset;
    uint param_1 = this_tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _928 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _928;
-    uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
+    CmdRef _1079 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1079;
+    uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
    uint clip_depth = 0u;
    uint clip_zero_depth = 0u;
    uint rd_ix = 0u;
    uint wr_ix = 0u;
    uint part_start_ix = 0u;
    uint ready_ix = 0u;
-    uint drawmonoid_start = _854.Load(44) >> uint(2);
-    uint drawtag_start = _854.Load(100) >> uint(2);
-    uint drawdata_start = _854.Load(104) >> uint(2);
-    uint drawinfo_start = _854.Load(68) >> uint(2);
-    bool mem_ok = _242.Load(4) == 0u;
+    uint drawmonoid_start = _1005.Load(44) >> uint(2);
+    uint drawtag_start = _1005.Load(100) >> uint(2);
+    uint drawdata_start = _1005.Load(104) >> uint(2);
+    uint drawinfo_start = _1005.Load(68) >> uint(2);
+    bool mem_ok = _260.Load(4) == 0u;
    Alloc param_3;
    Alloc param_5;
-    uint _1154;
+    uint _1304;
    uint element_ix;
    Alloc param_14;
    uint tile_count;
-    uint _1455;
+    uint _1605;
    float linewidth;
    CmdLinGrad cmd_lin;
+    CmdRadGrad cmd_rad;
    while (true)
    {
        for (uint i = 0u; i < 8u; i++)
        {
            sh_bitmaps[i][th_ix] = 0u;
        }
-        bool _1206;
+        bool _1356;
        for (;;)
        {
            if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
            {
                part_start_ix = ready_ix;
                uint count = 0u;
-                bool _1003 = th_ix < 256u;
-                bool _1011;
-                if (_1003)
+                bool _1154 = th_ix < 256u;
+                bool _1162;
+                if (_1154)
                {
-                    _1011 = (partition_ix + th_ix) < n_partitions;
+                    _1162 = (partition_ix + th_ix) < n_partitions;
                }
                else
                {
-                    _1011 = _1003;
+                    _1162 = _1154;
                }
-                if (_1011)
+                if (_1162)
                {
-                    uint in_ix = (_854.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
-                    Alloc _1029;
-                    _1029.offset = _854.Load(20);
-                    param_3.offset = _1029.offset;
+                    uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
+                    Alloc _1179;
+                    _1179.offset = _1005.Load(20);
+                    param_3.offset = _1179.offset;
                    uint param_4 = in_ix;
                    count = read_mem(param_3, param_4);
-                    Alloc _1040;
-                    _1040.offset = _854.Load(20);
-                    param_5.offset = _1040.offset;
+                    Alloc _1190;
+                    _1190.offset = _1005.Load(20);
+                    param_5.offset = _1190.offset;
                    uint param_6 = in_ix + 1u;
                    uint offset = read_mem(param_5, param_6);
                    uint param_7 = offset;
@ -697,16 +775,16 @@ void comp_main()
                }
                if (part_ix > 0u)
                {
-                    _1154 = sh_part_count[part_ix - 1u];
+                    _1304 = sh_part_count[part_ix - 1u];
                }
                else
                {
-                    _1154 = part_start_ix;
+                    _1304 = part_start_ix;
                }
-                ix -= _1154;
+                ix -= _1304;
                Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef _1173 = { bin_alloc.offset };
-                BinInstanceRef inst_ref = _1173;
+                BinInstanceRef _1323 = { bin_alloc.offset };
+                BinInstanceRef inst_ref = _1323;
                BinInstanceRef param_10 = inst_ref;
                uint param_11 = ix;
                Alloc param_12 = bin_alloc;
@ -716,16 +794,16 @@ void comp_main()
            }
            GroupMemoryBarrierWithGroupSync();
            wr_ix = min((rd_ix + 256u), ready_ix);
-            bool _1196 = (wr_ix - rd_ix) < 256u;
-            if (_1196)
+            bool _1346 = (wr_ix - rd_ix) < 256u;
+            if (_1346)
            {
-                _1206 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
+                _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
            }
            else
            {
-                _1206 = _1196;
+                _1356 = _1346;
            }
-            if (_1206)
+            if (_1356)
            {
                continue;
            }
@ -738,23 +816,24 @@ void comp_main()
        if ((th_ix + rd_ix) < wr_ix)
        {
            element_ix = sh_elements[th_ix];
-            tag = _1222.Load((drawtag_start + element_ix) * 4 + 0);
+            tag = _1372.Load((drawtag_start + element_ix) * 4 + 0);
        }
        switch (tag)
        {
            case 68u:
            case 72u:
            case 276u:
+            case 732u:
            case 5u:
            case 37u:
            {
                uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
-                uint path_ix = _242.Load(drawmonoid_base * 4 + 8);
-                PathRef _1247 = { _854.Load(16) + (path_ix * 12u) };
-                Alloc _1250;
-                _1250.offset = _854.Load(16);
-                param_14.offset = _1250.offset;
-                PathRef param_15 = _1247;
+                uint path_ix = _260.Load(drawmonoid_base * 4 + 8);
+                PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) };
+                Alloc _1400;
+                _1400.offset = _1005.Load(16);
+                param_14.offset = _1400.offset;
+                PathRef param_15 = _1397;
                Path path = Path_read(param_14, param_15);
                uint stride = path.bbox.z - path.bbox.x;
                sh_tile_stride[th_ix] = stride;
@ -810,16 +889,16 @@ void comp_main()
                }
            }
            uint element_ix_1 = sh_elements[el_ix];
-            uint tag_1 = _1222.Load((drawtag_start + element_ix_1) * 4 + 0);
+            uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0);
            if (el_ix > 0u)
            {
-                _1455 = sh_tile_count[el_ix - 1u];
+                _1605 = sh_tile_count[el_ix - 1u];
            }
            else
            {
-                _1455 = 0u;
+                _1605 = 0u;
            }
-            uint seq_ix = ix_1 - _1455;
+            uint seq_ix = ix_1 - _1605;
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + (seq_ix % width);
            uint y = sh_tile_y0[el_ix] + (seq_ix / width);
@ -828,38 +907,38 @@ void comp_main()
            {
                uint param_21 = el_ix;
                bool param_22 = mem_ok;
-                TileRef _1507 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
+                TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                Alloc param_23 = read_tile_alloc(param_21, param_22);
-                TileRef param_24 = _1507;
+                TileRef param_24 = _1657;
                Tile tile = Tile_read(param_23, param_24);
                bool is_clip = (tag_1 & 1u) != 0u;
                bool is_blend = false;
                if (is_clip)
                {
                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
-                    uint scene_offset = _242.Load((drawmonoid_base_1 + 2u) * 4 + 8);
+                    uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8);
                    uint dd = drawdata_start + (scene_offset >> uint(2));
-                    uint blend = _1222.Load(dd * 4 + 0);
+                    uint blend = _1372.Load(dd * 4 + 0);
                    is_blend = blend != 3u;
                }
-                bool _1542 = tile.tile.offset != 0u;
-                bool _1551;
-                if (!_1542)
+                bool _1692 = tile.tile.offset != 0u;
+                bool _1701;
+                if (!_1692)
                {
-                    _1551 = (tile.backdrop == 0) == is_clip;
+                    _1701 = (tile.backdrop == 0) == is_clip;
                }
                else
                {
-                    _1551 = _1542;
+                    _1701 = _1692;
                }
-                include_tile = _1551 || is_blend;
+                include_tile = _1701 || is_blend;
            }
            if (include_tile)
            {
                uint el_slice = el_ix / 32u;
                uint el_mask = 1u << (el_ix & 31u);
-                uint _1573;
-                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1573);
+                uint _1723;
+                InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
            }
        }
        GroupMemoryBarrierWithGroupSync();
@ -883,33 +962,33 @@ void comp_main()
            uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap)));
            uint element_ix_2 = sh_elements[element_ref_ix];
            bitmap &= (bitmap - 1u);
-            uint drawtag = _1222.Load((drawtag_start + element_ix_2) * 4 + 0);
+            uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0);
            if (clip_zero_depth == 0u)
            {
                uint param_25 = element_ref_ix;
                bool param_26 = mem_ok;
-                TileRef _1650 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
+                TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                Alloc param_27 = read_tile_alloc(param_25, param_26);
-                TileRef param_28 = _1650;
+                TileRef param_28 = _1800;
                Tile tile_1 = Tile_read(param_27, param_28);
                uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
-                uint scene_offset_1 = _242.Load((drawmonoid_base_2 + 2u) * 4 + 8);
-                uint info_offset = _242.Load((drawmonoid_base_2 + 3u) * 4 + 8);
+                uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
+                uint info_offset = _260.Load((drawmonoid_base_2 + 3u) * 4 + 8);
                uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                uint di = drawinfo_start + (info_offset >> uint(2));
                switch (drawtag)
                {
                    case 68u:
                    {
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                        Alloc param_29 = cmd_alloc;
                        CmdRef param_30 = cmd_ref;
                        uint param_31 = cmd_limit;
-                        bool _1697 = alloc_cmd(param_29, param_30, param_31);
+                        bool _1848 = alloc_cmd(param_29, param_30, param_31);
                        cmd_alloc = param_29;
                        cmd_ref = param_30;
                        cmd_limit = param_31;
-                        if (!_1697)
+                        if (!_1848)
                        {
                            break;
                        }
@ -919,11 +998,11 @@ void comp_main()
                        float param_35 = linewidth;
                        write_fill(param_32, param_33, param_34, param_35);
                        cmd_ref = param_33;
-                        uint rgba = _1222.Load(dd_1 * 4 + 0);
-                        CmdColor _1720 = { rgba };
+                        uint rgba = _1372.Load(dd_1 * 4 + 0);
+                        CmdColor _1871 = { rgba };
                        Alloc param_36 = cmd_alloc;
                        CmdRef param_37 = cmd_ref;
-                        CmdColor param_38 = _1720;
+                        CmdColor param_38 = _1871;
                        Cmd_Color_write(param_36, param_37, param_38);
                        cmd_ref.offset += 8u;
                        break;
@ -933,25 +1012,25 @@ void comp_main()
                        Alloc param_39 = cmd_alloc;
                        CmdRef param_40 = cmd_ref;
                        uint param_41 = cmd_limit;
-                        bool _1738 = alloc_cmd(param_39, param_40, param_41);
+                        bool _1889 = alloc_cmd(param_39, param_40, param_41);
                        cmd_alloc = param_39;
                        cmd_ref = param_40;
                        cmd_limit = param_41;
-                        if (!_1738)
+                        if (!_1889)
                        {
                            break;
                        }
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                        Alloc param_42 = cmd_alloc;
                        CmdRef param_43 = cmd_ref;
                        Tile param_44 = tile_1;
                        float param_45 = linewidth;
                        write_fill(param_42, param_43, param_44, param_45);
                        cmd_ref = param_43;
-                        cmd_lin.index = _1222.Load(dd_1 * 4 + 0);
-                        cmd_lin.line_x = asfloat(_242.Load((di + 1u) * 4 + 8));
-                        cmd_lin.line_y = asfloat(_242.Load((di + 2u) * 4 + 8));
-                        cmd_lin.line_c = asfloat(_242.Load((di + 3u) * 4 + 8));
+                        cmd_lin.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8));
+                        cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8));
+                        cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8));
                        Alloc param_46 = cmd_alloc;
                        CmdRef param_47 = cmd_ref;
                        CmdLinGrad param_48 = cmd_lin;
@ -959,69 +1038,102 @@ void comp_main()
                        cmd_ref.offset += 20u;
                        break;
                    }
-                    case 72u:
+                    case 732u:
                    {
-                        linewidth = asfloat(_242.Load(di * 4 + 8));
                        Alloc param_49 = cmd_alloc;
                        CmdRef param_50 = cmd_ref;
                        uint param_51 = cmd_limit;
-                        bool _1806 = alloc_cmd(param_49, param_50, param_51);
+                        bool _1953 = alloc_cmd(param_49, param_50, param_51);
                        cmd_alloc = param_49;
                        cmd_ref = param_50;
                        cmd_limit = param_51;
-                        if (!_1806)
+                        if (!_1953)
                        {
                            break;
                        }
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
                        Alloc param_52 = cmd_alloc;
                        CmdRef param_53 = cmd_ref;
                        Tile param_54 = tile_1;
                        float param_55 = linewidth;
                        write_fill(param_52, param_53, param_54, param_55);
                        cmd_ref = param_53;
-                        uint index = _1222.Load(dd_1 * 4 + 0);
-                        uint raw1 = _1222.Load((dd_1 + 1u) * 4 + 0);
-                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
-                        CmdImage _1845 = { index, offset_1 };
+                        cmd_rad.index = _1372.Load(dd_1 * 4 + 0);
+                        cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8)));
+                        cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8)));
+                        cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8)));
+                        cmd_rad.ra = asfloat(_260.Load((di + 9u) * 4 + 8));
+                        cmd_rad.roff = asfloat(_260.Load((di + 10u) * 4 + 8));
                        Alloc param_56 = cmd_alloc;
                        CmdRef param_57 = cmd_ref;
-                        CmdImage param_58 = _1845;
-                        Cmd_Image_write(param_56, param_57, param_58);
+                        CmdRadGrad param_58 = cmd_rad;
+                        Cmd_RadGrad_write(param_56, param_57, param_58);
+                        cmd_ref.offset += 48u;
+                        break;
+                    }
+                    case 72u:
+                    {
+                        linewidth = asfloat(_260.Load(di * 4 + 8));
+                        Alloc param_59 = cmd_alloc;
+                        CmdRef param_60 = cmd_ref;
+                        uint param_61 = cmd_limit;
+                        bool _2059 = alloc_cmd(param_59, param_60, param_61);
+                        cmd_alloc = param_59;
+                        cmd_ref = param_60;
+                        cmd_limit = param_61;
+                        if (!_2059)
+                        {
+                            break;
+                        }
+                        Alloc param_62 = cmd_alloc;
+                        CmdRef param_63 = cmd_ref;
+                        Tile param_64 = tile_1;
+                        float param_65 = linewidth;
+                        write_fill(param_62, param_63, param_64, param_65);
+                        cmd_ref = param_63;
+                        uint index = _1372.Load(dd_1 * 4 + 0);
+                        uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
+                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
+                        CmdImage _2098 = { index, offset_1 };
+                        Alloc param_66 = cmd_alloc;
+                        CmdRef param_67 = cmd_ref;
+                        CmdImage param_68 = _2098;
+                        Cmd_Image_write(param_66, param_67, param_68);
                        cmd_ref.offset += 12u;
                        break;
                    }
                    case 5u:
                    {
-                        bool _1859 = tile_1.tile.offset == 0u;
-                        bool _1865;
-                        if (_1859)
+                        bool _2112 = tile_1.tile.offset == 0u;
+                        bool _2118;
+                        if (_2112)
                        {
-                            _1865 = tile_1.backdrop == 0;
+                            _2118 = tile_1.backdrop == 0;
                        }
                        else
                        {
-                            _1865 = _1859;
+                            _2118 = _2112;
                        }
-                        if (_1865)
+                        if (_2118)
                        {
                            clip_zero_depth = clip_depth + 1u;
                        }
                        else
                        {
-                            Alloc param_59 = cmd_alloc;
-                            CmdRef param_60 = cmd_ref;
-                            uint param_61 = cmd_limit;
-                            bool _1877 = alloc_cmd(param_59, param_60, param_61);
-                            cmd_alloc = param_59;
-                            cmd_ref = param_60;
-                            cmd_limit = param_61;
-                            if (!_1877)
+                            Alloc param_69 = cmd_alloc;
+                            CmdRef param_70 = cmd_ref;
+                            uint param_71 = cmd_limit;
+                            bool _2130 = alloc_cmd(param_69, param_70, param_71);
+                            cmd_alloc = param_69;
+                            cmd_ref = param_70;
+                            cmd_limit = param_71;
+                            if (!_2130)
                            {
                                break;
                            }
-                            Alloc param_62 = cmd_alloc;
-                            CmdRef param_63 = cmd_ref;
-                            Cmd_BeginClip_write(param_62, param_63);
+                            Alloc param_72 = cmd_alloc;
+                            CmdRef param_73 = cmd_ref;
+                            Cmd_BeginClip_write(param_72, param_73);
                            cmd_ref.offset += 4u;
                        }
                        clip_depth++;
@ -1030,29 +1142,29 @@ void comp_main()
                    case 37u:
                    {
                        clip_depth--;
-                        Alloc param_64 = cmd_alloc;
-                        CmdRef param_65 = cmd_ref;
-                        uint param_66 = cmd_limit;
-                        bool _1905 = alloc_cmd(param_64, param_65, param_66);
-                        cmd_alloc = param_64;
-                        cmd_ref = param_65;
-                        cmd_limit = param_66;
-                        if (!_1905)
+                        Alloc param_74 = cmd_alloc;
+                        CmdRef param_75 = cmd_ref;
+                        uint param_76 = cmd_limit;
+                        bool _2158 = alloc_cmd(param_74, param_75, param_76);
+                        cmd_alloc = param_74;
+                        cmd_ref = param_75;
+                        cmd_limit = param_76;
+                        if (!_2158)
                        {
                            break;
                        }
-                        Alloc param_67 = cmd_alloc;
-                        CmdRef param_68 = cmd_ref;
-                        Tile param_69 = tile_1;
-                        float param_70 = -1.0f;
-                        write_fill(param_67, param_68, param_69, param_70);
-                        cmd_ref = param_68;
-                        uint blend_1 = _1222.Load(dd_1 * 4 + 0);
-                        CmdEndClip _1928 = { blend_1 };
-                        Alloc param_71 = cmd_alloc;
-                        CmdRef param_72 = cmd_ref;
-                        CmdEndClip param_73 = _1928;
-                        Cmd_EndClip_write(param_71, param_72, param_73);
+                        Alloc param_77 = cmd_alloc;
+                        CmdRef param_78 = cmd_ref;
+                        Tile param_79 = tile_1;
+                        float param_80 = -1.0f;
+                        write_fill(param_77, param_78, param_79, param_80);
+                        cmd_ref = param_78;
+                        uint blend_1 = _1372.Load(dd_1 * 4 + 0);
+                        CmdEndClip _2181 = { blend_1 };
+                        Alloc param_81 = cmd_alloc;
+                        CmdRef param_82 = cmd_ref;
+                        CmdEndClip param_83 = _2181;
+                        Cmd_EndClip_write(param_81, param_82, param_83);
                        cmd_ref.offset += 8u;
                        break;
                    }
@ -1086,21 +1198,21 @@ void comp_main()
            break;
        }
    }
-    bool _1975 = (bin_tile_x + tile_x) < _854.Load(8);
-    bool _1984;
-    if (_1975)
+    bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
+    bool _2237;
+    if (_2228)
    {
-        _1984 = (bin_tile_y + tile_y) < _854.Load(12);
+        _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
    }
    else
    {
-        _1984 = _1975;
+        _2237 = _2228;
    }
-    if (_1984)
+    if (_2237)
    {
-        Alloc param_74 = cmd_alloc;
-        CmdRef param_75 = cmd_ref;
-        Cmd_End_write(param_74, param_75);
+        Alloc param_84 = cmd_alloc;
+        CmdRef param_85 = cmd_ref;
+        Cmd_End_write(param_84, param_85);
    }
 }

--- a/piet-gpu/shader/gen/coarse.msl
+++ b/piet-gpu/shader/gen/coarse.msl
--- a/piet-gpu/shader/gen/coarse.spv
+++ b/piet-gpu/shader/gen/coarse.spv
--- a/piet-gpu/shader/gen/draw_leaf.dxil
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
--- a/piet-gpu/shader/gen/draw_leaf.hlsl
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@ -46,10 +46,10 @@ static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

 static const DrawMonoid _23 = { 0u, 0u, 0u, 0u };

-ByteAddressBuffer _92 : register(t1, space0);
-ByteAddressBuffer _102 : register(t2, space0);
-ByteAddressBuffer _202 : register(t3, space0);
-RWByteAddressBuffer _284 : register(u0, space0);
+ByteAddressBuffer _93 : register(t1, space0);
+ByteAddressBuffer _103 : register(t2, space0);
+ByteAddressBuffer _203 : register(t3, space0);
+RWByteAddressBuffer _285 : register(u0, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -66,8 +66,8 @@ groupshared DrawMonoid sh_scratch[256];
 DrawMonoid map_tag(uint tag_word)
 {
    uint has_path = uint(tag_word != 0u);
-    DrawMonoid _75 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
-    return _75;
+    DrawMonoid _76 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
+    return _76;
 }

 DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
@ -88,15 +88,15 @@ DrawMonoid draw_monoid_identity()
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _92.Load(100) >> uint(2);
-    uint tag_word = _102.Load((drawtag_base + ix) * 4 + 0);
+    uint drawtag_base = _93.Load(100) >> uint(2);
+    uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0);
    uint param = tag_word;
    DrawMonoid agg = map_tag(param);
    DrawMonoid local[8];
    local[0] = agg;
    for (uint i = 1u; i < 8u; i++)
    {
-        tag_word = _102.Load(((drawtag_base + ix) + i) * 4 + 0);
+        tag_word = _103.Load(((drawtag_base + ix) + i) * 4 + 0);
        uint param_1 = tag_word;
        DrawMonoid param_2 = agg;
        DrawMonoid param_3 = map_tag(param_1);
@ -121,15 +121,15 @@ void comp_main()
    DrawMonoid row = draw_monoid_identity();
    if (gl_WorkGroupID.x > 0u)
    {
-        DrawMonoid _208;
-        _208.path_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
-        _208.clip_ix = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
-        _208.scene_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
-        _208.info_offset = _202.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
-        row.path_ix = _208.path_ix;
-        row.clip_ix = _208.clip_ix;
-        row.scene_offset = _208.scene_offset;
-        row.info_offset = _208.info_offset;
+        DrawMonoid _209;
+        _209.path_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 0);
+        _209.clip_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 4);
+        _209.scene_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 8);
+        _209.info_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 12);
+        row.path_ix = _209.path_ix;
+        row.clip_ix = _209.clip_ix;
+        row.scene_offset = _209.scene_offset;
+        row.info_offset = _209.info_offset;
    }
    if (gl_LocalInvocationID.x > 0u)
    {
@ -137,13 +137,15 @@ void comp_main()
        DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
        row = combine_draw_monoid(param_6, param_7);
    }
-    uint drawdata_base = _92.Load(104) >> uint(2);
-    uint drawinfo_base = _92.Load(68) >> uint(2);
+    uint drawdata_base = _93.Load(104) >> uint(2);
+    uint drawinfo_base = _93.Load(68) >> uint(2);
    uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_92.Load(44) >> uint(2)) + (out_ix * 4u);
-    uint clip_out_base = _92.Load(48) >> uint(2);
+    uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _93.Load(48) >> uint(2);
    float4 mat;
    float2 translate;
+    float2 p0;
+    float2 p1;
    for (uint i_2 = 0u; i_2 < 8u; i_2++)
    {
        DrawMonoid m = row;
@ -153,31 +155,31 @@ void comp_main()
            DrawMonoid param_9 = local[i_2 - 1u];
            m = combine_draw_monoid(param_8, param_9);
        }
-        _284.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
-        _284.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
-        _284.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
-        _284.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
+        _285.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix);
+        _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset);
+        _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset);
        uint dd = drawdata_base + (m.scene_offset >> uint(2));
        uint di = drawinfo_base + (m.info_offset >> uint(2));
-        tag_word = _102.Load(((drawtag_base + ix) + i_2) * 4 + 0);
-        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
+        tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0);
+        if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u))
        {
-            uint bbox_offset = (_92.Load(40) >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_284.Load(bbox_offset * 4 + 8)) - 32768.0f;
-            float bbox_t = float(_284.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
-            float bbox_r = float(_284.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
-            float bbox_b = float(_284.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
+            uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_285.Load(bbox_offset * 4 + 8)) - 32768.0f;
+            float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f;
+            float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f;
+            float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f;
            float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = asfloat(_284.Load((bbox_offset + 4u) * 4 + 8));
+            float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 8));
            uint fill_mode = uint(linewidth >= 0.0f);
-            if ((linewidth >= 0.0f) || (tag_word == 276u))
+            if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u))
            {
-                uint trans_ix = _284.Load((bbox_offset + 5u) * 4 + 8);
-                uint t = (_92.Load(36) >> uint(2)) + (6u * trans_ix);
-                mat = asfloat(uint4(_284.Load(t * 4 + 8), _284.Load((t + 1u) * 4 + 8), _284.Load((t + 2u) * 4 + 8), _284.Load((t + 3u) * 4 + 8)));
-                if (tag_word == 276u)
+                uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 8);
+                uint t = (_93.Load(36) >> uint(2)) + (6u * trans_ix);
+                mat = asfloat(uint4(_285.Load(t * 4 + 8), _285.Load((t + 1u) * 4 + 8), _285.Load((t + 2u) * 4 + 8), _285.Load((t + 3u) * 4 + 8)));
+                if ((tag_word == 276u) || (tag_word == 732u))
                {
-                    translate = asfloat(uint2(_284.Load((t + 4u) * 4 + 8), _284.Load((t + 5u) * 4 + 8)));
+                    translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 8), _285.Load((t + 5u) * 4 + 8)));
                }
            }
            if (linewidth >= 0.0f)
@ -189,15 +191,14 @@ void comp_main()
                case 68u:
                case 72u:
                {
-                    _284.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store(di * 4 + 8, asuint(linewidth));
                    break;
                }
                case 276u:
                {
-                    _284.Store(di * 4 + 8, asuint(linewidth));
-                    uint index = _102.Load(dd * 4 + 0);
-                    float2 p0 = asfloat(uint2(_102.Load((dd + 1u) * 4 + 0), _102.Load((dd + 2u) * 4 + 0)));
-                    float2 p1 = asfloat(uint2(_102.Load((dd + 3u) * 4 + 0), _102.Load((dd + 4u) * 4 + 0)));
+                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+                    p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
                    p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
                    p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                    float2 dxy = p1 - p0;
@ -205,9 +206,38 @@ void comp_main()
                    float line_x = dxy.x * scale;
                    float line_y = dxy.y * scale;
                    float line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    _284.Store((di + 1u) * 4 + 8, asuint(line_x));
-                    _284.Store((di + 2u) * 4 + 8, asuint(line_y));
-                    _284.Store((di + 3u) * 4 + 8, asuint(line_c));
+                    _285.Store((di + 1u) * 4 + 8, asuint(line_x));
+                    _285.Store((di + 2u) * 4 + 8, asuint(line_y));
+                    _285.Store((di + 3u) * 4 + 8, asuint(line_c));
+                    break;
+                }
+                case 732u:
+                {
+                    p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0)));
+                    p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0)));
+                    float r0 = asfloat(_103.Load((dd + 5u) * 4 + 0));
+                    float r1 = asfloat(_103.Load((dd + 6u) * 4 + 0));
+                    float inv_det = 1.0f / ((mat.x * mat.w) - (mat.y * mat.z));
+                    float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det;
+                    float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+                    inv_tr += p0;
+                    float2 center1 = p1 - p0;
+                    float rr = r1 / (r1 - r0);
+                    float rainv = rr / ((r1 * r1) - dot(center1, center1));
+                    float2 c1 = center1 * rainv;
+                    float ra = rr * rainv;
+                    float roff = rr - 1.0f;
+                    _285.Store(di * 4 + 8, asuint(linewidth));
+                    _285.Store((di + 1u) * 4 + 8, asuint(inv_mat.x));
+                    _285.Store((di + 2u) * 4 + 8, asuint(inv_mat.y));
+                    _285.Store((di + 3u) * 4 + 8, asuint(inv_mat.z));
+                    _285.Store((di + 4u) * 4 + 8, asuint(inv_mat.w));
+                    _285.Store((di + 5u) * 4 + 8, asuint(inv_tr.x));
+                    _285.Store((di + 6u) * 4 + 8, asuint(inv_tr.y));
+                    _285.Store((di + 7u) * 4 + 8, asuint(c1.x));
+                    _285.Store((di + 8u) * 4 + 8, asuint(c1.y));
+                    _285.Store((di + 9u) * 4 + 8, asuint(ra));
+                    _285.Store((di + 10u) * 4 + 8, asuint(roff));
                    break;
                }
                case 5u:
@ -223,7 +253,7 @@ void comp_main()
            {
                path_ix = m.path_ix;
            }
-            _284.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
+            _285.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix);
        }
    }
 }
--- a/piet-gpu/shader/gen/draw_leaf.msl
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@ -124,7 +124,7 @@ static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
    uint has_path = uint(tag_word != 0u);
-    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
 }

 static inline __attribute__((always_inline))
@ -144,19 +144,19 @@ DrawMonoid draw_monoid_identity()
    return DrawMonoid{ 0u, 0u, 0u, 0u };
 }

-kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92 [[buffer(1)]], const device SceneBuf& _102 [[buffer(2)]], const device ParentBuf& _202 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(device Memory& _285 [[buffer(0)]], const device ConfigBuf& _93 [[buffer(1)]], const device SceneBuf& _103 [[buffer(2)]], const device ParentBuf& _203 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
    threadgroup DrawMonoid sh_scratch[256];
    uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _92.conf.drawtag_offset >> uint(2);
-    uint tag_word = _102.scene[drawtag_base + ix];
+    uint drawtag_base = _93.conf.drawtag_offset >> uint(2);
+    uint tag_word = _103.scene[drawtag_base + ix];
    uint param = tag_word;
    DrawMonoid agg = map_tag(param);
    spvUnsafeArray<DrawMonoid, 8> local;
    local[0] = agg;
    for (uint i = 1u; i < 8u; i++)
    {
-        tag_word = _102.scene[(drawtag_base + ix) + i];
+        tag_word = _103.scene[(drawtag_base + ix) + i];
        uint param_1 = tag_word;
        DrawMonoid param_2 = agg;
        DrawMonoid param_3 = map_tag(param_1);
@ -181,11 +181,11 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
    DrawMonoid row = draw_monoid_identity();
    if (gl_WorkGroupID.x > 0u)
    {
-        uint _205 = gl_WorkGroupID.x - 1u;
-        row.path_ix = _202.parent[_205].path_ix;
-        row.clip_ix = _202.parent[_205].clip_ix;
-        row.scene_offset = _202.parent[_205].scene_offset;
-        row.info_offset = _202.parent[_205].info_offset;
+        uint _206 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _203.parent[_206].path_ix;
+        row.clip_ix = _203.parent[_206].clip_ix;
+        row.scene_offset = _203.parent[_206].scene_offset;
+        row.info_offset = _203.parent[_206].info_offset;
    }
    if (gl_LocalInvocationID.x > 0u)
    {
@ -193,13 +193,15 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
        DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
        row = combine_draw_monoid(param_6, param_7);
    }
-    uint drawdata_base = _92.conf.drawdata_offset >> uint(2);
-    uint drawinfo_base = _92.conf.drawinfo_alloc.offset >> uint(2);
+    uint drawdata_base = _93.conf.drawdata_offset >> uint(2);
+    uint drawinfo_base = _93.conf.drawinfo_alloc.offset >> uint(2);
    uint out_ix = gl_GlobalInvocationID.x * 8u;
-    uint out_base = (_92.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
-    uint clip_out_base = _92.conf.clip_alloc.offset >> uint(2);
+    uint out_base = (_93.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
+    uint clip_out_base = _93.conf.clip_alloc.offset >> uint(2);
    float4 mat;
    float2 translate;
+    float2 p0;
+    float2 p1;
    for (uint i_2 = 0u; i_2 < 8u; i_2++)
    {
        DrawMonoid m = row;
@ -209,31 +211,31 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
            DrawMonoid param_9 = local[i_2 - 1u];
            m = combine_draw_monoid(param_8, param_9);
        }
-        _284.memory[out_base + (i_2 * 4u)] = m.path_ix;
-        _284.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
-        _284.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
-        _284.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
+        _285.memory[out_base + (i_2 * 4u)] = m.path_ix;
+        _285.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
+        _285.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
+        _285.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
        uint dd = drawdata_base + (m.scene_offset >> uint(2));
        uint di = drawinfo_base + (m.info_offset >> uint(2));
-        tag_word = _102.scene[(drawtag_base + ix) + i_2];
-        if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
+        tag_word = _103.scene[(drawtag_base + ix) + i_2];
+        if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u))
        {
-            uint bbox_offset = (_92.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
-            float bbox_l = float(_284.memory[bbox_offset]) - 32768.0;
-            float bbox_t = float(_284.memory[bbox_offset + 1u]) - 32768.0;
-            float bbox_r = float(_284.memory[bbox_offset + 2u]) - 32768.0;
-            float bbox_b = float(_284.memory[bbox_offset + 3u]) - 32768.0;
+            uint bbox_offset = (_93.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
+            float bbox_l = float(_285.memory[bbox_offset]) - 32768.0;
+            float bbox_t = float(_285.memory[bbox_offset + 1u]) - 32768.0;
+            float bbox_r = float(_285.memory[bbox_offset + 2u]) - 32768.0;
+            float bbox_b = float(_285.memory[bbox_offset + 3u]) - 32768.0;
            float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
-            float linewidth = as_type<float>(_284.memory[bbox_offset + 4u]);
+            float linewidth = as_type<float>(_285.memory[bbox_offset + 4u]);
            uint fill_mode = uint(linewidth >= 0.0);
-            if ((linewidth >= 0.0) || (tag_word == 276u))
+            if (((linewidth >= 0.0) || (tag_word == 276u)) || (tag_word == 732u))
            {
-                uint trans_ix = _284.memory[bbox_offset + 5u];
-                uint t = (_92.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
-                mat = as_type<float4>(uint4(_284.memory[t], _284.memory[t + 1u], _284.memory[t + 2u], _284.memory[t + 3u]));
-                if (tag_word == 276u)
+                uint trans_ix = _285.memory[bbox_offset + 5u];
+                uint t = (_93.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
+                mat = as_type<float4>(uint4(_285.memory[t], _285.memory[t + 1u], _285.memory[t + 2u], _285.memory[t + 3u]));
+                if ((tag_word == 276u) || (tag_word == 732u))
                {
-                    translate = as_type<float2>(uint2(_284.memory[t + 4u], _284.memory[t + 5u]));
+                    translate = as_type<float2>(uint2(_285.memory[t + 4u], _285.memory[t + 5u]));
                }
            }
            if (linewidth >= 0.0)
@ -245,15 +247,14 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
                case 68u:
                case 72u:
                {
-                    _284.memory[di] = as_type<uint>(linewidth);
+                    _285.memory[di] = as_type<uint>(linewidth);
                    break;
                }
                case 276u:
                {
-                    _284.memory[di] = as_type<uint>(linewidth);
-                    uint index = _102.scene[dd];
-                    float2 p0 = as_type<float2>(uint2(_102.scene[dd + 1u], _102.scene[dd + 2u]));
-                    float2 p1 = as_type<float2>(uint2(_102.scene[dd + 3u], _102.scene[dd + 4u]));
+                    _285.memory[di] = as_type<uint>(linewidth);
+                    p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+                    p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
                    p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
                    p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
                    float2 dxy = p1 - p0;
@ -261,9 +262,38 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
                    float line_x = dxy.x * scale;
                    float line_y = dxy.y * scale;
                    float line_c = -((p0.x * line_x) + (p0.y * line_y));
-                    _284.memory[di + 1u] = as_type<uint>(line_x);
-                    _284.memory[di + 2u] = as_type<uint>(line_y);
-                    _284.memory[di + 3u] = as_type<uint>(line_c);
+                    _285.memory[di + 1u] = as_type<uint>(line_x);
+                    _285.memory[di + 2u] = as_type<uint>(line_y);
+                    _285.memory[di + 3u] = as_type<uint>(line_c);
+                    break;
+                }
+                case 732u:
+                {
+                    p0 = as_type<float2>(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u]));
+                    p1 = as_type<float2>(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u]));
+                    float r0 = as_type<float>(_103.scene[dd + 5u]);
+                    float r1 = as_type<float>(_103.scene[dd + 6u]);
+                    float inv_det = 1.0 / ((mat.x * mat.w) - (mat.y * mat.z));
+                    float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det;
+                    float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y);
+                    inv_tr += p0;
+                    float2 center1 = p1 - p0;
+                    float rr = r1 / (r1 - r0);
+                    float rainv = rr / ((r1 * r1) - dot(center1, center1));
+                    float2 c1 = center1 * rainv;
+                    float ra = rr * rainv;
+                    float roff = rr - 1.0;
+                    _285.memory[di] = as_type<uint>(linewidth);
+                    _285.memory[di + 1u] = as_type<uint>(inv_mat.x);
+                    _285.memory[di + 2u] = as_type<uint>(inv_mat.y);
+                    _285.memory[di + 3u] = as_type<uint>(inv_mat.z);
+                    _285.memory[di + 4u] = as_type<uint>(inv_mat.w);
+                    _285.memory[di + 5u] = as_type<uint>(inv_tr.x);
+                    _285.memory[di + 6u] = as_type<uint>(inv_tr.y);
+                    _285.memory[di + 7u] = as_type<uint>(c1.x);
+                    _285.memory[di + 8u] = as_type<uint>(c1.y);
+                    _285.memory[di + 9u] = as_type<uint>(ra);
+                    _285.memory[di + 10u] = as_type<uint>(roff);
                    break;
                }
                case 5u:
@ -279,7 +309,7 @@ kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92
            {
                path_ix = m.path_ix;
            }
-            _284.memory[clip_out_base + m.clip_ix] = path_ix;
+            _285.memory[clip_out_base + m.clip_ix] = path_ix;
        }
    }
 }
--- a/piet-gpu/shader/gen/draw_leaf.spv
+++ b/piet-gpu/shader/gen/draw_leaf.spv
--- a/piet-gpu/shader/gen/draw_reduce.dxil
+++ b/piet-gpu/shader/gen/draw_reduce.dxil
--- a/piet-gpu/shader/gen/draw_reduce.hlsl
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@ -44,10 +44,10 @@ struct Config

 static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

-ByteAddressBuffer _86 : register(t1, space0);
-ByteAddressBuffer _96 : register(t2, space0);
-RWByteAddressBuffer _187 : register(u3, space0);
-RWByteAddressBuffer _205 : register(u0, space0);
+ByteAddressBuffer _87 : register(t1, space0);
+ByteAddressBuffer _97 : register(t2, space0);
+RWByteAddressBuffer _188 : register(u3, space0);
+RWByteAddressBuffer _206 : register(u0, space0);

 static uint3 gl_WorkGroupID;
 static uint3 gl_LocalInvocationID;
@ -64,8 +64,8 @@ groupshared DrawMonoid sh_scratch[256];
 DrawMonoid map_tag(uint tag_word)
 {
    uint has_path = uint(tag_word != 0u);
-    DrawMonoid _69 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
-    return _69;
+    DrawMonoid _70 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
+    return _70;
 }

 DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
@ -81,13 +81,13 @@ DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b)
 void comp_main()
 {
    uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _86.Load(100) >> uint(2);
-    uint tag_word = _96.Load((drawtag_base + ix) * 4 + 0);
+    uint drawtag_base = _87.Load(100) >> uint(2);
+    uint tag_word = _97.Load((drawtag_base + ix) * 4 + 0);
    uint param = tag_word;
    DrawMonoid agg = map_tag(param);
    for (uint i = 1u; i < 8u; i++)
    {
-        uint tag_word_1 = _96.Load(((drawtag_base + ix) + i) * 4 + 0);
+        uint tag_word_1 = _97.Load(((drawtag_base + ix) + i) * 4 + 0);
        uint param_1 = tag_word_1;
        DrawMonoid param_2 = agg;
        DrawMonoid param_3 = map_tag(param_1);
@ -109,10 +109,10 @@ void comp_main()
    }
    if (gl_LocalInvocationID.x == 0u)
    {
-        _187.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
-        _187.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
-        _187.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
-        _187.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
+        _188.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix);
+        _188.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix);
+        _188.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset);
+        _188.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset);
    }
 }

--- a/piet-gpu/shader/gen/draw_reduce.msl
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@ -85,7 +85,7 @@ static inline __attribute__((always_inline))
 DrawMonoid map_tag(thread const uint& tag_word)
 {
    uint has_path = uint(tag_word != 0u);
-    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
+    return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u };
 }

 static inline __attribute__((always_inline))
@ -99,17 +99,17 @@ DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMono
    return c;
 }

-kernel void main0(const device ConfigBuf& _86 [[buffer(1)]], const device SceneBuf& _96 [[buffer(2)]], device OutBuf& _187 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+kernel void main0(const device ConfigBuf& _87 [[buffer(1)]], const device SceneBuf& _97 [[buffer(2)]], device OutBuf& _188 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
 {
    threadgroup DrawMonoid sh_scratch[256];
    uint ix = gl_GlobalInvocationID.x * 8u;
-    uint drawtag_base = _86.conf.drawtag_offset >> uint(2);
-    uint tag_word = _96.scene[drawtag_base + ix];
+    uint drawtag_base = _87.conf.drawtag_offset >> uint(2);
+    uint tag_word = _97.scene[drawtag_base + ix];
    uint param = tag_word;
    DrawMonoid agg = map_tag(param);
    for (uint i = 1u; i < 8u; i++)
    {
-        uint tag_word_1 = _96.scene[(drawtag_base + ix) + i];
+        uint tag_word_1 = _97.scene[(drawtag_base + ix) + i];
        uint param_1 = tag_word_1;
        DrawMonoid param_2 = agg;
        DrawMonoid param_3 = map_tag(param_1);
@ -131,10 +131,10 @@ kernel void main0(const device ConfigBuf& _86 [[buffer(1)]], const device SceneB
    }
    if (gl_LocalInvocationID.x == 0u)
    {
-        _187.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
-        _187.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
-        _187.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
-        _187.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
+        _188.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _188.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+        _188.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset;
+        _188.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset;
    }
 }

--- a/piet-gpu/shader/gen/draw_reduce.spv
+++ b/piet-gpu/shader/gen/draw_reduce.spv
--- a/piet-gpu/shader/gen/kernel4.dxil
+++ b/piet-gpu/shader/gen/kernel4.dxil
--- a/piet-gpu/shader/gen/kernel4.hlsl
+++ b/piet-gpu/shader/gen/kernel4.hlsl
@ -48,6 +48,21 @@ struct CmdLinGrad
    float line_c;
 };

+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
    uint offset;
@ -146,8 +161,8 @@ struct Config

 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);

-RWByteAddressBuffer _278 : register(u0, space0);
-ByteAddressBuffer _1521 : register(t1, space0);
+RWByteAddressBuffer _291 : register(u0, space0);
+ByteAddressBuffer _1666 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float4> image : register(u2, space0);
@ -174,8 +189,8 @@ float4 spvUnpackUnorm4x8(uint value)

 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _291 = { a.offset + offset };
-    return _291;
+    Alloc _304 = { a.offset + offset };
+    return _304;
 }

 bool touch_mem(Alloc alloc, uint offset)
@ -191,7 +206,7 @@ uint read_mem(Alloc alloc, uint offset)
    {
        return 0u;
    }
-    uint v = _278.Load(offset * 4 + 8);
+    uint v = _291.Load(offset * 4 + 8);
    return v;
 }

@ -200,8 +215,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _525;
+    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _663;
 }

 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@ -221,9 +236,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)

 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _542 = { ref.offset + 4u };
+    CmdStrokeRef _679 = { ref.offset + 4u };
    Alloc param = a;
-    CmdStrokeRef param_1 = _542;
+    CmdStrokeRef param_1 = _679;
    return CmdStroke_read(param, param_1);
 }

@ -259,8 +274,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
    s.origin = float2(asfloat(raw0), asfloat(raw1));
    s._vector = float2(asfloat(raw2), asfloat(raw3));
    s.y_edge = asfloat(raw4);
-    TileSegRef _675 = { raw5 };
-    s.next = _675;
+    TileSegRef _820 = { raw5 };
+    s.next = _820;
    return s;
 }

@ -286,9 +301,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)

 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _532 = { ref.offset + 4u };
+    CmdFillRef _669 = { ref.offset + 4u };
    Alloc param = a;
-    CmdFillRef param_1 = _532;
+    CmdFillRef param_1 = _669;
    return CmdFill_read(param, param_1);
 }

@ -305,9 +320,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)

 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _552 = { ref.offset + 4u };
+    CmdAlphaRef _689 = { ref.offset + 4u };
    Alloc param = a;
-    CmdAlphaRef param_1 = _552;
+    CmdAlphaRef param_1 = _689;
    return CmdAlpha_read(param, param_1);
 }

@ -324,9 +339,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)

 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _562 = { ref.offset + 4u };
+    CmdColorRef _699 = { ref.offset + 4u };
    Alloc param = a;
-    CmdColorRef param_1 = _562;
+    CmdColorRef param_1 = _699;
    return CmdColor_read(param, param_1);
 }

@ -370,12 +385,66 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)

 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _572 = { ref.offset + 4u };
+    CmdLinGradRef _709 = { ref.offset + 4u };
    Alloc param = a;
-    CmdLinGradRef param_1 = _572;
+    CmdLinGradRef param_1 = _709;
    return CmdLinGrad_read(param, param_1);
 }

+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4));
+    s.xlat = float2(asfloat(raw5), asfloat(raw6));
+    s.c1 = float2(asfloat(raw7), asfloat(raw8));
+    s.ra = asfloat(raw9);
+    s.roff = asfloat(raw10);
+    return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{
+    CmdRadGradRef _719 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdRadGradRef param_1 = _719;
+    return CmdRadGrad_read(param, param_1);
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 {
    uint ix = ref.offset >> uint(2);
@ -393,9 +462,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)

 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _582 = { ref.offset + 4u };
+    CmdImageRef _729 = { ref.offset + 4u };
    Alloc param = a;
-    CmdImageRef param_1 = _582;
+    CmdImageRef param_1 = _729;
    return CmdImage_read(param, param_1);
 }

@ -408,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas[uv];
        float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
        rgba[i] = fg_rgba;
    }
    spvReturnValue = rgba;
@ -445,9 +514,9 @@ CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)

 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _592 = { ref.offset + 4u };
+    CmdEndClipRef _739 = { ref.offset + 4u };
    Alloc param = a;
-    CmdEndClipRef param_1 = _592;
+    CmdEndClipRef param_1 = _739;
    return CmdEndClip_read(param, param_1);
 }

@ -637,8 +706,8 @@ float3 set_lum(float3 c, float l)
 {
    float3 param = c;
    float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }

 float3 mix_blend(float3 cb, float3 cs, uint mode)
@ -726,9 +795,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
            float3 param_20 = cb;
            float3 param_21 = cs;
            float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
            float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
            float param_25 = lum(param_23);
            b = set_lum(param_24, param_25);
            break;
@ -738,9 +807,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
            float3 param_26 = cs;
            float3 param_27 = cb;
            float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
            float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
            float param_31 = lum(param_29);
            b = set_lum(param_30, param_31);
            break;
@ -877,24 +946,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)

 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _602 = { ref.offset + 4u };
+    CmdJumpRef _749 = { ref.offset + 4u };
    Alloc param = a;
-    CmdJumpRef param_1 = _602;
+    CmdJumpRef param_1 = _749;
    return CmdJump_read(param, param_1);
 }

 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1536;
-    _1536.offset = _1521.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1681;
+    _1681.offset = _1666.Load(24);
    Alloc param;
-    param.offset = _1536.offset;
+    param.offset = _1681.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1545 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1545;
+    CmdRef _1690 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1690;
    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
    float2 xy = float2(xy_uint);
    float4 rgba[8];
@ -903,7 +972,7 @@ void comp_main()
        rgba[i] = 0.0f.xxxx;
    }
    uint clip_depth = 0u;
-    bool mem_ok = _278.Load(4) == 0u;
+    bool mem_ok = _291.Load(4) == 0u;
    float df[8];
    TileSegRef tile_seg_ref;
    float area[8];
@ -928,8 +997,8 @@ void comp_main()
                {
                    df[k] = 1000000000.0f;
                }
-                TileSegRef _1638 = { stroke.tile_ref };
-                tile_seg_ref = _1638;
+                TileSegRef _1784 = { stroke.tile_ref };
+                tile_seg_ref = _1784;
                do
                {
                    uint param_7 = tile_seg_ref.offset;
@ -965,8 +1034,8 @@ void comp_main()
                {
                    area[k_3] = float(fill.backdrop);
                }
-                TileSegRef _1758 = { fill.tile_ref };
-                tile_seg_ref = _1758;
+                TileSegRef _1904 = { fill.tile_ref };
+                tile_seg_ref = _1904;
                do
                {
                    uint param_15 = tile_seg_ref.offset;
@ -1055,11 +1124,12 @@ void comp_main()
                    int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                    float4 fg_rgba = gradients[int2(x, int(lin.index))];
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                }
                cmd_ref.offset += 20u;
                break;
@ -1068,74 +1138,100 @@ void comp_main()
            {
                Alloc param_30 = cmd_alloc;
                CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                float4 _2121[8];
-                fillImage(_2121, param_32, param_33);
-                float4 img[8] = _2121;
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
                for (uint k_10 = 0u; k_10 < 8u; k_10++)
                {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
+                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                break;
            }
            case 8u:
            {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                float4 _2391[8];
+                fillImage(_2391, param_36, param_37);
+                float4 img[8] = _2391;
                for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                {
                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = 0.0f.xxxx;
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = 0.0f.xxxx;
                }
                clip_depth++;
                cmd_ref.offset += 4u;
                break;
            }
-            case 9u:
+            case 10u:
            {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
                uint blend_mode = end_clip.blend >> uint(8);
                uint comp_mode = end_clip.blend & 255u;
                clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                {
                    uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                }
                cmd_ref.offset += 8u;
                break;
            }
-            case 10u:
+            case 11u:
            {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2299;
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
+                cmd_ref = _2569;
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1143,9 +1239,9 @@ void comp_main()
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_48 = i_1;
-        float3 param_49 = rgba[i_1].xyz;
-        image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w);
+        uint param_52 = i_1;
+        float3 param_53 = rgba[i_1].xyz;
+        image[int2(xy_uint + chunk_offset(param_52))] = float4(tosRGB(param_53), rgba[i_1].w);
    }
 }

--- a/piet-gpu/shader/gen/kernel4.msl
+++ b/piet-gpu/shader/gen/kernel4.msl
@ -94,6 +94,21 @@ struct CmdLinGrad
    float line_c;
 };

+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
    uint offset;
@ -222,7 +237,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }

 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -230,29 +245,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
    {
        return 0u;
    }
-    uint v = v_278.memory[offset];
+    uint v = v_291.memory[offset];
    return v;
 }

 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_278);
+    uint tag_and_flags = read_mem(param, param_1, v_291);
    return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }

 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    CmdStroke s;
    s.tile_ref = raw0;
    s.half_width = as_type<float>(raw1);
@ -260,11 +275,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }

 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_278);
+    return CmdStroke_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -276,27 +291,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }

 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
    Alloc param_8 = a;
    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_278);
+    uint raw4 = read_mem(param_8, param_9, v_291);
    Alloc param_10 = a;
    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_278);
+    uint raw5 = read_mem(param_10, param_11, v_291);
    TileSeg s;
    s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
    s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@ -312,15 +327,15 @@ uint2 chunk_offset(thread const uint& i)
 }

 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    CmdFill s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
@ -328,51 +343,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }

 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_278);
+    return CmdFill_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdAlpha s;
    s.alpha = as_type<float>(raw0);
    return s;
 }

 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_278);
+    return CmdAlpha_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdColor s;
    s.rgba_color = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_278);
+    return CmdColor_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -393,21 +408,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }

 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
    CmdLinGrad s;
    s.index = raw0;
    s.line_x = as_type<float>(raw1);
@ -417,23 +432,78 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }

 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_278);
+    return CmdLinGrad_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_291);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_291);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_291);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_291);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_291);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_291);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_291);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19, v_291);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21, v_291);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
+    s.xlat = float2(as_type<float>(raw5), as_type<float>(raw6));
+    s.c1 = float2(as_type<float>(raw7), as_type<float>(raw8));
+    s.ra = as_type<float>(raw9);
+    s.roff = as_type<float>(raw10);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+{
+    Alloc param = a;
+    CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
+    return CmdRadGrad_read(param, param_1, v_291);
+}
+
+static inline __attribute__((always_inline))
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_291);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_291);
    CmdImage s;
    s.index = raw0;
    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@ -441,11 +511,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }

 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_278);
+    return CmdImage_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -458,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas.read(uint2(uv));
        float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
        rgba[i] = fg_rgba;
    }
    return rgba;
@ -485,23 +555,23 @@ uint packsRGB(thread float4& rgba)
 }

 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdEndClip s;
    s.blend = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_278);
+    return CmdEndClip_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -701,8 +771,8 @@ float3 set_lum(thread const float3& c, thread const float& l)
 {
    float3 param = c;
    float3 param_1 = c + float3(l - lum(param));
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }

 static inline __attribute__((always_inline))
@ -791,9 +861,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
            float3 param_20 = cb;
            float3 param_21 = cs;
            float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
            float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
            float param_25 = lum(param_23);
            b = set_lum(param_24, param_25);
            break;
@ -803,9 +873,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
            float3 param_26 = cs;
            float3 param_27 = cb;
            float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
            float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
            float param_31 = lum(param_29);
            b = set_lum(param_30, param_31);
            break;
@ -931,30 +1001,30 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
 }

 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdJump s;
    s.new_ref = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_278);
+    return CmdJump_read(param, param_1, v_291);
 }

-kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
    Alloc param;
-    param.offset = _1521.conf.ptcl_alloc.offset;
+    param.offset = _1666.conf.ptcl_alloc.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@ -967,7 +1037,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
        rgba[i] = float4(0.0);
    }
    uint clip_depth = 0u;
-    bool mem_ok = v_278.mem_error == 0u;
+    bool mem_ok = v_291.mem_error == 0u;
    spvUnsafeArray<float, 8> df;
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
@ -976,7 +1046,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
    {
        Alloc param_3 = cmd_alloc;
        CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
        if (tag == 0u)
        {
            break;
@ -987,7 +1057,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_5 = cmd_alloc;
                CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
                for (uint k = 0u; k < 8u; k++)
                {
                    df[k] = 1000000000.0;
@ -1000,7 +1070,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                    bool param_9 = mem_ok;
                    Alloc param_10 = new_alloc(param_7, param_8, param_9);
                    TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
                    float2 line_vec = seg.vector;
                    for (uint k_1 = 0u; k_1 < 8u; k_1++)
                    {
@ -1023,7 +1093,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_13 = cmd_alloc;
                CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
                for (uint k_3 = 0u; k_3 < 8u; k_3++)
                {
                    area[k_3] = float(fill.backdrop);
@ -1036,7 +1106,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                    bool param_17 = mem_ok;
                    Alloc param_18 = new_alloc(param_15, param_16, param_17);
                    TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
                    for (uint k_4 = 0u; k_4 < 8u; k_4++)
                    {
                        uint param_20 = k_4;
@ -1080,7 +1150,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_21 = cmd_alloc;
                CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
                for (uint k_7 = 0u; k_7 < 8u; k_7++)
                {
                    area[k_7] = alpha.alpha;
@ -1092,7 +1162,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_23 = cmd_alloc;
                CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
                uint param_25 = color.rgba_color;
                float4 fg = unpacksRGB(param_25);
                for (uint k_8 = 0u; k_8 < 8u; k_8++)
@ -1107,7 +1177,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_26 = cmd_alloc;
                CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
                float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                for (uint k_9 = 0u; k_9 < 8u; k_9++)
                {
@ -1117,11 +1187,12 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                    int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                    float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                }
                cmd_ref.offset += 20u;
                break;
@ -1130,72 +1201,98 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_30 = cmd_alloc;
                CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                spvUnsafeArray<float4, 8> img;
-                img = fillImage(param_32, param_33, image_atlas);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
                for (uint k_10 = 0u; k_10 < 8u; k_10++)
                {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
+                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                break;
            }
            case 8u:
            {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                spvUnsafeArray<float4, 8> img;
+                img = fillImage(param_36, param_37, image_atlas);
                for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                {
                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = float4(0.0);
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = float4(0.0);
                }
                clip_depth++;
                cmd_ref.offset += 4u;
                break;
            }
-            case 9u:
+            case 10u:
            {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
                uint blend_mode = end_clip.blend >> uint(8);
                uint comp_mode = end_clip.blend & 255u;
                clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                {
                    uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                }
                cmd_ref.offset += 8u;
                break;
            }
-            case 10u:
+            case 11u:
            {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1203,9 +1300,9 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_48 = i_1;
-        float3 param_49 = rgba[i_1].xyz;
-        image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
+        uint param_52 = i_1;
+        float3 param_53 = rgba[i_1].xyz;
+        image.write(float4(tosRGB(param_53), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
    }
 }

--- a/piet-gpu/shader/gen/kernel4.spv
+++ b/piet-gpu/shader/gen/kernel4.spv
--- a/piet-gpu/shader/gen/kernel4_gray.dxil
+++ b/piet-gpu/shader/gen/kernel4_gray.dxil
--- a/piet-gpu/shader/gen/kernel4_gray.hlsl
+++ b/piet-gpu/shader/gen/kernel4_gray.hlsl
@ -48,6 +48,21 @@ struct CmdLinGrad
    float line_c;
 };

+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
    uint offset;
@ -146,8 +161,8 @@ struct Config

 static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);

-RWByteAddressBuffer _278 : register(u0, space0);
-ByteAddressBuffer _1521 : register(t1, space0);
+RWByteAddressBuffer _291 : register(u0, space0);
+ByteAddressBuffer _1666 : register(t1, space0);
 RWTexture2D<unorm float4> image_atlas : register(u3, space0);
 RWTexture2D<unorm float4> gradients : register(u4, space0);
 RWTexture2D<unorm float> image : register(u2, space0);
@ -174,8 +189,8 @@ float4 spvUnpackUnorm4x8(uint value)

 Alloc slice_mem(Alloc a, uint offset, uint size)
 {
-    Alloc _291 = { a.offset + offset };
-    return _291;
+    Alloc _304 = { a.offset + offset };
+    return _304;
 }

 bool touch_mem(Alloc alloc, uint offset)
@ -191,7 +206,7 @@ uint read_mem(Alloc alloc, uint offset)
    {
        return 0u;
    }
-    uint v = _278.Load(offset * 4 + 8);
+    uint v = _291.Load(offset * 4 + 8);
    return v;
 }

@ -200,8 +215,8 @@ CmdTag Cmd_tag(Alloc a, CmdRef ref)
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
    uint tag_and_flags = read_mem(param, param_1);
-    CmdTag _525 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
-    return _525;
+    CmdTag _663 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _663;
 }

 CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
@ -221,9 +236,9 @@ CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)

 CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
 {
-    CmdStrokeRef _542 = { ref.offset + 4u };
+    CmdStrokeRef _679 = { ref.offset + 4u };
    Alloc param = a;
-    CmdStrokeRef param_1 = _542;
+    CmdStrokeRef param_1 = _679;
    return CmdStroke_read(param, param_1);
 }

@ -259,8 +274,8 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref)
    s.origin = float2(asfloat(raw0), asfloat(raw1));
    s._vector = float2(asfloat(raw2), asfloat(raw3));
    s.y_edge = asfloat(raw4);
-    TileSegRef _675 = { raw5 };
-    s.next = _675;
+    TileSegRef _820 = { raw5 };
+    s.next = _820;
    return s;
 }

@ -286,9 +301,9 @@ CmdFill CmdFill_read(Alloc a, CmdFillRef ref)

 CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
 {
-    CmdFillRef _532 = { ref.offset + 4u };
+    CmdFillRef _669 = { ref.offset + 4u };
    Alloc param = a;
-    CmdFillRef param_1 = _532;
+    CmdFillRef param_1 = _669;
    return CmdFill_read(param, param_1);
 }

@ -305,9 +320,9 @@ CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)

 CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
 {
-    CmdAlphaRef _552 = { ref.offset + 4u };
+    CmdAlphaRef _689 = { ref.offset + 4u };
    Alloc param = a;
-    CmdAlphaRef param_1 = _552;
+    CmdAlphaRef param_1 = _689;
    return CmdAlpha_read(param, param_1);
 }

@ -324,9 +339,9 @@ CmdColor CmdColor_read(Alloc a, CmdColorRef ref)

 CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
 {
-    CmdColorRef _562 = { ref.offset + 4u };
+    CmdColorRef _699 = { ref.offset + 4u };
    Alloc param = a;
-    CmdColorRef param_1 = _562;
+    CmdColorRef param_1 = _699;
    return CmdColor_read(param, param_1);
 }

@ -370,12 +385,66 @@ CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)

 CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
 {
-    CmdLinGradRef _572 = { ref.offset + 4u };
+    CmdLinGradRef _709 = { ref.offset + 4u };
    Alloc param = a;
-    CmdLinGradRef param_1 = _572;
+    CmdLinGradRef param_1 = _709;
    return CmdLinGrad_read(param, param_1);
 }

+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4));
+    s.xlat = float2(asfloat(raw5), asfloat(raw6));
+    s.c1 = float2(asfloat(raw7), asfloat(raw8));
+    s.ra = asfloat(raw9);
+    s.roff = asfloat(raw10);
+    return s;
+}
+
+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref)
+{
+    CmdRadGradRef _719 = { ref.offset + 4u };
+    Alloc param = a;
+    CmdRadGradRef param_1 = _719;
+    return CmdRadGrad_read(param, param_1);
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
 {
    uint ix = ref.offset >> uint(2);
@ -393,9 +462,9 @@ CmdImage CmdImage_read(Alloc a, CmdImageRef ref)

 CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
 {
-    CmdImageRef _582 = { ref.offset + 4u };
+    CmdImageRef _729 = { ref.offset + 4u };
    Alloc param = a;
-    CmdImageRef param_1 = _582;
+    CmdImageRef param_1 = _729;
    return CmdImage_read(param, param_1);
 }

@ -408,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas[uv];
        float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
        rgba[i] = fg_rgba;
    }
    spvReturnValue = rgba;
@ -445,9 +514,9 @@ CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref)

 CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref)
 {
-    CmdEndClipRef _592 = { ref.offset + 4u };
+    CmdEndClipRef _739 = { ref.offset + 4u };
    Alloc param = a;
-    CmdEndClipRef param_1 = _592;
+    CmdEndClipRef param_1 = _739;
    return CmdEndClip_read(param, param_1);
 }

@ -637,8 +706,8 @@ float3 set_lum(float3 c, float l)
 {
    float3 param = c;
    float3 param_1 = c + (l - lum(param)).xxx;
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }

 float3 mix_blend(float3 cb, float3 cs, uint mode)
@ -726,9 +795,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
            float3 param_20 = cb;
            float3 param_21 = cs;
            float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
            float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
            float param_25 = lum(param_23);
            b = set_lum(param_24, param_25);
            break;
@ -738,9 +807,9 @@ float3 mix_blend(float3 cb, float3 cs, uint mode)
            float3 param_26 = cs;
            float3 param_27 = cb;
            float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
            float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
            float param_31 = lum(param_29);
            b = set_lum(param_30, param_31);
            break;
@ -877,24 +946,24 @@ CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)

 CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
 {
-    CmdJumpRef _602 = { ref.offset + 4u };
+    CmdJumpRef _749 = { ref.offset + 4u };
    Alloc param = a;
-    CmdJumpRef param_1 = _602;
+    CmdJumpRef param_1 = _749;
    return CmdJump_read(param, param_1);
 }

 void comp_main()
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.Load(8)) + gl_WorkGroupID.x;
-    Alloc _1536;
-    _1536.offset = _1521.Load(24);
+    uint tile_ix = (gl_WorkGroupID.y * _1666.Load(8)) + gl_WorkGroupID.x;
+    Alloc _1681;
+    _1681.offset = _1666.Load(24);
    Alloc param;
-    param.offset = _1536.offset;
+    param.offset = _1681.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
-    CmdRef _1545 = { cmd_alloc.offset };
-    CmdRef cmd_ref = _1545;
+    CmdRef _1690 = { cmd_alloc.offset };
+    CmdRef cmd_ref = _1690;
    uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
    float2 xy = float2(xy_uint);
    float4 rgba[8];
@ -903,7 +972,7 @@ void comp_main()
        rgba[i] = 0.0f.xxxx;
    }
    uint clip_depth = 0u;
-    bool mem_ok = _278.Load(4) == 0u;
+    bool mem_ok = _291.Load(4) == 0u;
    float df[8];
    TileSegRef tile_seg_ref;
    float area[8];
@ -928,8 +997,8 @@ void comp_main()
                {
                    df[k] = 1000000000.0f;
                }
-                TileSegRef _1638 = { stroke.tile_ref };
-                tile_seg_ref = _1638;
+                TileSegRef _1784 = { stroke.tile_ref };
+                tile_seg_ref = _1784;
                do
                {
                    uint param_7 = tile_seg_ref.offset;
@ -965,8 +1034,8 @@ void comp_main()
                {
                    area[k_3] = float(fill.backdrop);
                }
-                TileSegRef _1758 = { fill.tile_ref };
-                tile_seg_ref = _1758;
+                TileSegRef _1904 = { fill.tile_ref };
+                tile_seg_ref = _1904;
                do
                {
                    uint param_15 = tile_seg_ref.offset;
@ -1055,11 +1124,12 @@ void comp_main()
                    int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
                    float4 fg_rgba = gradients[int2(x, int(lin.index))];
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1;
                }
                cmd_ref.offset += 20u;
                break;
@ -1068,74 +1138,100 @@ void comp_main()
            {
                Alloc param_30 = cmd_alloc;
                CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                float4 _2121[8];
-                fillImage(_2121, param_32, param_33);
-                float4 img[8] = _2121;
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31);
                for (uint k_10 = 0u; k_10 < 8u; k_10++)
                {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f));
+                    float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))];
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2;
                }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                break;
            }
            case 8u:
            {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                float4 _2391[8];
+                fillImage(_2391, param_36, param_37);
+                float4 img[8] = _2391;
                for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                {
                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = 0.0f.xxxx;
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = 0.0f.xxxx;
                }
                clip_depth++;
                cmd_ref.offset += 4u;
                break;
            }
-            case 9u:
+            case 10u:
            {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40);
                uint blend_mode = end_clip.blend >> uint(8);
                uint comp_mode = end_clip.blend & 255u;
                clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                {
                    uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = lerp(_2251.xyz, blend, float((_2255 * bg.w) > 0.0f).xxx);
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = lerp(_2521.xyz, blend, float((_2525 * bg.w) > 0.0f).xxx);
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                }
                cmd_ref.offset += 8u;
                break;
            }
-            case 10u:
+            case 11u:
            {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                CmdRef _2299 = { Cmd_Jump_read(param_46, param_47).new_ref };
-                cmd_ref = _2299;
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                CmdRef _2569 = { Cmd_Jump_read(param_50, param_51).new_ref };
+                cmd_ref = _2569;
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1143,8 +1239,8 @@ void comp_main()
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_48 = i_1;
-        image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x;
+        uint param_52 = i_1;
+        image[int2(xy_uint + chunk_offset(param_52))] = rgba[i_1].w.x;
    }
 }

--- a/piet-gpu/shader/gen/kernel4_gray.msl
+++ b/piet-gpu/shader/gen/kernel4_gray.msl
@ -94,6 +94,21 @@ struct CmdLinGrad
    float line_c;
 };

+struct CmdRadGradRef
+{
+    uint offset;
+};
+
+struct CmdRadGrad
+{
+    uint index;
+    float4 mat;
+    float2 xlat;
+    float2 c1;
+    float ra;
+    float roff;
+};
+
 struct CmdImageRef
 {
    uint offset;
@ -222,7 +237,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
 }

 static inline __attribute__((always_inline))
-uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_278)
+uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_291)
 {
    Alloc param = alloc;
    uint param_1 = offset;
@ -230,29 +245,29 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor
    {
        return 0u;
    }
-    uint v = v_278.memory[offset];
+    uint v = v_291.memory[offset];
    return v;
 }

 static inline __attribute__((always_inline))
-CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    uint param_1 = ref.offset >> uint(2);
-    uint tag_and_flags = read_mem(param, param_1, v_278);
+    uint tag_and_flags = read_mem(param, param_1, v_291);
    return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
 }

 static inline __attribute__((always_inline))
-CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_278)
+CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    CmdStroke s;
    s.tile_ref = raw0;
    s.half_width = as_type<float>(raw1);
@ -260,11 +275,11 @@ CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref,
 }

 static inline __attribute__((always_inline))
-CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
-    return CmdStroke_read(param, param_1, v_278);
+    return CmdStroke_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -276,27 +291,27 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const
 }

 static inline __attribute__((always_inline))
-TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_278)
+TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
    Alloc param_8 = a;
    uint param_9 = ix + 4u;
-    uint raw4 = read_mem(param_8, param_9, v_278);
+    uint raw4 = read_mem(param_8, param_9, v_291);
    Alloc param_10 = a;
    uint param_11 = ix + 5u;
-    uint raw5 = read_mem(param_10, param_11, v_278);
+    uint raw5 = read_mem(param_10, param_11, v_291);
    TileSeg s;
    s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
    s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
@ -312,15 +327,15 @@ uint2 chunk_offset(thread const uint& i)
 }

 static inline __attribute__((always_inline))
-CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_278)
+CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    CmdFill s;
    s.tile_ref = raw0;
    s.backdrop = int(raw1);
@ -328,51 +343,51 @@ CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device
 }

 static inline __attribute__((always_inline))
-CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
-    return CmdFill_read(param, param_1, v_278);
+    return CmdFill_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
-CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_278)
+CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdAlpha s;
    s.alpha = as_type<float>(raw0);
    return s;
 }

 static inline __attribute__((always_inline))
-CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
-    return CmdAlpha_read(param, param_1, v_278);
+    return CmdAlpha_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
-CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_278)
+CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdColor s;
    s.rgba_color = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
-    return CmdColor_read(param, param_1, v_278);
+    return CmdColor_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -393,21 +408,21 @@ float4 unpacksRGB(thread const uint& srgba)
 }

 static inline __attribute__((always_inline))
-CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_278)
+CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
    Alloc param_4 = a;
    uint param_5 = ix + 2u;
-    uint raw2 = read_mem(param_4, param_5, v_278);
+    uint raw2 = read_mem(param_4, param_5, v_291);
    Alloc param_6 = a;
    uint param_7 = ix + 3u;
-    uint raw3 = read_mem(param_6, param_7, v_278);
+    uint raw3 = read_mem(param_6, param_7, v_291);
    CmdLinGrad s;
    s.index = raw0;
    s.line_x = as_type<float>(raw1);
@ -417,23 +432,78 @@ CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& re
 }

 static inline __attribute__((always_inline))
-CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
-    return CmdLinGrad_read(param, param_1, v_278);
+    return CmdLinGrad_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
-CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_278)
+CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    Alloc param_2 = a;
    uint param_3 = ix + 1u;
-    uint raw1 = read_mem(param_2, param_3, v_278);
+    uint raw1 = read_mem(param_2, param_3, v_291);
+    Alloc param_4 = a;
+    uint param_5 = ix + 2u;
+    uint raw2 = read_mem(param_4, param_5, v_291);
+    Alloc param_6 = a;
+    uint param_7 = ix + 3u;
+    uint raw3 = read_mem(param_6, param_7, v_291);
+    Alloc param_8 = a;
+    uint param_9 = ix + 4u;
+    uint raw4 = read_mem(param_8, param_9, v_291);
+    Alloc param_10 = a;
+    uint param_11 = ix + 5u;
+    uint raw5 = read_mem(param_10, param_11, v_291);
+    Alloc param_12 = a;
+    uint param_13 = ix + 6u;
+    uint raw6 = read_mem(param_12, param_13, v_291);
+    Alloc param_14 = a;
+    uint param_15 = ix + 7u;
+    uint raw7 = read_mem(param_14, param_15, v_291);
+    Alloc param_16 = a;
+    uint param_17 = ix + 8u;
+    uint raw8 = read_mem(param_16, param_17, v_291);
+    Alloc param_18 = a;
+    uint param_19 = ix + 9u;
+    uint raw9 = read_mem(param_18, param_19, v_291);
+    Alloc param_20 = a;
+    uint param_21 = ix + 10u;
+    uint raw10 = read_mem(param_20, param_21, v_291);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = float4(as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3), as_type<float>(raw4));
+    s.xlat = float2(as_type<float>(raw5), as_type<float>(raw6));
+    s.c1 = float2(as_type<float>(raw7), as_type<float>(raw8));
+    s.ra = as_type<float>(raw9);
+    s.roff = as_type<float>(raw10);
+    return s;
+}
+
+static inline __attribute__((always_inline))
+CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
+{
+    Alloc param = a;
+    CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u };
+    return CmdRadGrad_read(param, param_1, v_291);
+}
+
+static inline __attribute__((always_inline))
+CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_291)
+{
+    uint ix = ref.offset >> uint(2);
+    Alloc param = a;
+    uint param_1 = ix + 0u;
+    uint raw0 = read_mem(param, param_1, v_291);
+    Alloc param_2 = a;
+    uint param_3 = ix + 1u;
+    uint raw1 = read_mem(param_2, param_3, v_291);
    CmdImage s;
    s.index = raw0;
    s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
@ -441,11 +511,11 @@ CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, dev
 }

 static inline __attribute__((always_inline))
-CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
-    return CmdImage_read(param, param_1, v_278);
+    return CmdImage_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -458,10 +528,10 @@ spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImag
        int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
        float4 fg_rgba = image_atlas.read(uint2(uv));
        float3 param_1 = fg_rgba.xyz;
-        float3 _1493 = fromsRGB(param_1);
-        fg_rgba.x = _1493.x;
-        fg_rgba.y = _1493.y;
-        fg_rgba.z = _1493.z;
+        float3 _1638 = fromsRGB(param_1);
+        fg_rgba.x = _1638.x;
+        fg_rgba.y = _1638.y;
+        fg_rgba.z = _1638.z;
        rgba[i] = fg_rgba;
    }
    return rgba;
@ -485,23 +555,23 @@ uint packsRGB(thread float4& rgba)
 }

 static inline __attribute__((always_inline))
-CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_278)
+CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdEndClip s;
    s.blend = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u };
-    return CmdEndClip_read(param, param_1, v_278);
+    return CmdEndClip_read(param, param_1, v_291);
 }

 static inline __attribute__((always_inline))
@ -701,8 +771,8 @@ float3 set_lum(thread const float3& c, thread const float& l)
 {
    float3 param = c;
    float3 param_1 = c + float3(l - lum(param));
-    float3 _901 = clip_color(param_1);
-    return _901;
+    float3 _1046 = clip_color(param_1);
+    return _1046;
 }

 static inline __attribute__((always_inline))
@ -791,9 +861,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
            float3 param_20 = cb;
            float3 param_21 = cs;
            float param_22 = sat(param_20);
-            float3 _1192 = set_sat(param_21, param_22);
+            float3 _1337 = set_sat(param_21, param_22);
            float3 param_23 = cb;
-            float3 param_24 = _1192;
+            float3 param_24 = _1337;
            float param_25 = lum(param_23);
            b = set_lum(param_24, param_25);
            break;
@ -803,9 +873,9 @@ float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const
            float3 param_26 = cs;
            float3 param_27 = cb;
            float param_28 = sat(param_26);
-            float3 _1206 = set_sat(param_27, param_28);
+            float3 _1351 = set_sat(param_27, param_28);
            float3 param_29 = cb;
-            float3 param_30 = _1206;
+            float3 param_30 = _1351;
            float param_31 = lum(param_29);
            b = set_lum(param_30, param_31);
            break;
@ -931,30 +1001,30 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons
 }

 static inline __attribute__((always_inline))
-CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_278)
+CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_291)
 {
    uint ix = ref.offset >> uint(2);
    Alloc param = a;
    uint param_1 = ix + 0u;
-    uint raw0 = read_mem(param, param_1, v_278);
+    uint raw0 = read_mem(param, param_1, v_291);
    CmdJump s;
    s.new_ref = raw0;
    return s;
 }

 static inline __attribute__((always_inline))
-CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_278)
+CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_291)
 {
    Alloc param = a;
    CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
-    return CmdJump_read(param, param_1, v_278);
+    return CmdJump_read(param, param_1, v_291);
 }

-kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1521 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+kernel void main0(device Memory& v_291 [[buffer(0)]], const device ConfigBuf& _1666 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
-    uint tile_ix = (gl_WorkGroupID.y * _1521.conf.width_in_tiles) + gl_WorkGroupID.x;
+    uint tile_ix = (gl_WorkGroupID.y * _1666.conf.width_in_tiles) + gl_WorkGroupID.x;
    Alloc param;
-    param.offset = _1521.conf.ptcl_alloc.offset;
+    param.offset = _1666.conf.ptcl_alloc.offset;
    uint param_1 = tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
@ -967,7 +1037,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
        rgba[i] = float4(0.0);
    }
    uint clip_depth = 0u;
-    bool mem_ok = v_278.mem_error == 0u;
+    bool mem_ok = v_291.mem_error == 0u;
    spvUnsafeArray<float, 8> df;
    TileSegRef tile_seg_ref;
    spvUnsafeArray<float, 8> area;
@ -976,7 +1046,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
    {
        Alloc param_3 = cmd_alloc;
        CmdRef param_4 = cmd_ref;
-        uint tag = Cmd_tag(param_3, param_4, v_278).tag;
+        uint tag = Cmd_tag(param_3, param_4, v_291).tag;
        if (tag == 0u)
        {
            break;
@ -987,7 +1057,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_5 = cmd_alloc;
                CmdRef param_6 = cmd_ref;
-                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_278);
+                CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_291);
                for (uint k = 0u; k < 8u; k++)
                {
                    df[k] = 1000000000.0;
@ -1000,7 +1070,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                    bool param_9 = mem_ok;
                    Alloc param_10 = new_alloc(param_7, param_8, param_9);
                    TileSegRef param_11 = tile_seg_ref;
-                    TileSeg seg = TileSeg_read(param_10, param_11, v_278);
+                    TileSeg seg = TileSeg_read(param_10, param_11, v_291);
                    float2 line_vec = seg.vector;
                    for (uint k_1 = 0u; k_1 < 8u; k_1++)
                    {
@ -1023,7 +1093,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_13 = cmd_alloc;
                CmdRef param_14 = cmd_ref;
-                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_278);
+                CmdFill fill = Cmd_Fill_read(param_13, param_14, v_291);
                for (uint k_3 = 0u; k_3 < 8u; k_3++)
                {
                    area[k_3] = float(fill.backdrop);
@ -1036,7 +1106,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                    bool param_17 = mem_ok;
                    Alloc param_18 = new_alloc(param_15, param_16, param_17);
                    TileSegRef param_19 = tile_seg_ref;
-                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_278);
+                    TileSeg seg_1 = TileSeg_read(param_18, param_19, v_291);
                    for (uint k_4 = 0u; k_4 < 8u; k_4++)
                    {
                        uint param_20 = k_4;
@ -1080,7 +1150,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_21 = cmd_alloc;
                CmdRef param_22 = cmd_ref;
-                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_278);
+                CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_291);
                for (uint k_7 = 0u; k_7 < 8u; k_7++)
                {
                    area[k_7] = alpha.alpha;
@ -1092,7 +1162,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_23 = cmd_alloc;
                CmdRef param_24 = cmd_ref;
-                CmdColor color = Cmd_Color_read(param_23, param_24, v_278);
+                CmdColor color = Cmd_Color_read(param_23, param_24, v_291);
                uint param_25 = color.rgba_color;
                float4 fg = unpacksRGB(param_25);
                for (uint k_8 = 0u; k_8 < 8u; k_8++)
@ -1107,7 +1177,7 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_26 = cmd_alloc;
                CmdRef param_27 = cmd_ref;
-                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_278);
+                CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_291);
                float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
                for (uint k_9 = 0u; k_9 < 8u; k_9++)
                {
@ -1117,11 +1187,12 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
                    int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
                    float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
                    float3 param_29 = fg_rgba.xyz;
-                    float3 _2092 = fromsRGB(param_29);
-                    fg_rgba.x = _2092.x;
-                    fg_rgba.y = _2092.y;
-                    fg_rgba.z = _2092.z;
-                    rgba[k_9] = fg_rgba;
+                    float3 _2238 = fromsRGB(param_29);
+                    fg_rgba.x = _2238.x;
+                    fg_rgba.y = _2238.y;
+                    fg_rgba.z = _2238.z;
+                    float4 fg_k_1 = fg_rgba * area[k_9];
+                    rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1;
                }
                cmd_ref.offset += 20u;
                break;
@ -1130,72 +1201,98 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
            {
                Alloc param_30 = cmd_alloc;
                CmdRef param_31 = cmd_ref;
-                CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_278);
-                uint2 param_32 = xy_uint;
-                CmdImage param_33 = fill_img;
-                spvUnsafeArray<float4, 8> img;
-                img = fillImage(param_32, param_33, image_atlas);
+                CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_291);
                for (uint k_10 = 0u; k_10 < 8u; k_10++)
                {
-                    float4 fg_k_1 = img[k_10] * area[k_10];
-                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
+                    uint param_32 = k_10;
+                    float2 my_xy_1 = xy + float2(chunk_offset(param_32));
+                    my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat;
+                    float ba = dot(my_xy_1, rad.c1);
+                    float ca = rad.ra * dot(my_xy_1, my_xy_1);
+                    float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff;
+                    int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0));
+                    float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index))));
+                    float3 param_33 = fg_rgba_1.xyz;
+                    float3 _2348 = fromsRGB(param_33);
+                    fg_rgba_1.x = _2348.x;
+                    fg_rgba_1.y = _2348.y;
+                    fg_rgba_1.z = _2348.z;
+                    float4 fg_k_2 = fg_rgba_1 * area[k_10];
+                    rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2;
                }
-                cmd_ref.offset += 12u;
+                cmd_ref.offset += 48u;
                break;
            }
            case 8u:
            {
+                Alloc param_34 = cmd_alloc;
+                CmdRef param_35 = cmd_ref;
+                CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_291);
+                uint2 param_36 = xy_uint;
+                CmdImage param_37 = fill_img;
+                spvUnsafeArray<float4, 8> img;
+                img = fillImage(param_36, param_37, image_atlas);
                for (uint k_11 = 0u; k_11 < 8u; k_11++)
+                {
+                    float4 fg_k_3 = img[k_11] * area[k_11];
+                    rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3;
+                }
+                cmd_ref.offset += 12u;
+                break;
+            }
+            case 9u:
+            {
+                for (uint k_12 = 0u; k_12 < 8u; k_12++)
                {
                    uint d_2 = min(clip_depth, 127u);
-                    float4 param_34 = float4(rgba[k_11]);
-                    uint _2184 = packsRGB(param_34);
-                    blend_stack[d_2][k_11] = _2184;
-                    rgba[k_11] = float4(0.0);
+                    float4 param_38 = float4(rgba[k_12]);
+                    uint _2454 = packsRGB(param_38);
+                    blend_stack[d_2][k_12] = _2454;
+                    rgba[k_12] = float4(0.0);
                }
                clip_depth++;
                cmd_ref.offset += 4u;
                break;
            }
-            case 9u:
+            case 10u:
            {
-                Alloc param_35 = cmd_alloc;
-                CmdRef param_36 = cmd_ref;
-                CmdEndClip end_clip = Cmd_EndClip_read(param_35, param_36, v_278);
+                Alloc param_39 = cmd_alloc;
+                CmdRef param_40 = cmd_ref;
+                CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_291);
                uint blend_mode = end_clip.blend >> uint(8);
                uint comp_mode = end_clip.blend & 255u;
                clip_depth--;
-                for (uint k_12 = 0u; k_12 < 8u; k_12++)
+                for (uint k_13 = 0u; k_13 < 8u; k_13++)
                {
                    uint d_3 = min(clip_depth, 127u);
-                    uint param_37 = blend_stack[d_3][k_12];
-                    float4 bg = unpacksRGB(param_37);
-                    float4 fg_1 = rgba[k_12] * area[k_12];
-                    float3 param_38 = bg.xyz;
-                    float3 param_39 = fg_1.xyz;
-                    uint param_40 = blend_mode;
-                    float3 blend = mix_blend(param_38, param_39, param_40);
-                    float4 _2251 = fg_1;
-                    float _2255 = fg_1.w;
-                    float3 _2262 = mix(_2251.xyz, blend, float3(float((_2255 * bg.w) > 0.0)));
-                    fg_1.x = _2262.x;
-                    fg_1.y = _2262.y;
-                    fg_1.z = _2262.z;
-                    float3 param_41 = bg.xyz;
-                    float3 param_42 = fg_1.xyz;
-                    float param_43 = bg.w;
-                    float param_44 = fg_1.w;
-                    uint param_45 = comp_mode;
-                    rgba[k_12] = mix_compose(param_41, param_42, param_43, param_44, param_45);
+                    uint param_41 = blend_stack[d_3][k_13];
+                    float4 bg = unpacksRGB(param_41);
+                    float4 fg_1 = rgba[k_13] * area[k_13];
+                    float3 param_42 = bg.xyz;
+                    float3 param_43 = fg_1.xyz;
+                    uint param_44 = blend_mode;
+                    float3 blend = mix_blend(param_42, param_43, param_44);
+                    float4 _2521 = fg_1;
+                    float _2525 = fg_1.w;
+                    float3 _2532 = mix(_2521.xyz, blend, float3(float((_2525 * bg.w) > 0.0)));
+                    fg_1.x = _2532.x;
+                    fg_1.y = _2532.y;
+                    fg_1.z = _2532.z;
+                    float3 param_45 = bg.xyz;
+                    float3 param_46 = fg_1.xyz;
+                    float param_47 = bg.w;
+                    float param_48 = fg_1.w;
+                    uint param_49 = comp_mode;
+                    rgba[k_13] = mix_compose(param_45, param_46, param_47, param_48, param_49);
                }
                cmd_ref.offset += 8u;
                break;
            }
-            case 10u:
+            case 11u:
            {
-                Alloc param_46 = cmd_alloc;
-                CmdRef param_47 = cmd_ref;
-                cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_278).new_ref };
+                Alloc param_50 = cmd_alloc;
+                CmdRef param_51 = cmd_ref;
+                cmd_ref = CmdRef{ Cmd_Jump_read(param_50, param_51, v_291).new_ref };
                cmd_alloc.offset = cmd_ref.offset;
                break;
            }
@ -1203,8 +1300,8 @@ kernel void main0(device Memory& v_278 [[buffer(0)]], const device ConfigBuf& _1
    }
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
-        uint param_48 = i_1;
-        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48))));
+        uint param_52 = i_1;
+        image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_52))));
    }
 }

--- a/piet-gpu/shader/gen/kernel4_gray.spv
+++ b/piet-gpu/shader/gen/kernel4_gray.spv
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -192,10 +192,27 @@ void main() {
                int x = int(round(clamp(my_d, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
                mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(lin.index)));
                fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
-                rgba[k] = fg_rgba;
+                mediump vec4 fg_k = fg_rgba * area[k];
+                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
            }
            cmd_ref.offset += 4 + CmdLinGrad_size;
            break;
+        case Cmd_RadGrad:
+            CmdRadGrad rad = Cmd_RadGrad_read(cmd_alloc, cmd_ref);
+            for (uint k = 0; k < CHUNK; k++) {
+                vec2 my_xy = xy + vec2(chunk_offset(k));
+                my_xy = rad.mat.xz * my_xy.x + rad.mat.yw * my_xy.y - rad.xlat;
+                float ba = dot(my_xy, rad.c1);
+                float ca = rad.ra * dot(my_xy, my_xy);
+                float t = sqrt(ba * ba  + ca) - ba - rad.roff;
+                int x = int(round(clamp(t, 0.0, 1.0) * float(GRADIENT_WIDTH - 1)));
+                mediump vec4 fg_rgba = imageLoad(gradients, ivec2(x, int(rad.index)));
+                fg_rgba.rgb = fromsRGB(fg_rgba.rgb);
+                mediump vec4 fg_k = fg_rgba * area[k];
+                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
+            }
+            cmd_ref.offset += 4 + CmdRadGrad_size;
+            break;
        case Cmd_Image:
            CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
            mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@ -18,6 +18,10 @@ struct CmdLinGradRef {
    uint offset;
 };

+struct CmdRadGradRef {
+    uint offset;
+};
+
 struct CmdImageRef {
    uint offset;
 };
@ -83,6 +87,21 @@ CmdLinGradRef CmdLinGrad_index(CmdLinGradRef ref, uint index) {
    return CmdLinGradRef(ref.offset + index * CmdLinGrad_size);
 }

+struct CmdRadGrad {
+    uint index;
+    vec4 mat;
+    vec2 xlat;
+    vec2 c1;
+    float ra;
+    float roff;
+};
+
+#define CmdRadGrad_size 44
+
+CmdRadGradRef CmdRadGrad_index(CmdRadGradRef ref, uint index) {
+    return CmdRadGradRef(ref.offset + index * CmdRadGrad_size);
+}
+
 struct CmdImage {
    uint index;
    ivec2 offset;
@ -131,11 +150,12 @@ CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
 #define Cmd_Alpha 4
 #define Cmd_Color 5
 #define Cmd_LinGrad 6
-#define Cmd_Image 7
-#define Cmd_BeginClip 8
-#define Cmd_EndClip 9
-#define Cmd_Jump 10
-#define Cmd_size 20
+#define Cmd_RadGrad 7
+#define Cmd_Image 8
+#define Cmd_BeginClip 9
+#define Cmd_EndClip 10
+#define Cmd_Jump 11
+#define Cmd_size 48

 CmdRef Cmd_index(CmdRef ref, uint index) {
    return CmdRef(ref.offset + index * Cmd_size);
@ -213,6 +233,44 @@ void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) {
    write_mem(a, ix + 3, floatBitsToUint(s.line_c));
 }

+CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) {
+    uint ix = ref.offset >> 2;
+    uint raw0 = read_mem(a, ix + 0);
+    uint raw1 = read_mem(a, ix + 1);
+    uint raw2 = read_mem(a, ix + 2);
+    uint raw3 = read_mem(a, ix + 3);
+    uint raw4 = read_mem(a, ix + 4);
+    uint raw5 = read_mem(a, ix + 5);
+    uint raw6 = read_mem(a, ix + 6);
+    uint raw7 = read_mem(a, ix + 7);
+    uint raw8 = read_mem(a, ix + 8);
+    uint raw9 = read_mem(a, ix + 9);
+    uint raw10 = read_mem(a, ix + 10);
+    CmdRadGrad s;
+    s.index = raw0;
+    s.mat = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4));
+    s.xlat = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6));
+    s.c1 = vec2(uintBitsToFloat(raw7), uintBitsToFloat(raw8));
+    s.ra = uintBitsToFloat(raw9);
+    s.roff = uintBitsToFloat(raw10);
+    return s;
+}
+
+void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s) {
+    uint ix = ref.offset >> 2;
+    write_mem(a, ix + 0, s.index);
+    write_mem(a, ix + 1, floatBitsToUint(s.mat.x));
+    write_mem(a, ix + 2, floatBitsToUint(s.mat.y));
+    write_mem(a, ix + 3, floatBitsToUint(s.mat.z));
+    write_mem(a, ix + 4, floatBitsToUint(s.mat.w));
+    write_mem(a, ix + 5, floatBitsToUint(s.xlat.x));
+    write_mem(a, ix + 6, floatBitsToUint(s.xlat.y));
+    write_mem(a, ix + 7, floatBitsToUint(s.c1.x));
+    write_mem(a, ix + 8, floatBitsToUint(s.c1.y));
+    write_mem(a, ix + 9, floatBitsToUint(s.ra));
+    write_mem(a, ix + 10, floatBitsToUint(s.roff));
+}
+
 CmdImage CmdImage_read(Alloc a, CmdImageRef ref) {
    uint ix = ref.offset >> 2;
    uint raw0 = read_mem(a, ix + 0);
@ -293,6 +351,10 @@ CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) {
    return CmdLinGrad_read(a, CmdLinGradRef(ref.offset + 4));
 }

+CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) {
+    return CmdRadGrad_read(a, CmdRadGradRef(ref.offset + 4));
+}
+
 CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
    return CmdImage_read(a, CmdImageRef(ref.offset + 4));
 }
@ -338,6 +400,11 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) {
    CmdLinGrad_write(a, CmdLinGradRef(ref.offset + 4), s);
 }

+void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) {
+    write_mem(a, ref.offset >> 2, Cmd_RadGrad);
+    CmdRadGrad_write(a, CmdRadGradRef(ref.offset + 4), s);
+}
+
 void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) {
    write_mem(a, ref.offset >> 2, Cmd_Image);
    CmdImage_write(a, CmdImageRef(ref.offset + 4), s);
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@ -62,6 +62,7 @@ const ANNOTATED_SIZE: usize = 40;
 // Tags for draw objects. See shader/drawtag.h for the authoritative source.
 const DRAWTAG_FILLCOLOR: u32 = 0x44;
 const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
+const DRAWTAG_FILLRADGRADIENT: u32 = 0x2dc;
 const DRAWTAG_BEGINCLIP: u32 = 0x05;
 const DRAWTAG_ENDCLIP: u32 = 0x25;

@ -79,6 +80,16 @@ pub struct FillLinGradient {
    p1: [f32; 2],
 }

+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+pub struct FillRadGradient {
+    index: u32,
+    p0: [f32; 2],
+    p1: [f32; 2],
+    r0: f32,
+    r1: f32,
+}
+
 #[allow(unused)]
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
@ -123,6 +134,13 @@ impl Encoder {
        self.transform_stream.push(transform);
    }

+    // Swap the last two tags in the tag stream; used for transformed
+    // gradients.
+    pub fn swap_last_tags(&mut self) {
+        let len = self.tag_stream.len();
+        self.tag_stream.swap(len - 1, len - 2);
+    }
+
    // -1.0 means "fill"
    pub fn linewidth(&mut self, linewidth: f32) {
        self.tag_stream.push(0x40);
@ -147,6 +165,16 @@ impl Encoder {
        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
    }

+
+    /// Encode a fill radial gradient draw object.
+    ///
+    /// This should be encoded after a path.
+    pub fn fill_rad_gradient(&mut self, index: u32, p0: [f32; 2], p1: [f32; 2], r0: f32, r1: f32) {
+        self.drawtag_stream.push(DRAWTAG_FILLRADGRADIENT);
+        let element = FillRadGradient { index, p0, p1, r0, r1 };
+        self.drawdata_stream.extend(bytemuck::bytes_of(&element));
+    }
+    
    /// Start a clip.
    pub fn begin_clip(&mut self, blend: Option<Blend>) {
        self.drawtag_stream.push(DRAWTAG_BEGINCLIP);
@ -220,7 +248,7 @@ impl Encoder {
        alloc += n_drawobj * DRAW_BBOX_SIZE;
        let drawinfo_alloc = alloc;
        // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
-        const MAX_DRAWINFO_SIZE: usize = 16;
+        const MAX_DRAWINFO_SIZE: usize = 44;
        alloc += n_drawobj * MAX_DRAWINFO_SIZE;

        let config = Config {
--- a/piet-gpu/src/gradient.rs
+++ b/piet-gpu/src/gradient.rs
@ -18,15 +18,29 @@

 use std::collections::hash_map::{Entry, HashMap};

-use piet::{Color, FixedLinearGradient, GradientStop};
+use piet::kurbo::Point;
+use piet::{Color, FixedLinearGradient, GradientStop, FixedRadialGradient};
+
+/// Radial gradient compatible with COLRv1 spec
+#[derive(Debug, Clone)]
+pub struct Colrv1RadialGradient {
+    /// The center of the iner circle.
+    pub center0: Point,
+    /// The offset of the origin relative to the center.
+    pub center1: Point,
+    /// The radius of the inner circle.
+    pub radius0: f64,
+    /// The radius of the outer circle.
+    pub radius1: f64,
+    /// The stops.
+    pub stops: Vec<GradientStop>,
+}

 #[derive(Clone)]
 pub struct BakedGradient {
    ramp: Vec<u32>,
 }

-/// This is basically the same type as scene::FillLinGradient, so could
-/// potentially use that directly.
 #[derive(Clone)]
 pub struct LinearGradient {
    pub(crate) start: [f32; 2],
@ -34,6 +48,15 @@ pub struct LinearGradient {
    pub(crate) ramp_id: u32,
 }

+#[derive(Clone)]
+pub struct RadialGradient {
+    pub(crate) start: [f32; 2],
+    pub(crate) end: [f32; 2],
+    pub(crate) r0: f32,
+    pub(crate) r1: f32,
+    pub(crate) ramp_id: u32,
+}
+
 #[derive(Default)]
 pub struct RampCache {
    ramps: Vec<GradientRamp>,
@ -154,6 +177,28 @@ impl RampCache {
        }
    }

+    pub fn add_radial_gradient(&mut self, rad: &FixedRadialGradient) -> RadialGradient {
+        let ramp_id = self.add_ramp(&rad.stops);
+        RadialGradient {
+            ramp_id: ramp_id as u32,
+            start: crate::render_ctx::to_f32_2(rad.center + rad.origin_offset),
+            end: crate::render_ctx::to_f32_2(rad.center),
+            r0: 0.0,
+            r1: rad.radius as f32,
+        }
+    }
+
+    pub fn add_radial_gradient_colrv1(&mut self, rad: &Colrv1RadialGradient) -> RadialGradient {
+        let ramp_id = self.add_ramp(&rad.stops);
+        RadialGradient {
+            ramp_id: ramp_id as u32,
+            start: crate::render_ctx::to_f32_2(rad.center0),
+            end: crate::render_ctx::to_f32_2(rad.center1),
+            r0: rad.radius0 as f32,
+            r1: rad.radius1 as f32,
+        }
+    }
+
    /// Dump the contents of a gradient. This is for debugging.
    #[allow(unused)]
    pub(crate) fn dump_gradient(&self, lin: &LinearGradient) {
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -12,6 +12,7 @@ use std::convert::TryInto;

 pub use blend::{Blend, BlendMode, CompositionMode};
 pub use render_ctx::PietGpuRenderContext;
+pub use gradient::Colrv1RadialGradient;

 use piet::kurbo::Vec2;
 use piet::{ImageFormat, RenderContext};
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@ -13,7 +13,7 @@ use piet_gpu_hal::BufWrite;
 use piet_gpu_types::encoder::{Encode, Encoder};
 use piet_gpu_types::scene::Element;

-use crate::gradient::{LinearGradient, RampCache};
+use crate::gradient::{LinearGradient, RadialGradient, RampCache, Colrv1RadialGradient};
 use crate::text::Font;
 pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
 use crate::Blend;
@ -50,6 +50,7 @@ pub struct PietGpuRenderContext {
 pub enum PietGpuBrush {
    Solid(u32),
    LinGradient(LinearGradient),
+    RadGradient(RadialGradient),
 }

 #[derive(Default)]
@ -187,6 +188,10 @@ impl RenderContext for PietGpuRenderContext {
                let lin = self.ramp_cache.add_linear_gradient(&lin);
                Ok(PietGpuBrush::LinGradient(lin))
            }
+            FixedGradient::Radial(rad) => {
+                let rad = self.ramp_cache.add_radial_gradient(&rad);
+                Ok(PietGpuBrush::RadGradient(rad))
+            }
            _ => todo!("don't do radial gradients yet"),
        }
    }
@ -338,6 +343,20 @@ impl PietGpuRenderContext {
        }
    }

+    pub fn radial_gradient_colrv1(&mut self, rad: &Colrv1RadialGradient) -> PietGpuBrush {
+        PietGpuBrush::RadGradient(self.ramp_cache.add_radial_gradient_colrv1(rad))
+    }
+
+    pub fn fill_transform(&mut self, shape: impl Shape, brush: &PietGpuBrush, transform: Affine) {
+        let path = shape.path_elements(TOLERANCE);
+        self.encode_linewidth(-1.0);
+        self.encode_path(path, true);
+        self.encode_transform(Transform::from_kurbo(transform));
+        self.new_encoder.swap_last_tags();
+        self.encode_brush(&brush);
+        self.encode_transform(Transform::from_kurbo(transform.inverse()));
+    }
+
    fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
        if is_fill {
            self.encode_path_inner(
@ -420,6 +439,10 @@ impl PietGpuRenderContext {
                self.new_encoder
                    .fill_lin_gradient(lin.ramp_id, lin.start, lin.end);
            }
+            PietGpuBrush::RadGradient(rad) => {
+                self.new_encoder
+                    .fill_rad_gradient(rad.ramp_id, rad.start, rad.end, rad.r0, rad.r1);
+            }
        }
    }
 }
--- a/piet-gpu/src/test_scenes.rs
+++ b/piet-gpu/src/test_scenes.rs
@ -2,10 +2,10 @@

 use rand::{Rng, RngCore};

-use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext};
+use crate::{Blend, BlendMode, CompositionMode, PietGpuRenderContext, Colrv1RadialGradient};
 use piet::kurbo::{Affine, BezPath, Circle, Line, Point, Rect, Shape};
 use piet::{
-    Color, FixedGradient, FixedLinearGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
+    Color, FixedGradient, FixedRadialGradient, GradientStop, Text, TextAttribute, TextLayoutBuilder,
 };

 use crate::{PicoSvg, RenderContext, Vec2};
@ -32,7 +32,7 @@ pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
    println!("flattening and encoding time: {:?}", start.elapsed());
 }

-pub fn render_scene(rc: &mut impl RenderContext) {
+pub fn render_scene(rc: &mut PietGpuRenderContext) {
    const WIDTH: usize = 2048;
    const HEIGHT: usize = 1536;
    let mut rng = rand::thread_rng();
@ -142,7 +142,7 @@ fn render_alpha_test(rc: &mut impl RenderContext) {
 }

 #[allow(unused)]
-fn render_gradient_test(rc: &mut impl RenderContext) {
+fn render_gradient_test(rc: &mut PietGpuRenderContext) {
    let stops = vec![
        GradientStop {
            color: Color::rgb8(0, 255, 0),
@ -153,14 +153,18 @@ fn render_gradient_test(rc: &mut impl RenderContext) {
            pos: 1.0,
        },
    ];
-    let lin = FixedLinearGradient {
-        start: Point::new(0.0, 100.0),
-        end: Point::new(0.0, 300.0),
+    let rad = Colrv1RadialGradient {
+        center0: Point::new(200.0, 200.0),
+        center1: Point::new(250.0, 200.0),
+        radius0: 50.0,
+        radius1: 100.0,
        stops,
    };
-    let brush = FixedGradient::Linear(lin);
+    let brush = rc.radial_gradient_colrv1(&rad);
+    //let brush = FixedGradient::Radial(rad);
    //let brush = Color::rgb8(0, 128, 0);
-    rc.fill(Rect::new(100.0, 100.0, 300.0, 300.0), &brush);
+    let transform = Affine::new([1.0, 0.0, 0.0, 0.5, 0.0, 100.0]);
+    rc.fill_transform(Rect::new(100.0, 100.0, 300.0, 300.0), &brush, transform);
 }

 fn diamond(origin: Point) -> impl Shape {