diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 0855219..ad39af2 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -318,29 +318,34 @@ void main() {
         case Element_StrokeLine:
             LineSeg line = Element_StrokeLine_read(this_ref);
             PathStrokeLine path_line;
-            path_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
-            path_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
-            path_line.path_ix = st.path_count;
+            vec2 p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
+            vec2 p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
+            PathStrokeCubic path_cubic;
+            path_cubic.p0 = p0;
+            path_cubic.p1 = mix(p0, p1, 1.0 / 3.0);
+            path_cubic.p2 = mix(p1, p0, 1.0 / 3.0);
+            path_cubic.p3 = p1;
+            path_cubic.path_ix = st.path_count;
             if (tag == Element_StrokeLine) {
-                path_line.stroke = get_linewidth(st);
+                path_cubic.stroke = get_linewidth(st);
             } else {
-                path_line.stroke = vec2(0.0);
+                path_cubic.stroke = vec2(0.0);
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
             PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
-            uint out_tag = tag == Element_FillLine ? PathSeg_FillLine : PathSeg_StrokeLine;
+            uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
             pathseg[path_out_ref.offset >> 2] = out_tag;
-            PathStrokeLine_write(PathStrokeLineRef(path_out_ref.offset + 4), path_line);
+            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
         case Element_FillCubic:
         case Element_StrokeCubic:
             CubicSeg cubic = Element_StrokeCubic_read(this_ref);
-            PathStrokeCubic path_cubic;
+            path_cubic;
             path_cubic.p0 = st.mat.xy * cubic.p0.x + st.mat.zw * cubic.p0.y + st.translate;
             path_cubic.p1 = st.mat.xy * cubic.p1.x + st.mat.zw * cubic.p1.y + st.translate;
-            path_cubic.p1 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate;
-            path_cubic.p1 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate;
+            path_cubic.p2 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate;
+            path_cubic.p3 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate;
             path_cubic.path_ix = st.path_count;
             if (tag == Element_StrokeCubic) {
                 path_cubic.stroke = get_linewidth(st);
@@ -350,7 +355,7 @@ void main() {
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
             path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
-            out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
+            out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
             pathseg[path_out_ref.offset >> 2] = out_tag;
             PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 55a43f2..e6bd773 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
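Note on the elements.comp hunks above: the StrokeLine/FillLine case now degree-elevates each line segment to an exact cubic, placing the inner control points at the thirds of the chord (mix(p1, p0, 1.0 / 3.0) is the point two thirds of the way from p0 to p1), so the downstream path stage only has to handle cubic segments. A minimal CPU-side sketch of the same elevation, in Rust with an illustrative Vec2 type that is not part of this patch:

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct Vec2 { x: f32, y: f32 }

    // Same as GLSL mix(a, b, t): linear interpolation from a toward b.
    fn lerp(a: Vec2, b: Vec2, t: f32) -> Vec2 {
        Vec2 { x: a.x + (b.x - a.x) * t, y: a.y + (b.y - a.y) * t }
    }

    // Degree-elevate the line p0..p1 to a cubic Bezier with identical geometry.
    fn line_to_cubic(p0: Vec2, p1: Vec2) -> [Vec2; 4] {
        [p0, lerp(p0, p1, 1.0 / 3.0), lerp(p1, p0, 1.0 / 3.0), p1]
    }

    fn main() {
        let (p0, p1) = (Vec2 { x: 0.0, y: 0.0 }, Vec2 { x: 3.0, y: 6.0 });
        // Control points land at the thirds of the segment: (1, 2) and (2, 4).
        println!("{:?}", line_to_cubic(p0, p1));
    }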
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index 693082e..1bbad42 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -33,6 +33,14 @@ layout(set = 0, binding = 2) buffer TileBuf {
 #define SX (1.0 / float(TILE_WIDTH_PX))
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 
+#define Q_ACCURACY 0.025
+#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
+
+vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
+    float mt = 1.0 - t;
+    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
+}
+
 void main() {
     uint element_ix = gl_GlobalInvocationID.x;
     PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
@@ -48,6 +56,7 @@ void main() {
     PathStrokeLine line;
     float dx;
     switch (tag) {
+    /*
     case PathSeg_FillLine:
     case PathSeg_StrokeLine:
         line = PathSeg_StrokeLine_read(ref);
@@ -63,66 +72,101 @@ void main() {
         b = invslope; // Note: assumes square tiles, otherwise scale.
         a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
         break;
-    }
-    int x0 = int(floor((xmin) * SX));
-    int x1 = int(ceil((xmax) * SX));
-    int y0 = int(floor((ymin) * SY));
-    int y1 = int(ceil((ymax) * SY));
-
-    uint path_ix = line.path_ix;
-    Path path = Path_read(PathRef(path_ix * Path_size));
-    ivec4 bbox = ivec4(path.bbox);
-    x0 = clamp(x0, bbox.x, bbox.z);
-    y0 = clamp(y0, bbox.y, bbox.w);
-    x1 = clamp(x1, bbox.x, bbox.z);
-    y1 = clamp(y1, bbox.y, bbox.w);
-    float t = a + b * float(y0);
-    int stride = bbox.z - bbox.x;
-    int base = (y0 - bbox.y) * stride - bbox.x;
-    // TODO: can be tighter, use c to bound width
-    uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
-    // Consider using subgroups to aggregate atomic add.
-    uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
-    TileSeg tile_seg;
-    for (int y = y0; y < y1; y++) {
-        float tile_y0 = float(y * TILE_HEIGHT_PX);
-        if (tag == PathSeg_FillLine && min(line.p0.y, line.p1.y) <= tile_y0) {
-            int xray = max(int(ceil(t - 0.5 * b)), bbox.x);
-            if (xray < bbox.z) {
-                int backdrop = line.p1.y < line.p0.y ? 1 : -1;
-                TileRef tile_ref = Tile_index(path.tiles, uint(base + xray));
-                uint tile_el = tile_ref.offset >> 2;
-                atomicAdd(tile[tile_el + 1], backdrop);
-            }
-        }
-        int xx0 = clamp(int(floor(t - c)), x0, x1);
-        int xx1 = clamp(int(ceil(t + c)), x0, x1);
-        for (int x = xx0; x < xx1; x++) {
-            float tile_x0 = float(x * TILE_WIDTH_PX);
-            TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
-            uint tile_el = tile_ref.offset >> 2;
-            uint old = atomicExchange(tile[tile_el], tile_offset);
-            tile_seg.start = line.p0;
-            tile_seg.end = line.p1;
-            float y_edge = 0.0;
-            if (tag == PathSeg_FillLine) {
-                y_edge = mix(line.p0.y, line.p1.y, (tile_x0 - line.p0.x) / dx);
-                if (min(line.p0.x, line.p1.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) {
-                    if (line.p0.x > line.p1.x) {
-                        tile_seg.end = vec2(tile_x0, y_edge);
-                    } else {
-                        tile_seg.start = vec2(tile_x0, y_edge);
+    */
+    case PathSeg_FillCubic:
+    case PathSeg_StrokeCubic:
+        PathStrokeCubic cubic = PathSeg_StrokeCubic_read(ref);
+        vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
+        float err = err_v.x * err_v.x + err_v.y * err_v.y;
+        // The number of quadratics.
+        uint n = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
+        vec2 p0 = cubic.p0;
+        float step = 1.0 / float(n);
+        uint path_ix = cubic.path_ix;
+        Path path = Path_read(PathRef(path_ix * Path_size));
+        ivec4 bbox = ivec4(path.bbox);
+        for (int i = 0; i < n; i++) {
+            // TODO: probably need special logic to make sure it's manifold
+            float t = float(i + 1) * step;
+            vec2 p2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
+            /*
+            vec2 p1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
+            p1 = 2.0 * p1 - 0.5 * (p0 + p2);
+            */
+
+            xmin = min(p0.x, p2.x) - cubic.stroke.x;
+            xmax = max(p0.x, p2.x) + cubic.stroke.x;
+            ymin = min(p0.y, p2.y) - cubic.stroke.y;
+            ymax = max(p0.y, p2.y) + cubic.stroke.y;
+            float dx = p2.x - p0.x;
+            float dy = p2.y - p0.y;
+            // Set up for per-scanline coverage formula, below.
+            float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
+            c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
+            b = invslope; // Note: assumes square tiles, otherwise scale.
+            a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
+
+            int x0 = int(floor((xmin) * SX));
+            int x1 = int(ceil((xmax) * SX));
+            int y0 = int(floor((ymin) * SY));
+            int y1 = int(ceil((ymax) * SY));
+
+            x0 = clamp(x0, bbox.x, bbox.z);
+            y0 = clamp(y0, bbox.y, bbox.w);
+            x1 = clamp(x1, bbox.x, bbox.z);
+            y1 = clamp(y1, bbox.y, bbox.w);
+            float xc = a + b * float(y0);
+            int stride = bbox.z - bbox.x;
+            int base = (y0 - bbox.y) * stride - bbox.x;
+            // TODO: can be tighter, use c to bound width
+            uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
+            // Consider using subgroups to aggregate atomic add.
+            uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
+            TileSeg tile_seg;
+            for (int y = y0; y < y1; y++) {
+                float tile_y0 = float(y * TILE_HEIGHT_PX);
+                if (tag == PathSeg_FillCubic && min(p0.y, p2.y) <= tile_y0) {
+                    int xray = max(int(ceil(xc - 0.5 * b)), bbox.x);
+                    if (xray < bbox.z) {
+                        int backdrop = p2.y < p0.y ? 1 : -1;
+                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xray));
+                        uint tile_el = tile_ref.offset >> 2;
+                        atomicAdd(tile[tile_el + 1], backdrop);
                     }
-                } else {
-                    y_edge = 1e9;
                 }
+                int xx0 = clamp(int(floor(xc - c)), x0, x1);
+                int xx1 = clamp(int(ceil(xc + c)), x0, x1);
+                for (int x = xx0; x < xx1; x++) {
+                    float tile_x0 = float(x * TILE_WIDTH_PX);
+                    TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
+                    uint tile_el = tile_ref.offset >> 2;
+                    uint old = atomicExchange(tile[tile_el], tile_offset);
+                    tile_seg.start = p0;
+                    tile_seg.end = p2;
+                    float y_edge = 0.0;
+                    if (tag == PathSeg_FillCubic) {
+                        y_edge = mix(p0.y, p2.y, (tile_x0 - p0.x) / dx);
+                        if (min(p0.x, p2.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) {
+                            if (p0.x > p2.x) {
+                                tile_seg.end = vec2(tile_x0, y_edge);
+                            } else {
+                                tile_seg.start = vec2(tile_x0, y_edge);
+                            }
+                        } else {
+                            y_edge = 1e9;
+                        }
+                    }
+                    tile_seg.y_edge = y_edge;
+                    tile_seg.next.offset = old;
+                    TileSeg_write(TileSegRef(tile_offset), tile_seg);
+                    tile_offset += TileSeg_size;
+                }
+                xc += b;
+                base += stride;
             }
-            tile_seg.y_edge = y_edge;
-            tile_seg.next.offset = old;
-            TileSeg_write(TileSegRef(tile_offset), tile_seg);
-            tile_offset += TileSeg_size;
+
+            p0 = p2;
         }
-        t += b;
-        base += stride;
+        break;
     }
 }
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index 7098a63..0d10ea2 100644
Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs
index 221b737..f914a2e 100644
--- a/piet-gpu/src/render_ctx.rs
+++ b/piet-gpu/src/render_ctx.rs
@@ -242,7 +242,7 @@ impl PietGpuRenderContext {
     }
 
     fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
-        let flatten = true;
+        let flatten = false;
         if flatten {
             let mut start_pt = None;
             let mut last_pt = None;
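Note on the path_coarse.comp hunks above: err_v = 3.0 * (p2 - p1) + p0 - p3 measures the cubic's deviation from a single quadratic (it vanishes exactly when the cubic is a degree-elevated quadratic), and n = ceil((err / MAX_HYPOT2)^(1/6)) with MAX_HYPOT2 = 432 * Q_ACCURACY^2 is the piece count; since the quadratic midpoint computation is still commented out, each piece is currently handled as the chord between consecutive eval_cubic samples. A self-contained Rust sketch of the same arithmetic, with an illustrative Vec2 type that is not part of this patch:

    #[derive(Clone, Copy, Debug)]
    struct Vec2 { x: f32, y: f32 }

    const Q_ACCURACY: f32 = 0.025;
    const MAX_HYPOT2: f32 = 432.0 * Q_ACCURACY * Q_ACCURACY;

    // Evaluate a cubic Bezier at t (same weights as eval_cubic in the shader).
    fn eval_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> Vec2 {
        let mt = 1.0 - t;
        let (c0, c1, c2, c3) = (mt * mt * mt, 3.0 * mt * mt * t, 3.0 * mt * t * t, t * t * t);
        Vec2 {
            x: p0.x * c0 + p1.x * c1 + p2.x * c2 + p3.x * c3,
            y: p0.y * c0 + p1.y * c1 + p2.y * c2 + p3.y * c3,
        }
    }

    // Number of pieces the shader subdivides the cubic into before tiling.
    fn num_quads(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> u32 {
        let err_x = 3.0 * (p2.x - p1.x) + p0.x - p3.x;
        let err_y = 3.0 * (p2.y - p1.y) + p0.y - p3.y;
        let err = err_x * err_x + err_y * err_y;
        ((err / MAX_HYPOT2).powf(1.0 / 6.0).ceil() as u32).max(1)
    }

    fn main() {
        let (p0, p1, p2, p3) = (
            Vec2 { x: 0.0, y: 0.0 },
            Vec2 { x: 100.0, y: 0.0 },
            Vec2 { x: 100.0, y: 100.0 },
            Vec2 { x: 0.0, y: 100.0 },
        );
        let n = num_quads(p0, p1, p2, p3);
        // Walk the chord endpoints the same way the shader's i loop does.
        let step = 1.0 / n as f32;
        for i in 0..n {
            let p = eval_cubic(p0, p1, p2, p3, (i + 1) as f32 * step);
            println!("chord endpoint {}: ({}, {})", i, p.x, p.y);
        }
    }

Because n grows with the cube root of the deviation from a quadratic, gentle curves get only a handful of pieces at Q_ACCURACY = 0.025, while strongly curved cubics subdivide more heavily.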