diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index c7fef8c..b17f3e3 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -34,14 +34,65 @@ layout(set = 0, binding = 2) buffer TileBuf {
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 
 #define ACCURACY 0.25
-#define Q_ACCURACY 0.01
+#define Q_ACCURACY (ACCURACY * 0.1)
+#define REM_ACCURACY (ACCURACY - Q_ACCURACY)
 #define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
 
+vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
+    float mt = 1.0 - t;
+    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
+}
+
 vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
     float mt = 1.0 - t;
     return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
 }
 
+struct SubdivResult {
+    float val;
+    float a0;
+    float a2;
+};
+
+/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
+///
+/// This is used for flattening curves.
+#define D 0.67
+float approx_parabola_integral(float x) {
+    return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
+}
+
+/// An approximation to the inverse parabola integral.
+#define B 0.39
+float approx_parabola_inv_integral(float x) {
+    return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
+}
+
+SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
+    vec2 d01 = p1 - p0;
+    vec2 d12 = p2 - p1;
+    vec2 dd = d01 - d12;
+    float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
+    float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
+    float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
+    float scale = abs(cross / (length(dd) * (x2 - x0)));
+
+    float a0 = approx_parabola_integral(x0);
+    float a2 = approx_parabola_integral(x2);
+    float val = 0.0;
+    if (scale < 1e9) {
+        float da = abs(a2 - a0);
+        float sqrt_scale = sqrt(scale);
+        if (sign(x0) == sign(x2)) {
+            val = da * sqrt_scale;
+        } else {
+            float xmin = sqrt_tol / sqrt_scale;
+            val = sqrt_tol * da / approx_parabola_integral(xmin);
+        }
+    }
+    return SubdivResult(val, a0, a2);
+}
+
 void main() {
     uint element_ix = gl_GlobalInvocationID.x;
     PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
@@ -78,102 +129,137 @@ void main() {
     case PathSeg_StrokeCubic:
         PathStrokeCubic cubic = PathSeg_StrokeCubic_read(ref);
         // Commented out code is for computing error bound on conversion to quadratics
-        /*
         vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
         float err = err_v.x * err_v.x + err_v.y * err_v.y;
         // The number of quadratics.
-        uint n = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
-        */
-        // This calculation is based on Sederberg, CAGD Notes section 10.6
-        vec2 l = max(abs(cubic.p0 + cubic.p2 - 2 * cubic.p1), abs(cubic.p1 + cubic.p3 - 2 * cubic.p2));
-        uint n = max(uint(ceil(sqrt(length(l) * (0.75 / ACCURACY)))), 1);
-        vec2 p0 = cubic.p0;
-        float step = 1.0 / float(n);
+        uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
+        // Iterate over quadratics and tote up the estimated number of segments.
+        float val = 0.0;
+        vec2 qp0 = cubic.p0;
+        float step = 1.0 / float(n_quads);
+        for (uint i = 0; i < n_quads; i++) {
+            float t = float(i + 1) * step;
+            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
+            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
+            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
+            val += params.val;
+
+            qp0 = qp2;
+        }
+        uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);
+
         uint path_ix = cubic.path_ix;
         Path path = Path_read(PathRef(path_ix * Path_size));
         ivec4 bbox = ivec4(path.bbox);
-        for (int i = 0; i < n; i++) {
-            // TODO: probably need special logic to make sure it's manifold
+        vec2 p0 = cubic.p0;
+        qp0 = cubic.p0;
+        float v_step = val / float(n);
+        int n_out = 1;
+        float val_sum = 0.0;
+        for (uint i = 0; i < n_quads; i++) {
             float t = float(i + 1) * step;
-            vec2 p2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
-            /*
-            vec2 p1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
-            p1 = 2.0 * p1 - 0.5 * (p0 + p2);
-            */
-
-            xmin = min(p0.x, p2.x) - cubic.stroke.x;
-            xmax = max(p0.x, p2.x) + cubic.stroke.x;
-            ymin = min(p0.y, p2.y) - cubic.stroke.y;
-            ymax = max(p0.y, p2.y) + cubic.stroke.y;
-            float dx = p2.x - p0.x;
-            float dy = p2.y - p0.y;
-            // Set up for per-scanline coverage formula, below.
-            float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
-            c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
-            b = invslope; // Note: assumes square tiles, otherwise scale.
-            a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
-
-            int x0 = int(floor((xmin) * SX));
-            int x1 = int(ceil((xmax) * SX));
-            int y0 = int(floor((ymin) * SY));
-            int y1 = int(ceil((ymax) * SY));
-
-            x0 = clamp(x0, bbox.x, bbox.z);
-            y0 = clamp(y0, bbox.y, bbox.w);
-            x1 = clamp(x1, bbox.x, bbox.z);
-            y1 = clamp(y1, bbox.y, bbox.w);
-            float xc = a + b * float(y0);
-            int stride = bbox.z - bbox.x;
-            int base = (y0 - bbox.y) * stride - bbox.x;
-            // TODO: can be tighter, use c to bound width
-            uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
-            // Consider using subgroups to aggregate atomic add.
-            uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
-            TileSeg tile_seg;
-            for (int y = y0; y < y1; y++) {
-                float tile_y0 = float(y * TILE_HEIGHT_PX);
-                if (tag == PathSeg_FillCubic && min(p0.y, p2.y) <= tile_y0) {
-                    int xray = max(int(ceil(xc - 0.5 * b)), bbox.x);
-                    if (xray < bbox.z) {
-                        int backdrop = p2.y < p0.y ? 1 : -1;
-                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xray));
-                        uint tile_el = tile_ref.offset >> 2;
-                        atomicAdd(tile[tile_el + 1], backdrop);
-                    }
+            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
+            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
+            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
+            float u0 = approx_parabola_inv_integral(params.a0);
+            float u2 = approx_parabola_inv_integral(params.a2);
+            float uscale = 1.0 / (u2 - u0);
+            float target = float(n_out) * v_step;
+            while (n_out == n || target < val_sum + params.val) {
+                vec2 p1;
+                if (n_out == n) {
+                    p1 = cubic.p3;
+                } else {
+                    float u = (target - val_sum) / params.val;
+                    float a = mix(params.a0, params.a2, u);
+                    float au = approx_parabola_inv_integral(a);
+                    float t = (au - u0) * uscale;
+                    p1 = eval_quad(qp0, qp1, qp2, t);
                 }
-                int xx0 = clamp(int(floor(xc - c)), x0, x1);
-                int xx1 = clamp(int(ceil(xc + c)), x0, x1);
-                for (int x = xx0; x < xx1; x++) {
-                    float tile_x0 = float(x * TILE_WIDTH_PX);
-                    TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
-                    uint tile_el = tile_ref.offset >> 2;
-                    uint old = atomicExchange(tile[tile_el], tile_offset);
-                    tile_seg.start = p0;
-                    tile_seg.end = p2;
-                    float y_edge = 0.0;
-                    if (tag == PathSeg_FillCubic) {
-                        y_edge = mix(p0.y, p2.y, (tile_x0 - p0.x) / dx);
-                        if (min(p0.x, p2.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) {
-                            if (p0.x > p2.x) {
-                                tile_seg.end = vec2(tile_x0, y_edge);
-                            } else {
-                                tile_seg.start = vec2(tile_x0, y_edge);
-                            }
-                        } else {
-                            y_edge = 1e9;
+
+                // Output line segment
+                xmin = min(p0.x, p1.x) - cubic.stroke.x;
+                xmax = max(p0.x, p1.x) + cubic.stroke.x;
+                ymin = min(p0.y, p1.y) - cubic.stroke.y;
+                ymax = max(p0.y, p1.y) + cubic.stroke.y;
+                float dx = p1.x - p0.x;
+                float dy = p1.y - p0.y;
+                // Set up for per-scanline coverage formula, below.
+                float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
+                c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
+                b = invslope; // Note: assumes square tiles, otherwise scale.
+                a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
+
+                int x0 = int(floor((xmin) * SX));
+                int x1 = int(ceil((xmax) * SX));
+                int y0 = int(floor((ymin) * SY));
+                int y1 = int(ceil((ymax) * SY));
+
+                x0 = clamp(x0, bbox.x, bbox.z);
+                y0 = clamp(y0, bbox.y, bbox.w);
+                x1 = clamp(x1, bbox.x, bbox.z);
+                y1 = clamp(y1, bbox.y, bbox.w);
+                float xc = a + b * float(y0);
+                int stride = bbox.z - bbox.x;
+                int base = (y0 - bbox.y) * stride - bbox.x;
+                // TODO: can be tighter, use c to bound width
+                uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
+                // Consider using subgroups to aggregate atomic add.
+                uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
+                TileSeg tile_seg;
+                for (int y = y0; y < y1; y++) {
+                    float tile_y0 = float(y * TILE_HEIGHT_PX);
+                    if (tag == PathSeg_FillCubic && min(p0.y, p1.y) <= tile_y0) {
+                        int xray = max(int(ceil(xc - 0.5 * b)), bbox.x);
+                        if (xray < bbox.z) {
+                            int backdrop = p1.y < p0.y ? 1 : -1;
+                            TileRef tile_ref = Tile_index(path.tiles, uint(base + xray));
+                            uint tile_el = tile_ref.offset >> 2;
+                            atomicAdd(tile[tile_el + 1], backdrop);
                         }
                     }
-                    tile_seg.y_edge = y_edge;
-                    tile_seg.next.offset = old;
-                    TileSeg_write(TileSegRef(tile_offset), tile_seg);
-                    tile_offset += TileSeg_size;
+                    int xx0 = clamp(int(floor(xc - c)), x0, x1);
+                    int xx1 = clamp(int(ceil(xc + c)), x0, x1);
+                    for (int x = xx0; x < xx1; x++) {
+                        float tile_x0 = float(x * TILE_WIDTH_PX);
+                        TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
+                        uint tile_el = tile_ref.offset >> 2;
+                        uint old = atomicExchange(tile[tile_el], tile_offset);
+                        tile_seg.start = p0;
+                        tile_seg.end = p1;
+                        float y_edge = 0.0;
+                        if (tag == PathSeg_FillCubic) {
+                            y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
+                            if (min(p0.x, p1.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) {
+                                if (p0.x > p1.x) {
+                                    tile_seg.end = vec2(tile_x0, y_edge);
+                                } else {
+                                    tile_seg.start = vec2(tile_x0, y_edge);
+                                }
+                            } else {
+                                y_edge = 1e9;
+                            }
+                        }
+                        tile_seg.y_edge = y_edge;
+                        tile_seg.next.offset = old;
+                        TileSeg_write(TileSegRef(tile_offset), tile_seg);
+                        tile_offset += TileSeg_size;
+                    }
+                    xc += b;
+                    base += stride;
                 }
-                xc += b;
-                base += stride;
-            }
 
-            p0 = p2;
+                n_out += 1;
+                target += v_step;
+                p0 = p1;
+            }
+            val_sum += params.val;
+
+            qp0 = qp2;
         }
+
         break;
     }
 }
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index f91fbb5..db5bc57 100644
Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ