diff --git a/piet-gpu-types/src/tile.rs b/piet-gpu-types/src/tile.rs
index 18318e3..38ee93b 100644
--- a/piet-gpu-types/src/tile.rs
+++ b/piet-gpu-types/src/tile.rs
@@ -13,8 +13,8 @@ piet_gpu! {
         }
         // Segments within a tile are represented as a linked list.
         struct TileSeg {
-            start: [f32; 2],
-            end: [f32; 2],
+            origin: [f32; 2],
+            vector: [f32; 2],
             y_edge: f32,
             next: Ref<TileSeg>,
         }
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 72ab396..bf9ec44 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -65,8 +65,8 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
         TileSeg seg = TileSeg_read(tile_seg_ref);
         for (uint k = 0; k < CHUNK; k++) {
             vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
-            vec2 start = seg.start - my_xy;
-            vec2 end = seg.end - my_xy;
+            vec2 start = seg.origin - my_xy;
+            vec2 end = start + seg.vector;
             vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
             if (window.x != window.y) {
                 vec2 t = (window - start.y) / (end.y - start.y);
@@ -79,7 +79,7 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
                 float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
                 area[k] += a * (window.x - window.y);
             }
-            area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
+            area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
         }
         tile_seg_ref = seg.next;
     } while (tile_seg_ref.offset != 0);
@@ -131,9 +131,9 @@ void main() {
             TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
             do {
                 TileSeg seg = TileSeg_read(tile_seg_ref);
-                vec2 line_vec = seg.end - seg.start;
+                vec2 line_vec = seg.vector;
                 for (uint k = 0; k < CHUNK; k++) {
-                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
+                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
                     dpos.y += float(k * CHUNK_DY);
                     float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
                     df[k] = min(df[k], length(line_vec * t - dpos));
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 808d707..fffdc4a 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index 658af0e..eb3509b 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -101,12 +101,6 @@ void main() {
     if (element_ix < n_pathseg) {
         tag = PathSeg_tag(ref);
     }
-    // Setup for coverage algorithm.
-    float a, b, c;
-    // Bounding box of element in pixel coordinates.
-    float xmin, xmax, ymin, ymax;
-    PathStrokeLine line;
-    float dx;
     switch (tag) {
     case PathSeg_FillCubic:
     case PathSeg_StrokeCubic:
@@ -162,22 +156,24 @@ void main() {
                 }
 
                 // Output line segment
-                xmin = min(p0.x, p1.x) - cubic.stroke.x;
-                xmax = max(p0.x, p1.x) + cubic.stroke.x;
-                ymin = min(p0.y, p1.y) - cubic.stroke.y;
-                ymax = max(p0.y, p1.y) + cubic.stroke.y;
+
+                // Bounding box of element in pixel coordinates.
+                float xmin = min(p0.x, p1.x) - cubic.stroke.x;
+                float xmax = max(p0.x, p1.x) + cubic.stroke.x;
+                float ymin = min(p0.y, p1.y) - cubic.stroke.y;
+                float ymax = max(p0.y, p1.y) + cubic.stroke.y;
                 float dx = p1.x - p0.x;
                 float dy = p1.y - p0.y;
                 // Set up for per-scanline coverage formula, below.
                 float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
-                c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
-                b = invslope; // Note: assumes square tiles, otherwise scale.
-                a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
+                float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
+                float b = invslope; // Note: assumes square tiles, otherwise scale.
+                float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
 
-                int x0 = int(floor((xmin) * SX));
-                int x1 = int(ceil((xmax) * SX));
-                int y0 = int(floor((ymin) * SY));
-                int y1 = int(ceil((ymax) * SY));
+                int x0 = int(floor(xmin * SX));
+                int x1 = int(floor(xmax * SX) + 1);
+                int y0 = int(floor(ymin * SY));
+                int y1 = int(floor(ymax * SY) + 1);
 
                 x0 = clamp(x0, bbox.x, bbox.z);
                 y0 = clamp(y0, bbox.y, bbox.w);
@@ -191,36 +187,69 @@ void main() {
                 // Consider using subgroups to aggregate atomic add.
                 uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
                 TileSeg tile_seg;
+
+                int xray = int(floor(p0.x*SX));
+                int last_xray = int(floor(p1.x*SX));
+                if (p0.y > p1.y) {
+                    int tmp = xray;
+                    xray = last_xray;
+                    last_xray = tmp;
+                }
                 for (int y = y0; y < y1; y++) {
-                    float tile_y0 = float(y * TILE_HEIGHT_PX);
-                    if (tag == PathSeg_FillCubic && min(p0.y, p1.y) <= tile_y0) {
-                        int xray = max(int(ceil(xc - 0.5 * b)), bbox.x);
-                        if (xray < bbox.z) {
-                            int backdrop = p1.y < p0.y ? 1 : -1;
-                            TileRef tile_ref = Tile_index(path.tiles, uint(base + xray));
-                            uint tile_el = tile_ref.offset >> 2;
-                            atomicAdd(tile[tile_el + 1], backdrop);
-                        }
+                    int xbackdrop = max(xray + 1, bbox.x);
+                    if (tag == PathSeg_FillCubic && y > y0 && xbackdrop < bbox.z) {
+                        int backdrop = p1.y < p0.y ? 1 : -1;
+                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
+                        uint tile_el = tile_ref.offset >> 2;
+                        atomicAdd(tile[tile_el + 1], backdrop);
                     }
+
                     int xx0 = clamp(int(floor(xc - c)), x0, x1);
                     int xx1 = clamp(int(ceil(xc + c)), x0, x1);
+                    xx1 = max(xx1, xray + 1);
+
+                    // next_xray is the xray for the next scanline; it is derived
+                    // from the left edge intersections computed below.
+                    int next_xray = xray;
                     for (int x = xx0; x < xx1; x++) {
                         float tile_x0 = float(x * TILE_WIDTH_PX);
                         TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
                         uint tile_el = tile_ref.offset >> 2;
                         uint old = atomicExchange(tile[tile_el], tile_offset);
-                        tile_seg.start = p0;
-                        tile_seg.end = p1;
+                        tile_seg.origin = p0;
+                        tile_seg.vector = p1 - p0;
                         float y_edge = 0.0;
                         if (tag == PathSeg_FillCubic) {
+                            float tile_y0 = float(y * TILE_HEIGHT_PX);
                             y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
                             if (min(p0.x, p1.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) {
+                                // Left edge intersection.
+                                vec2 p = vec2(tile_x0, y_edge);
                                 if (p0.x > p1.x) {
-                                    tile_seg.end = vec2(tile_x0, y_edge);
+                                    tile_seg.vector = p - p0;
                                 } else {
-                                    tile_seg.start = vec2(tile_x0, y_edge);
+                                    tile_seg.origin = p;
+                                    tile_seg.vector = p1 - p;
                                 }
-                            } else {
+                                // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
+                                // Nudge zeroes towards the intended sign.
+                                if (tile_seg.vector.x == 0) {
+                                    tile_seg.vector.x += sign(p1.x - p0.x)*1e-9;
+                                }
+                                // Move next_xray consistently with previous intersections.
+                                if (x > next_xray && next_xray >= xray) {
+                                    next_xray = x;
+                                } else if (x <= next_xray && next_xray <= xray) {
+                                    next_xray = x - 1;
+                                }
+                            }
+                            // On the last scanline, force next_xray to last_xray for
+                            // consistency with later line segments.
+                            if (y == y1 - 1) {
+                                next_xray = last_xray;
+                            }
+                            // Drop inconsistent intersections.
+                            if (x <= min(xray, next_xray) || max(xray, next_xray) < x) {
                                 y_edge = 1e9;
                             }
                         }
@@ -231,6 +260,7 @@ void main() {
                     }
                     xc += b;
                     base += stride;
+                    xray = next_xray;
                 }
 
                 n_out += 1;
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index f82a031..767bbda 100644
Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ
diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h
index d7659ff..b6c5e14 100644
--- a/piet-gpu/shader/tile.h
+++ b/piet-gpu/shader/tile.h
@@ -35,8 +35,8 @@ TileRef Tile_index(TileRef ref, uint index) {
 }
 
 struct TileSeg {
-    vec2 start;
-    vec2 end;
+    vec2 origin;
+    vec2 vector;
     float y_edge;
     TileSegRef next;
 };
@@ -90,8 +90,8 @@ TileSeg TileSeg_read(TileSegRef ref) {
     uint raw4 = tile[ix + 4];
     uint raw5 = tile[ix + 5];
     TileSeg s;
-    s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
+    s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
+    s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.y_edge = uintBitsToFloat(raw4);
     s.next = TileSegRef(raw5);
     return s;
@@ -99,10 +99,10 @@ TileSeg TileSeg_read(TileSegRef ref) {
 
 void TileSeg_write(TileSegRef ref, TileSeg s) {
     uint ix = ref.offset >> 2;
-    tile[ix + 0] = floatBitsToUint(s.start.x);
-    tile[ix + 1] = floatBitsToUint(s.start.y);
-    tile[ix + 2] = floatBitsToUint(s.end.x);
-    tile[ix + 3] = floatBitsToUint(s.end.y);
+    tile[ix + 0] = floatBitsToUint(s.origin.x);
+    tile[ix + 1] = floatBitsToUint(s.origin.y);
+    tile[ix + 2] = floatBitsToUint(s.vector.x);
+    tile[ix + 3] = floatBitsToUint(s.vector.y);
     tile[ix + 4] = floatBitsToUint(s.y_edge);
     tile[ix + 5] = s.next.offset;
 }