diff --git a/piet-gpu-types/src/tile.rs b/piet-gpu-types/src/tile.rs index 18318e3..38ee93b 100644 --- a/piet-gpu-types/src/tile.rs +++ b/piet-gpu-types/src/tile.rs @@ -13,8 +13,8 @@ piet_gpu! { } // Segments within a tile are represented as a linked list. struct TileSeg { - start: [f32; 2], - end: [f32; 2], + origin: [f32; 2], + vector: [f32; 2], y_edge: f32, next: Ref, } diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 72ab396..bf9ec44 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -65,8 +65,8 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { TileSeg seg = TileSeg_read(tile_seg_ref); for (uint k = 0; k < CHUNK; k++) { vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY)); - vec2 start = seg.start - my_xy; - vec2 end = seg.end - my_xy; + vec2 start = seg.origin - my_xy; + vec2 end = start + seg.vector; vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0); if (window.x != window.y) { vec2 t = (window - start.y) / (end.y - start.y); @@ -79,7 +79,7 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); area[k] += a * (window.x - window.y); } - area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); + area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); } tile_seg_ref = seg.next; } while (tile_seg_ref.offset != 0); @@ -131,9 +131,9 @@ void main() { TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); do { TileSeg seg = TileSeg_read(tile_seg_ref); - vec2 line_vec = seg.end - seg.start; + vec2 line_vec = seg.vector; for (uint k = 0; k < CHUNK; k++) { - vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; + vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin; dpos.y += float(k * CHUNK_DY); float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); df[k] = min(df[k], length(line_vec * t - dpos)); diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index 808d707..fffdc4a 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index 658af0e..eb3509b 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -101,12 +101,6 @@ void main() { if (element_ix < n_pathseg) { tag = PathSeg_tag(ref); } - // Setup for coverage algorithm. - float a, b, c; - // Bounding box of element in pixel coordinates. - float xmin, xmax, ymin, ymax; - PathStrokeLine line; - float dx; switch (tag) { case PathSeg_FillCubic: case PathSeg_StrokeCubic: @@ -162,22 +156,24 @@ void main() { } // Output line segment - xmin = min(p0.x, p1.x) - cubic.stroke.x; - xmax = max(p0.x, p1.x) + cubic.stroke.x; - ymin = min(p0.y, p1.y) - cubic.stroke.y; - ymax = max(p0.y, p1.y) + cubic.stroke.y; + + // Bounding box of element in pixel coordinates. + float xmin = min(p0.x, p1.x) - cubic.stroke.x; + float xmax = max(p0.x, p1.x) + cubic.stroke.x; + float ymin = min(p0.y, p1.y) - cubic.stroke.y; + float ymax = max(p0.y, p1.y) + cubic.stroke.y; float dx = p1.x - p0.x; float dy = p1.y - p0.y; // Set up for per-scanline coverage formula, below. float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy; - c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX; - b = invslope; // Note: assumes square tiles, otherwise scale. - a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; + float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX; + float b = invslope; // Note: assumes square tiles, otherwise scale. + float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; - int x0 = int(floor((xmin) * SX)); - int x1 = int(ceil((xmax) * SX)); - int y0 = int(floor((ymin) * SY)); - int y1 = int(ceil((ymax) * SY)); + int x0 = int(floor(xmin * SX)); + int x1 = int(floor(xmax * SX) + 1); + int y0 = int(floor(ymin * SY)); + int y1 = int(floor(ymax * SY) + 1); x0 = clamp(x0, bbox.x, bbox.z); y0 = clamp(y0, bbox.y, bbox.w); @@ -191,36 +187,69 @@ void main() { // Consider using subgroups to aggregate atomic add. uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size); TileSeg tile_seg; + + int xray = int(floor(p0.x*SX)); + int last_xray = int(floor(p1.x*SX)); + if (p0.y > p1.y) { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } for (int y = y0; y < y1; y++) { - float tile_y0 = float(y * TILE_HEIGHT_PX); - if (tag == PathSeg_FillCubic && min(p0.y, p1.y) <= tile_y0) { - int xray = max(int(ceil(xc - 0.5 * b)), bbox.x); - if (xray < bbox.z) { - int backdrop = p1.y < p0.y ? 1 : -1; - TileRef tile_ref = Tile_index(path.tiles, uint(base + xray)); - uint tile_el = tile_ref.offset >> 2; - atomicAdd(tile[tile_el + 1], backdrop); - } + int xbackdrop = max(xray + 1, bbox.x); + if (tag == PathSeg_FillCubic && y > y0 && xbackdrop < bbox.z) { + int backdrop = p1.y < p0.y ? 1 : -1; + TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop)); + uint tile_el = tile_ref.offset >> 2; + atomicAdd(tile[tile_el + 1], backdrop); } + int xx0 = clamp(int(floor(xc - c)), x0, x1); int xx1 = clamp(int(ceil(xc + c)), x0, x1); + xx1 = max(xx1, xray + 1); + + // next_xray is the xray for the next scanline; it is derived + // by left edge intersections computed below. + int next_xray = xray; for (int x = xx0; x < xx1; x++) { float tile_x0 = float(x * TILE_WIDTH_PX); TileRef tile_ref = Tile_index(path.tiles, uint(base + x)); uint tile_el = tile_ref.offset >> 2; uint old = atomicExchange(tile[tile_el], tile_offset); - tile_seg.start = p0; - tile_seg.end = p1; + tile_seg.origin = p0; + tile_seg.vector = p1 - p0; float y_edge = 0.0; if (tag == PathSeg_FillCubic) { + float tile_y0 = float(y * TILE_HEIGHT_PX); y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx); if (min(p0.x, p1.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) { + // Left edge intersection. + vec2 p = vec2(tile_x0, y_edge); if (p0.x > p1.x) { - tile_seg.end = vec2(tile_x0, y_edge); + tile_seg.vector = p - p0; } else { - tile_seg.start = vec2(tile_x0, y_edge); + tile_seg.origin = p; + tile_seg.vector = p1 - p; } - } else { + // kernel4 uses sign(vector.x) for the sign of the intersection backdrop. + // Nudge zeroes towards the intended sign. + if (tile_seg.vector.x == 0) { + tile_seg.vector.x += sign(p1.x - p0.x)*1e-9; + } + // Move next_xray consistently with previous intersections. + if (x > next_xray && next_xray >= xray) { + next_xray = x; + } else if (x <= next_xray && next_xray <= xray) { + next_xray = x - 1; + } + } + // Force last xray on the last scanline for consistency with later + // line segments. + if (y == y1 - 1) { + next_xray = last_xray; + } + // Drop inconsistent intersections. + if (x <= min(xray, next_xray) || max(xray, next_xray) < x) { y_edge = 1e9; } } @@ -231,6 +260,7 @@ void main() { } xc += b; base += stride; + xray = next_xray; } n_out += 1; diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index f82a031..767bbda 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h index d7659ff..b6c5e14 100644 --- a/piet-gpu/shader/tile.h +++ b/piet-gpu/shader/tile.h @@ -35,8 +35,8 @@ TileRef Tile_index(TileRef ref, uint index) { } struct TileSeg { - vec2 start; - vec2 end; + vec2 origin; + vec2 vector; float y_edge; TileSegRef next; }; @@ -90,8 +90,8 @@ TileSeg TileSeg_read(TileSegRef ref) { uint raw4 = tile[ix + 4]; uint raw5 = tile[ix + 5]; TileSeg s; - s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); - s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.y_edge = uintBitsToFloat(raw4); s.next = TileSegRef(raw5); return s; @@ -99,10 +99,10 @@ TileSeg TileSeg_read(TileSegRef ref) { void TileSeg_write(TileSegRef ref, TileSeg s) { uint ix = ref.offset >> 2; - tile[ix + 0] = floatBitsToUint(s.start.x); - tile[ix + 1] = floatBitsToUint(s.start.y); - tile[ix + 2] = floatBitsToUint(s.end.x); - tile[ix + 3] = floatBitsToUint(s.end.y); + tile[ix + 0] = floatBitsToUint(s.origin.x); + tile[ix + 1] = floatBitsToUint(s.origin.y); + tile[ix + 2] = floatBitsToUint(s.vector.x); + tile[ix + 3] = floatBitsToUint(s.vector.y); tile[ix + 4] = floatBitsToUint(s.y_edge); tile[ix + 5] = s.next.offset; }