diff --git a/piet-gpu-types/src/tile.rs b/piet-gpu-types/src/tile.rs index 18318e3..38ee93b 100644 --- a/piet-gpu-types/src/tile.rs +++ b/piet-gpu-types/src/tile.rs @@ -13,8 +13,8 @@ piet_gpu! { } // Segments within a tile are represented as a linked list. struct TileSeg { - start: [f32; 2], - end: [f32; 2], + origin: [f32; 2], + vector: [f32; 2], y_edge: f32, next: Ref, } diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 3e230d9..c319cbe 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -84,11 +84,22 @@ void main() { // Coordinates of top left of bin, in tiles. uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x; uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y; + + // Per-tile state uint tile_x = gl_LocalInvocationID.x % N_TILE_X; uint tile_y = gl_LocalInvocationID.x / N_TILE_X; uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x; CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC); uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + // The nesting depth of the clip stack + uint clip_depth = 0; + // State for the "clip zero" optimization. If it's nonzero, then we are + // currently in a clip for which the entire tile has an alpha of zero, and + // the value is the depth after the "begin clip" of that element. + uint clip_zero_depth = 0; + // State for the "clip one" optimization. If bit `i` is set, then that means + // that the clip pushed at depth `i` has an alpha of all one. + uint clip_one_mask = 0; // I'm sure we can figure out how to do this with at least one fewer register... // Items up to rd_ix have been read from sh_elements @@ -98,6 +109,7 @@ void main() { // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements uint part_start_ix = 0; uint ready_ix = 0; + while (true) { for (uint i = 0; i < N_SLICE; i++) { sh_bitmaps[i][th_ix] = 0; @@ -270,56 +282,84 @@ void main() { ref = AnnotatedRef(element_ix * Annotated_size); tag = Annotated_tag(ref); - switch (tag) { - case Annotated_Fill: - Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] - + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); - AnnoFill fill = Annotated_Fill_read(ref); - alloc_cmd(cmd_ref, cmd_limit); - if (tile.tile.offset != 0) { - CmdFill cmd_fill; - cmd_fill.tile_ref = tile.tile.offset; - cmd_fill.backdrop = tile.backdrop; - cmd_fill.rgba_color = fill.rgba_color; - Cmd_Fill_write(cmd_ref, cmd_fill); - } else { - Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); + if (clip_zero_depth == 0) { + switch (tag) { + case Annotated_Fill: + Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + AnnoFill fill = Annotated_Fill_read(ref); + alloc_cmd(cmd_ref, cmd_limit); + if (tile.tile.offset != 0) { + CmdFill cmd_fill; + cmd_fill.tile_ref = tile.tile.offset; + cmd_fill.backdrop = tile.backdrop; + cmd_fill.rgba_color = fill.rgba_color; + Cmd_Fill_write(cmd_ref, cmd_fill); + } else { + Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); + } + cmd_ref.offset += Cmd_size; + break; + case Annotated_BeginClip: + tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + if (tile.tile.offset == 0 && tile.backdrop == 0) { + clip_zero_depth = clip_depth + 1; + } else if (tile.tile.offset == 0 && clip_depth < 32) { + clip_one_mask |= (1 << clip_depth); + } else { + alloc_cmd(cmd_ref, cmd_limit); + if (tile.tile.offset != 0) { + CmdBeginClip cmd_begin_clip; + cmd_begin_clip.tile_ref = tile.tile.offset; + cmd_begin_clip.backdrop = tile.backdrop; + Cmd_BeginClip_write(cmd_ref, cmd_begin_clip); + } else { + // TODO: here is where a bunch of optimization magic should happen + float alpha = tile.backdrop == 0 ? 0.0 : 1.0; + Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha)); + } + cmd_ref.offset += Cmd_size; + if (clip_depth < 32) { + clip_one_mask &= ~(1 << clip_depth); + } + } + clip_depth++; + break; + case Annotated_EndClip: + clip_depth--; + if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) { + alloc_cmd(cmd_ref, cmd_limit); + Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0)); + cmd_ref.offset += Cmd_size; + } + break; + case Annotated_Stroke: + tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + AnnoStroke stroke = Annotated_Stroke_read(ref); + CmdStroke cmd_stroke; + cmd_stroke.tile_ref = tile.tile.offset; + cmd_stroke.half_width = 0.5 * stroke.linewidth; + cmd_stroke.rgba_color = stroke.rgba_color; + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Stroke_write(cmd_ref, cmd_stroke); + cmd_ref.offset += Cmd_size; + break; } - cmd_ref.offset += Cmd_size; - break; - case Annotated_BeginClip: - tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] - + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); - alloc_cmd(cmd_ref, cmd_limit); - if (tile.tile.offset != 0) { - CmdBeginClip cmd_begin_clip; - cmd_begin_clip.tile_ref = tile.tile.offset; - cmd_begin_clip.backdrop = tile.backdrop; - Cmd_BeginClip_write(cmd_ref, cmd_begin_clip); - } else { - // TODO: here is where a bunch of optimization magic should happen - float alpha = tile.backdrop == 0 ? 0.0 : 1.0; - Cmd_BeginSolidClip_write(cmd_ref, CmdBeginSolidClip(alpha)); + } else { + // In "clip zero" state, suppress all drawing + switch (tag) { + case Annotated_BeginClip: + clip_depth++; + break; + case Annotated_EndClip: + if (clip_depth == clip_zero_depth) { + clip_zero_depth = 0; + } + clip_depth--; + break; } - cmd_ref.offset += Cmd_size; - break; - case Annotated_EndClip: - alloc_cmd(cmd_ref, cmd_limit); - Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0)); - cmd_ref.offset += Cmd_size; - break; - case Annotated_Stroke: - tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] - + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); - AnnoStroke stroke = Annotated_Stroke_read(ref); - CmdStroke cmd_stroke; - cmd_stroke.tile_ref = tile.tile.offset; - cmd_stroke.half_width = 0.5 * stroke.linewidth; - cmd_stroke.rgba_color = stroke.rgba_color; - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Stroke_write(cmd_ref, cmd_stroke); - cmd_ref.offset += Cmd_size; - break; } } barrier(); diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index bd014c2..215a97a 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 02c7f6f..a95183c 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -68,8 +68,8 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { TileSeg seg = TileSeg_read(tile_seg_ref); for (uint k = 0; k < CHUNK; k++) { vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY)); - vec2 start = seg.start - my_xy; - vec2 end = seg.end - my_xy; + vec2 start = seg.origin - my_xy; + vec2 end = start + seg.vector; vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0); if (window.x != window.y) { vec2 t = (window - start.y) / (end.y - start.y); @@ -82,7 +82,7 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); area[k] += a * (window.x - window.y); } - area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); + area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); } tile_seg_ref = seg.next; } while (tile_seg_ref.offset != 0); @@ -137,9 +137,9 @@ void main() { TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); do { TileSeg seg = TileSeg_read(tile_seg_ref); - vec2 line_vec = seg.end - seg.start; + vec2 line_vec = seg.vector; for (uint k = 0; k < CHUNK; k++) { - vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; + vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin; dpos.y += float(k * CHUNK_DY); float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); df[k] = min(df[k], length(line_vec * t - dpos)); diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index 6f3c33f..33ed4f8 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index 658af0e..eb3509b 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -101,12 +101,6 @@ void main() { if (element_ix < n_pathseg) { tag = PathSeg_tag(ref); } - // Setup for coverage algorithm. - float a, b, c; - // Bounding box of element in pixel coordinates. - float xmin, xmax, ymin, ymax; - PathStrokeLine line; - float dx; switch (tag) { case PathSeg_FillCubic: case PathSeg_StrokeCubic: @@ -162,22 +156,24 @@ void main() { } // Output line segment - xmin = min(p0.x, p1.x) - cubic.stroke.x; - xmax = max(p0.x, p1.x) + cubic.stroke.x; - ymin = min(p0.y, p1.y) - cubic.stroke.y; - ymax = max(p0.y, p1.y) + cubic.stroke.y; + + // Bounding box of element in pixel coordinates. + float xmin = min(p0.x, p1.x) - cubic.stroke.x; + float xmax = max(p0.x, p1.x) + cubic.stroke.x; + float ymin = min(p0.y, p1.y) - cubic.stroke.y; + float ymax = max(p0.y, p1.y) + cubic.stroke.y; float dx = p1.x - p0.x; float dy = p1.y - p0.y; // Set up for per-scanline coverage formula, below. float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy; - c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX; - b = invslope; // Note: assumes square tiles, otherwise scale. - a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; + float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX; + float b = invslope; // Note: assumes square tiles, otherwise scale. + float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; - int x0 = int(floor((xmin) * SX)); - int x1 = int(ceil((xmax) * SX)); - int y0 = int(floor((ymin) * SY)); - int y1 = int(ceil((ymax) * SY)); + int x0 = int(floor(xmin * SX)); + int x1 = int(floor(xmax * SX) + 1); + int y0 = int(floor(ymin * SY)); + int y1 = int(floor(ymax * SY) + 1); x0 = clamp(x0, bbox.x, bbox.z); y0 = clamp(y0, bbox.y, bbox.w); @@ -191,36 +187,69 @@ void main() { // Consider using subgroups to aggregate atomic add. uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size); TileSeg tile_seg; + + int xray = int(floor(p0.x*SX)); + int last_xray = int(floor(p1.x*SX)); + if (p0.y > p1.y) { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } for (int y = y0; y < y1; y++) { - float tile_y0 = float(y * TILE_HEIGHT_PX); - if (tag == PathSeg_FillCubic && min(p0.y, p1.y) <= tile_y0) { - int xray = max(int(ceil(xc - 0.5 * b)), bbox.x); - if (xray < bbox.z) { - int backdrop = p1.y < p0.y ? 1 : -1; - TileRef tile_ref = Tile_index(path.tiles, uint(base + xray)); - uint tile_el = tile_ref.offset >> 2; - atomicAdd(tile[tile_el + 1], backdrop); - } + int xbackdrop = max(xray + 1, bbox.x); + if (tag == PathSeg_FillCubic && y > y0 && xbackdrop < bbox.z) { + int backdrop = p1.y < p0.y ? 1 : -1; + TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop)); + uint tile_el = tile_ref.offset >> 2; + atomicAdd(tile[tile_el + 1], backdrop); } + int xx0 = clamp(int(floor(xc - c)), x0, x1); int xx1 = clamp(int(ceil(xc + c)), x0, x1); + xx1 = max(xx1, xray + 1); + + // next_xray is the xray for the next scanline; it is derived + // by left edge intersections computed below. + int next_xray = xray; for (int x = xx0; x < xx1; x++) { float tile_x0 = float(x * TILE_WIDTH_PX); TileRef tile_ref = Tile_index(path.tiles, uint(base + x)); uint tile_el = tile_ref.offset >> 2; uint old = atomicExchange(tile[tile_el], tile_offset); - tile_seg.start = p0; - tile_seg.end = p1; + tile_seg.origin = p0; + tile_seg.vector = p1 - p0; float y_edge = 0.0; if (tag == PathSeg_FillCubic) { + float tile_y0 = float(y * TILE_HEIGHT_PX); y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx); if (min(p0.x, p1.x) < tile_x0 && y_edge >= tile_y0 && y_edge < tile_y0 + TILE_HEIGHT_PX) { + // Left edge intersection. + vec2 p = vec2(tile_x0, y_edge); if (p0.x > p1.x) { - tile_seg.end = vec2(tile_x0, y_edge); + tile_seg.vector = p - p0; } else { - tile_seg.start = vec2(tile_x0, y_edge); + tile_seg.origin = p; + tile_seg.vector = p1 - p; } - } else { + // kernel4 uses sign(vector.x) for the sign of the intersection backdrop. + // Nudge zeroes towards the intended sign. + if (tile_seg.vector.x == 0) { + tile_seg.vector.x += sign(p1.x - p0.x)*1e-9; + } + // Move next_xray consistently with previous intersections. + if (x > next_xray && next_xray >= xray) { + next_xray = x; + } else if (x <= next_xray && next_xray <= xray) { + next_xray = x - 1; + } + } + // Force last xray on the last scanline for consistency with later + // line segments. + if (y == y1 - 1) { + next_xray = last_xray; + } + // Drop inconsistent intersections. + if (x <= min(xray, next_xray) || max(xray, next_xray) < x) { y_edge = 1e9; } } @@ -231,6 +260,7 @@ void main() { } xc += b; base += stride; + xray = next_xray; } n_out += 1; diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index f82a031..767bbda 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h index d7659ff..b6c5e14 100644 --- a/piet-gpu/shader/tile.h +++ b/piet-gpu/shader/tile.h @@ -35,8 +35,8 @@ TileRef Tile_index(TileRef ref, uint index) { } struct TileSeg { - vec2 start; - vec2 end; + vec2 origin; + vec2 vector; float y_edge; TileSegRef next; }; @@ -90,8 +90,8 @@ TileSeg TileSeg_read(TileSegRef ref) { uint raw4 = tile[ix + 4]; uint raw5 = tile[ix + 5]; TileSeg s; - s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); - s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.y_edge = uintBitsToFloat(raw4); s.next = TileSegRef(raw5); return s; @@ -99,10 +99,10 @@ TileSeg TileSeg_read(TileSegRef ref) { void TileSeg_write(TileSegRef ref, TileSeg s) { uint ix = ref.offset >> 2; - tile[ix + 0] = floatBitsToUint(s.start.x); - tile[ix + 1] = floatBitsToUint(s.start.y); - tile[ix + 2] = floatBitsToUint(s.end.x); - tile[ix + 3] = floatBitsToUint(s.end.y); + tile[ix + 0] = floatBitsToUint(s.origin.x); + tile[ix + 1] = floatBitsToUint(s.origin.y); + tile[ix + 2] = floatBitsToUint(s.vector.x); + tile[ix + 3] = floatBitsToUint(s.vector.y); tile[ix + 4] = floatBitsToUint(s.y_edge); tile[ix + 5] = s.next.offset; } diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index af4c3f5..1868888 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -99,9 +99,11 @@ fn render_cardioid(rc: &mut impl RenderContext) { fn render_clip_test(rc: &mut impl RenderContext) { const N: usize = 16; const X0: f64 = 50.0; - const Y0: f64 = 50.0; - const X1: f64 = 100.0; - const Y1: f64 = 100.0; + const Y0: f64 = 450.0; + // Note: if it gets much larger, it will exceed the 1MB scratch buffer. + // But this is a pretty demanding test. + const X1: f64 = 550.0; + const Y1: f64 = 950.0; let step = 1.0 / ((N + 1) as f64); for i in 0..N { let t = ((i + 1) as f64) * step;