diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs index 98c4d44..96c0ecc 100644 --- a/piet-gpu-types/src/ptcl.rs +++ b/piet-gpu-types/src/ptcl.rs @@ -20,7 +20,8 @@ piet_gpu! { rgba_color: u32, } struct CmdFill { - seg_ref: Ref, + // As above, really Ref + tile_ref: u32, backdrop: i32, rgba_color: u32, } diff --git a/piet-gpu-types/src/tile.rs b/piet-gpu-types/src/tile.rs index 5a28037..18318e3 100644 --- a/piet-gpu-types/src/tile.rs +++ b/piet-gpu-types/src/tile.rs @@ -15,6 +15,7 @@ piet_gpu! { struct TileSeg { start: [f32; 2], end: [f32; 2], + y_edge: f32, next: Ref, } } diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 5b293d3..df2f894 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -171,7 +171,7 @@ fn main() -> Result<(), Error> { let fence = device.create_fence(false)?; let mut cmd_buf = device.create_cmd_buf()?; - let query_pool = device.create_query_pool(7)?; + let query_pool = device.create_query_pool(8)?; let mut ctx = PietGpuRenderContext::new(); if let Some(input) = matches.value_of("INPUT") { @@ -204,9 +204,10 @@ fn main() -> Result<(), Error> { println!("Element kernel time: {:.3}ms", ts[0] * 1e3); println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); - println!("Binning kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); - println!("Coarse raster kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3); - println!("Render kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3); + println!("Backdrop kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); + println!("Binning kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3); + println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3); + println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3); /* let mut data: Vec = Default::default(); diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp new file mode 100644 index 0000000..c0f58c4 --- /dev/null +++ b/piet-gpu/shader/backdrop.comp @@ -0,0 +1,56 @@ +// Propagation of tile backdrop for filling. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "setup.h" + +#define BACKDROP_WG 256 + +layout(local_size_x = BACKDROP_WG, local_size_y = 1) in; + +layout(set = 0, binding = 0) buffer AnnotatedBuf { + uint[] annotated; +}; + +// This is really only used for n_elements; maybe we can handle that +// a different way, but it's convenient to have the same signature as +// tile allocation. +layout(set = 0, binding = 1) buffer AllocBuf { + uint n_elements; + uint n_pathseg; + uint alloc; +}; + +layout(set = 0, binding = 2) buffer TileBuf { + uint[] tile; +}; + +#include "annotated.h" +#include "tile.h" + +void main() { + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + + uint tag = Annotated_Nop; + if (element_ix < n_elements) { + tag = Annotated_tag(ref); + } + if (tag == Annotated_Fill) { + PathRef path_ref = PathRef(element_ix * Path_size); + Path path = Path_read(path_ref); + uint width = path.bbox.z - path.bbox.x; + uint height = path.bbox.w - path.bbox.y; + // slightly handrolling the tile structure here... + uint tile_el_ix = (path.tiles.offset >> 2) + 1; + for (uint y = 0; y < height; y++) { + uint sum = 0; + for (uint x = 0; x < width; x++) { + sum += tile[tile_el_ix]; + tile[tile_el_ix] = sum; + tile_el_ix += 2; + } + } + } +} diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv new file mode 100644 index 0000000..0b4828e Binary files /dev/null and b/piet-gpu/shader/backdrop.spv differ diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 27fcfe2..4f6e07f 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -18,6 +18,8 @@ build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h build path_coarse.spv: glsl path_coarse.comp | annotated.h tile.h setup.h +build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h + build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index e488fbf..eec0bfe 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -226,7 +226,7 @@ void main() { uint x = sh_tile_x0[el_ix] + seq_ix % width; uint y = sh_tile_y0[el_ix] + seq_ix / width; Tile tile = Tile_read(TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); - if (tile.tile.offset != 0) { + if (tile.tile.offset != 0 || tile.backdrop != 0) { uint el_slice = el_ix / 32; uint el_mask = 1 << (el_ix & 31); atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask); @@ -357,52 +357,26 @@ void main() { tag = Annotated_tag(ref); switch (tag) { - /* case Annotated_Fill: - if (last_chunk_n > 0 || seg_count > 0) { - SegChunkRef chunk_ref = SegChunkRef(0); - if (seg_count > 0) { - chunk_ref = alloc_seg_chunk(); - SegChunk chunk; - chunk.n = seg_count; - chunk.next = SegChunkRef(0); - uint seg_offset = seg_alloc + seg_start * Segment_size; - chunk.segs = SegmentRef(seg_offset); - SegChunk_write(chunk_ref, chunk); - } - if (last_chunk_n > 0) { - SegChunk chunk; - chunk.n = last_chunk_n; - chunk.next = chunk_ref; - chunk.segs = last_chunk_segs; - SegChunk_write(last_chunk_ref, chunk); - } else { - first_seg_chunk = chunk_ref; - } - - AnnoFill fill = Annotated_Fill_read(ref); - CmdFill cmd_fill; - cmd_fill.seg_ref = first_seg_chunk; - cmd_fill.backdrop = backdrop; - cmd_fill.rgba_color = fill.rgba_color; - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Fill_write(cmd_ref, cmd_fill); - cmd_ref.offset += Cmd_size; - last_chunk_n = 0; - } else if (backdrop != 0) { - AnnoFill fill = Annotated_Fill_read(ref); - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); - cmd_ref.offset += Cmd_size; - } - seg_start += seg_count; - seg_count = 0; - backdrop = 0; - break; - */ - case Annotated_Stroke: Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + AnnoFill fill = Annotated_Fill_read(ref); + alloc_cmd(cmd_ref, cmd_limit); + if (tile.tile.offset != 0) { + CmdFill cmd_fill; + cmd_fill.tile_ref = tile.tile.offset; + cmd_fill.backdrop = tile.backdrop; + cmd_fill.rgba_color = fill.rgba_color; + Cmd_Fill_write(cmd_ref, cmd_fill); + } else { + AnnoFill fill = Annotated_Fill_read(ref); + Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); + } + cmd_ref.offset += Cmd_size; + break; + case Annotated_Stroke: + tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); AnnoStroke stroke = Annotated_Stroke_read(ref); CmdStroke cmd_stroke; cmd_stroke.tile_ref = tile.tile.offset; diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index c5a304b..ad24e6b 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 0ecda68..e00320b 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -80,46 +80,40 @@ void main() { rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a); } break; - /* case Cmd_Fill: CmdFill fill = Cmd_Fill_read(cmd_ref); // Probably better to store as float, but conversion is no doubt cheap. float area[CHUNK]; for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop); - SegChunkRef fill_seg_chunk_ref = fill.seg_ref; + tile_seg_ref = TileSegRef(fill.tile_ref); do { - SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref); - SegmentRef segs = seg_chunk.segs; - for (int i = 0; i < seg_chunk.n; i++) { - Segment seg = Segment_read(Segment_index(segs, i)); - for (uint k = 0; k < CHUNK; k++) { - vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY)); - vec2 start = seg.start - my_xy; - vec2 end = seg.end - my_xy; - vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0); - if (window.x != window.y) { - vec2 t = (window - start.y) / (end.y - start.y); - vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y)); - float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6; - float xmax = max(xs.x, xs.y); - float b = min(xmax, 1.0); - float c = max(b, 0.0); - float d = max(xmin, 0.0); - float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); - area[k] += a * (window.x - window.y); - } - area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); + TileSeg seg = TileSeg_read(tile_seg_ref); + for (uint k = 0; k < CHUNK; k++) { + vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY)); + vec2 start = seg.start - my_xy; + vec2 end = seg.end - my_xy; + vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0); + if (window.x != window.y) { + vec2 t = (window - start.y) / (end.y - start.y); + vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y)); + float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0); + float c = max(b, 0.0); + float d = max(xmin, 0.0); + float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); + area[k] += a * (window.x - window.y); } + area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); } - fill_seg_chunk_ref = seg_chunk.next; - } while (fill_seg_chunk_ref.offset != 0); + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0); fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx; for (uint k = 0; k < CHUNK; k++) { float alpha = min(abs(area[k]), 1.0); rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a); } break; - */ case Cmd_Solid: CmdSolid solid = Cmd_Solid_read(cmd_ref); fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx; diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index cb27407..52ba572 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index 9abcbf0..0b3bc8f 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -36,9 +36,11 @@ layout(set = 0, binding = 2) buffer TileBuf { shared uint sh_tile_count[COARSE_WG]; shared uint sh_width[COARSE_WG]; shared uint sh_draw_width[COARSE_WG]; +shared uint sh_tag[COARSE_WG]; shared vec2 sh_p0[COARSE_WG]; shared vec2 sh_p1[COARSE_WG]; shared int sh_x0[COARSE_WG]; +shared int sh_bbox_x1[COARSE_WG]; shared int sh_y0[COARSE_WG]; shared float sh_a[COARSE_WG]; shared float sh_b[COARSE_WG]; @@ -56,6 +58,7 @@ void main() { if (element_ix < n_pathseg) { tag = PathSeg_tag(ref); } + sh_tag[th_ix] = tag; // Setup for coverage algorithm. float a, b, c; // Bounding box of element in pixel coordinates. @@ -96,6 +99,7 @@ void main() { x1 = clamp(x1, bbox.x, bbox.z); y1 = clamp(y1, bbox.y, bbox.w); sh_x0[th_ix] = x0; + sh_bbox_x1[th_ix] = bbox.z; // TODO: can get rid of this (fold into base), with care (also need to update `a`) sh_y0[th_ix] = y0; int stride = bbox.z - bbox.x; @@ -138,7 +142,8 @@ void main() { int x1 = x0 + int(sh_width[el_ix]); int dx = int(seq_ix % draw_width); uint y = sh_y0[el_ix] + seq_ix / draw_width; - float t = sh_a[el_ix] + sh_b[el_ix] * float(y); + float b = sh_b[el_ix]; + float t = sh_a[el_ix] + b * float(y); float c = sh_c[el_ix]; int xx0 = clamp(int(floor(t - c)), x0, x1); int xx1 = clamp(int(ceil(t + c)), x0, x1); @@ -148,8 +153,34 @@ void main() { uint tile_el = (sh_base[el_ix] + uint(y * sh_stride[el_ix] + x) * Tile_size) >> 2; uint old = atomicExchange(tile[tile_el], tile_offset); TileSeg tile_seg; - tile_seg.start = sh_p0[el_ix]; - tile_seg.end = sh_p1[el_ix]; + vec2 p0 = sh_p0[el_ix]; + vec2 p1 = sh_p1[el_ix]; + float y_edge = 0.0; + if (sh_tag[el_ix] == PathSeg_FillLine) { + vec2 tile_xy = vec2(x * TILE_WIDTH_PX, y * TILE_HEIGHT_PX); + if (dx == 0 && min(p0.y, p1.y) <= tile_xy.y) { + // TODO: need a little more work to make sure this triggers even + // when line is to the left of bbox. + int xray = max(int(ceil(t - 0.5 * b)), x0); + if (xray < sh_bbox_x1[el_ix]) { + int backdrop = p1.y < p0.y ? 1 : -1; + atomicAdd(tile[tile_el + 1 + 2 * (xray - x)], backdrop); + } + } + y_edge = mix(p0.y, p1.y, (tile_xy.x - p0.x) / (p1.x - p0.x)); + if (min(p0.x, p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) { + if (p0.x > p1.x) { + p1 = vec2(tile_xy.x, y_edge); + } else { + p0 = vec2(tile_xy.x, y_edge); + } + } else { + y_edge = 1e9; + } + } + tile_seg.start = p0; + tile_seg.end = p1; + tile_seg.y_edge = y_edge; tile_seg.next.offset = old; TileSeg_write(TileSegRef(tile_offset), tile_seg); } diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index 53cb759..8c4801b 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h index d337598..0c20a89 100644 --- a/piet-gpu/shader/ptcl.h +++ b/piet-gpu/shader/ptcl.h @@ -80,7 +80,7 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) { } struct CmdFill { - SegChunkRef seg_ref; + uint tile_ref; int backdrop; uint rgba_color; }; @@ -239,7 +239,7 @@ CmdFill CmdFill_read(CmdFillRef ref) { uint raw1 = ptcl[ix + 1]; uint raw2 = ptcl[ix + 2]; CmdFill s; - s.seg_ref = SegChunkRef(raw0); + s.tile_ref = raw0; s.backdrop = int(raw1); s.rgba_color = raw2; return s; @@ -247,7 +247,7 @@ CmdFill CmdFill_read(CmdFillRef ref) { void CmdFill_write(CmdFillRef ref, CmdFill s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.seg_ref.offset; + ptcl[ix + 0] = s.tile_ref; ptcl[ix + 1] = uint(s.backdrop); ptcl[ix + 2] = s.rgba_color; } diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h index b4a8c9b..d7659ff 100644 --- a/piet-gpu/shader/tile.h +++ b/piet-gpu/shader/tile.h @@ -37,10 +37,11 @@ TileRef Tile_index(TileRef ref, uint index) { struct TileSeg { vec2 start; vec2 end; + float y_edge; TileSegRef next; }; -#define TileSeg_size 20 +#define TileSeg_size 24 TileSegRef TileSeg_index(TileSegRef ref, uint index) { return TileSegRef(ref.offset + index * TileSeg_size); @@ -87,10 +88,12 @@ TileSeg TileSeg_read(TileSegRef ref) { uint raw2 = tile[ix + 2]; uint raw3 = tile[ix + 3]; uint raw4 = tile[ix + 4]; + uint raw5 = tile[ix + 5]; TileSeg s; s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); - s.next = TileSegRef(raw4); + s.y_edge = uintBitsToFloat(raw4); + s.next = TileSegRef(raw5); return s; } @@ -100,6 +103,7 @@ void TileSeg_write(TileSegRef ref, TileSeg s) { tile[ix + 1] = floatBitsToUint(s.start.y); tile[ix + 2] = floatBitsToUint(s.end.x); tile[ix + 3] = floatBitsToUint(s.end.y); - tile[ix + 4] = s.next.offset; + tile[ix + 4] = floatBitsToUint(s.y_edge); + tile[ix + 5] = s.next.offset; } diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 63d80fa..1e839d2 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -57,8 +57,8 @@ pub fn render_scene(rc: &mut impl RenderContext) { let circle = Circle::new(center, radius); rc.fill(circle, &color); } - /* let mut path = BezPath::new(); + /* path.move_to((100.0, 1150.0)); path.line_to((200.0, 1200.0)); path.line_to((150.0, 1250.0)); @@ -143,6 +143,9 @@ pub struct Renderer { path_pipeline: D::Pipeline, path_ds: D::DescriptorSet, + backdrop_pipeline: D::Pipeline, + backdrop_ds: D::DescriptorSet, + tile_alloc_buf_host: D::Buffer, tile_alloc_buf_dev: D::Buffer, @@ -224,6 +227,14 @@ impl Renderer { &[], )?; + let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv"); + let backdrop_pipeline = device.create_simple_compute_pipeline(backdrop_alloc_code, 3, 0)?; + let backdrop_ds = device.create_descriptor_set( + &backdrop_pipeline, + &[&anno_buf, &tile_alloc_buf_dev, &tile_buf], + &[], + )?; + let bin_alloc_buf_host = device.create_buffer(12, host)?; let bin_alloc_buf_dev = device.create_buffer(12, dev)?; @@ -275,6 +286,8 @@ impl Renderer { tile_ds, path_pipeline, path_ds, + backdrop_pipeline, + backdrop_ds, bin_pipeline, bin_ds, coarse_pipeline, @@ -333,6 +346,13 @@ impl Renderer { (((self.n_pathseg + 31) / 32) as u32, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 3); + cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &self.backdrop_pipeline, + &self.backdrop_ds, + (((self.n_paths + 255) / 256) as u32, 1, 1), + ); + cmd_buf.write_timestamp(&query_pool, 4); // Note: this barrier is not needed as an actual dependency between // pipeline stages, but I am keeping it in so that timer queries are // easier to interpret. @@ -342,21 +362,21 @@ impl Renderer { &self.bin_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), ); - cmd_buf.write_timestamp(&query_pool, 4); + cmd_buf.write_timestamp(&query_pool, 5); cmd_buf.memory_barrier(); cmd_buf.dispatch( &self.coarse_pipeline, &self.coarse_ds, (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1), ); - cmd_buf.write_timestamp(&query_pool, 5); + cmd_buf.write_timestamp(&query_pool, 6); cmd_buf.memory_barrier(); cmd_buf.dispatch( &self.k4_pipeline, &self.k4_ds, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); - cmd_buf.write_timestamp(&query_pool, 6); + cmd_buf.write_timestamp(&query_pool, 7); cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs index 0aac61a..140c42d 100644 --- a/piet-gpu/src/pico_svg.rs +++ b/piet-gpu/src/pico_svg.rs @@ -49,8 +49,8 @@ impl PicoSvg { for item in &self.items { match item { Item::Fill(fill_item) => { - //rc.fill(&fill_item.path, &fill_item.color); - rc.stroke(&fill_item.path, &fill_item.color, 1.0); + rc.fill(&fill_item.path, &fill_item.color); + //rc.stroke(&fill_item.path, &fill_item.color, 1.0); } Item::Stroke(stroke_item) => { rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);