diff --git a/piet-gpu-types/src/annotated.rs b/piet-gpu-types/src/annotated.rs index f7a6ad6..d53d870 100644 --- a/piet-gpu-types/src/annotated.rs +++ b/piet-gpu-types/src/annotated.rs @@ -3,9 +3,11 @@ use piet_gpu_derive::piet_gpu; piet_gpu! { #[gpu_write] mod annotated { + // Note: path segments have moved to pathseg, delete these. struct AnnoFillLineSeg { p0: [f32; 2], p1: [f32; 2], + path_ix: u32, // A note: the layout of this struct is shared with // AnnoStrokeLineSeg. In that case, we actually write // [0.0, 0.0] as the stroke field, to minimize divergence. @@ -13,6 +15,7 @@ piet_gpu! { struct AnnoStrokeLineSeg { p0: [f32; 2], p1: [f32; 2], + path_ix: u32, // halfwidth in both x and y for binning stroke: [f32; 2], } diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs index 75a7731..62450d2 100644 --- a/piet-gpu-types/src/lib.rs +++ b/piet-gpu-types/src/lib.rs @@ -3,8 +3,10 @@ pub mod annotated; pub mod bins; pub mod encoder; +pub mod pathseg; pub mod ptcl; pub mod scene; pub mod state; pub mod test; +pub mod tile; pub mod tilegroup; diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs index 9c40051..7913c5f 100644 --- a/piet-gpu-types/src/main.rs +++ b/piet-gpu-types/src/main.rs @@ -7,7 +7,9 @@ fn main() { "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()), "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()), + "pathseg" => print!("{}", piet_gpu_types::pathseg::gen_gpu_pathseg()), "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()), + "tile" => print!("{}", piet_gpu_types::tile::gen_gpu_tile()), "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()), "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()), diff --git a/piet-gpu-types/src/pathseg.rs b/piet-gpu-types/src/pathseg.rs new file mode 100644 index 0000000..5ad382b --- 
/dev/null +++ b/piet-gpu-types/src/pathseg.rs @@ -0,0 +1,46 @@ +use piet_gpu_derive::piet_gpu; + +piet_gpu! { + #[gpu_write] + mod pathseg { + struct PathFillLine { + p0: [f32; 2], + p1: [f32; 2], + path_ix: u32, + // A note: the layout of this struct is shared with + // PathStrokeLine. In that case, we actually write + // [0.0, 0.0] as the stroke field, to minimize divergence. + } + struct PathStrokeLine { + p0: [f32; 2], + p1: [f32; 2], + path_ix: u32, + // halfwidth in both x and y for binning + stroke: [f32; 2], + } + /* + struct PathQuad { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + stroke: [f32; 2], + } + struct PathCubic { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + p3: [f32; 2], + stroke: [f32; 2], + } + */ + enum PathSeg { + Nop, + FillLine(PathFillLine), + StrokeLine(PathStrokeLine), + /* + Quad(AnnoQuadSeg), + Cubic(AnnoCubicSeg), + */ + } + } +} diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs index 35076f0..602fab9 100644 --- a/piet-gpu-types/src/state.rs +++ b/piet-gpu-types/src/state.rs @@ -9,6 +9,8 @@ piet_gpu! { bbox: [f32; 4], linewidth: f32, flags: u32, + path_count: u32, + pathseg_count: u32, } } } diff --git a/piet-gpu-types/src/tile.rs b/piet-gpu-types/src/tile.rs new file mode 100644 index 0000000..5a28037 --- /dev/null +++ b/piet-gpu-types/src/tile.rs @@ -0,0 +1,21 @@ +use piet_gpu_derive::piet_gpu; + +piet_gpu! { + #[gpu_write] + mod tile { + struct Path { + bbox: [u16; 4], + tiles: Ref<Tile>, + } + struct Tile { + tile: Ref<TileSeg>, + backdrop: i32, + } + // Segments within a tile are represented as a linked list. 
+ struct TileSeg { + start: [f32; 2], + end: [f32; 2], + next: Ref<TileSeg>, + } + } +} diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 31024aa..04a20ba 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -185,10 +185,12 @@ fn main() -> Result<(), Error> { } else { render_scene(&mut ctx); } + let n_paths = ctx.path_count(); + let n_pathseg = ctx.pathseg_count(); let scene = ctx.get_scene_buf(); //dump_scene(&scene); - let renderer = Renderer::new(&device, scene)?; + let renderer = Renderer::new(&device, scene, n_paths, n_pathseg)?; let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?; @@ -200,16 +202,16 @@ fn main() -> Result<(), Error> { device.wait_and_reset(&[fence])?; let ts = device.reap_query_pool(&query_pool).unwrap(); println!("Element kernel time: {:.3}ms", ts[0] * 1e3); - println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); - println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); - println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); - + println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); + println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); /* + println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); + */ + let mut data: Vec = Default::default(); - device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap(); + device.read_buffer(&renderer.tile_buf, &mut data).unwrap(); piet_gpu::dump_k1_data(&data); //trace_ptcl(&data); - */ let mut img_data: Vec = Default::default(); // Note: because png can use a `&[u8]` slice, we could avoid an extra copy diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs index fd30fa3..3568732 100644 --- a/piet-gpu/bin/winit.rs +++ b/piet-gpu/bin/winit.rs @@ -42,9 +42,11 @@ fn main() -> Result<(), Error> { let mut ctx = PietGpuRenderContext::new(); render_scene(&mut ctx); + let n_paths = ctx.path_count(); + let n_pathseg = ctx.pathseg_count(); let scene = ctx.get_scene_buf(); - let renderer = 
Renderer::new(&device, scene)?; + let renderer = Renderer::new(&device, scene, n_paths, n_pathseg)?; event_loop.run(move |event, _, control_flow| { *control_flow = ControlFlow::Poll; // `ControlFlow::Wait` if only re-render on event diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h index 9812264..f243fab 100644 --- a/piet-gpu/shader/annotated.h +++ b/piet-gpu/shader/annotated.h @@ -31,9 +31,10 @@ struct AnnotatedRef { struct AnnoFillLineSeg { vec2 p0; vec2 p1; + uint path_ix; }; -#define AnnoFillLineSeg_size 16 +#define AnnoFillLineSeg_size 20 AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) { return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size); @@ -42,10 +43,11 @@ AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) { struct AnnoStrokeLineSeg { vec2 p0; vec2 p1; + uint path_ix; vec2 stroke; }; -#define AnnoStrokeLineSeg_size 24 +#define AnnoStrokeLineSeg_size 28 AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) { return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size); @@ -120,9 +122,11 @@ AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) { uint raw1 = annotated[ix + 1]; uint raw2 = annotated[ix + 2]; uint raw3 = annotated[ix + 3]; + uint raw4 = annotated[ix + 4]; AnnoFillLineSeg s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.path_ix = raw4; return s; } @@ -132,6 +136,7 @@ void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) { annotated[ix + 1] = floatBitsToUint(s.p0.y); annotated[ix + 2] = floatBitsToUint(s.p1.x); annotated[ix + 3] = floatBitsToUint(s.p1.y); + annotated[ix + 4] = s.path_ix; } AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) { @@ -142,10 +147,12 @@ AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) { uint raw3 = annotated[ix + 3]; uint raw4 = annotated[ix + 4]; uint raw5 = 
annotated[ix + 5]; + uint raw6 = annotated[ix + 6]; AnnoStrokeLineSeg s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); - s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.path_ix = raw4; + s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6)); return s; } @@ -155,8 +162,9 @@ void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) { annotated[ix + 1] = floatBitsToUint(s.p0.y); annotated[ix + 2] = floatBitsToUint(s.p1.x); annotated[ix + 3] = floatBitsToUint(s.p1.y); - annotated[ix + 4] = floatBitsToUint(s.stroke.x); - annotated[ix + 5] = floatBitsToUint(s.stroke.y); + annotated[ix + 4] = s.path_ix; + annotated[ix + 5] = floatBitsToUint(s.stroke.x); + annotated[ix + 6] = floatBitsToUint(s.stroke.y); } AnnoQuadSeg AnnoQuadSeg_read(AnnoQuadSegRef ref) { diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index 6ea0877..524f9e4 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 14c72aa..27fcfe2 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -14,6 +14,10 @@ build elements.spv: glsl elements.comp | scene.h state.h annotated.h build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h +build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h + +build path_coarse.spv: glsl path_coarse.comp | annotated.h tile.h setup.h + build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index 5a43f4a..4b7e1c4 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index 43bb9cc..230b710 100644 --- a/piet-gpu/shader/elements.comp +++ 
b/piet-gpu/shader/elements.comp @@ -30,9 +30,15 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf { uint[] annotated; }; +// Path segments are stored here. +layout(set = 0, binding = 3) buffer PathSegBuf { + uint[] pathseg; +}; + #include "scene.h" #include "state.h" #include "annotated.h" +#include "pathseg.h" #define StateBuf_stride (8 + 2 * State_size) @@ -83,6 +89,8 @@ State combine_state(State a, State b) { c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags; c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1; + c.path_count = a.path_count + b.path_count; + c.pathseg_count = a.pathseg_count + b.pathseg_count; return c; } @@ -96,6 +104,8 @@ State map_element(ElementRef ref, inout bool is_fill) { c.translate = vec2(0.0, 0.0); c.linewidth = 1.0; // TODO should be 0.0 c.flags = 0; + c.path_count = 0; + c.pathseg_count = 0; is_fill = false; switch (tag) { case Element_FillLine: @@ -103,22 +113,26 @@ State map_element(ElementRef ref, inout bool is_fill) { LineSeg line = Element_FillLine_read(ref); c.bbox.xy = min(line.p0, line.p1); c.bbox.zw = max(line.p0, line.p1); + c.pathseg_count = 1; break; case Element_Quad: QuadSeg quad = Element_Quad_read(ref); c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2); c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2); + c.pathseg_count = 1; break; case Element_Cubic: CubicSeg cubic = Element_Cubic_read(ref); c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3)); c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); + c.pathseg_count = 1; break; case Element_Fill: is_fill = true; // fall-through case Element_Stroke: c.flags = FLAG_RESET_BBOX; + c.path_count = 1; break; case Element_SetLineWidth: SetLineWidth lw = Element_SetLineWidth_read(ref); @@ -148,6 +162,8 @@ shared vec2 sh_translate[WG_SIZE]; shared vec4 sh_bbox[WG_SIZE]; shared float sh_width[WG_SIZE]; shared uint sh_flags[WG_SIZE]; +shared uint 
sh_path_count[WG_SIZE]; +shared uint sh_pathseg_count[WG_SIZE]; shared uint sh_min_fill; @@ -187,6 +203,8 @@ void main() { sh_bbox[gl_LocalInvocationID.x] = agg.bbox; sh_width[gl_LocalInvocationID.x] = agg.linewidth; sh_flags[gl_LocalInvocationID.x] = agg.flags; + sh_path_count[gl_LocalInvocationID.x] = agg.path_count; + sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count; for (uint i = 0; i < LG_WG_SIZE; i++) { barrier(); if (gl_LocalInvocationID.x >= (1 << i)) { @@ -197,6 +215,8 @@ void main() { other.bbox = sh_bbox[ix]; other.linewidth = sh_width[ix]; other.flags = sh_flags[ix]; + other.path_count = sh_path_count[ix]; + other.pathseg_count = sh_pathseg_count[ix]; agg = combine_state(other, agg); } barrier(); @@ -205,6 +225,8 @@ void main() { sh_bbox[gl_LocalInvocationID.x] = agg.bbox; sh_width[gl_LocalInvocationID.x] = agg.linewidth; sh_flags[gl_LocalInvocationID.x] = agg.flags; + sh_path_count[gl_LocalInvocationID.x] = agg.path_count; + sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count; } State exclusive; @@ -213,6 +235,8 @@ void main() { exclusive.translate = vec2(0.0, 0.0); exclusive.linewidth = 1.0; //TODO should be 0.0 exclusive.flags = 0; + exclusive.path_count = 0; + exclusive.pathseg_count = 0; // Publish aggregate for this partition if (gl_LocalInvocationID.x == WG_SIZE - 1) { @@ -266,6 +290,8 @@ void main() { other.bbox = sh_bbox[ix]; other.linewidth = sh_width[ix]; other.flags = sh_flags[ix]; + other.path_count = sh_path_count[ix]; + other.pathseg_count = sh_pathseg_count[ix]; row = combine_state(row, other); } if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) { @@ -284,25 +310,26 @@ void main() { // gains to be had from stashing in shared memory or possibly // registers (though register pressure is an issue). 
ElementRef this_ref = Element_index(ref, i); - AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size); uint tag = Element_tag(this_ref); switch (tag) { case Element_FillLine: case Element_StrokeLine: LineSeg line = Element_StrokeLine_read(this_ref); - AnnoStrokeLineSeg anno_line; - anno_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate; - anno_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate; + PathStrokeLine path_line; + path_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate; + path_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate; + path_line.path_ix = st.path_count; if (tag == Element_StrokeLine) { - anno_line.stroke = get_linewidth(st); + path_line.stroke = get_linewidth(st); } else { - anno_line.stroke = vec2(0.0); + path_line.stroke = vec2(0.0); } // We do encoding a bit by hand to minimize divergence. Another approach // would be to have a fill/stroke bool. - uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine; - annotated[out_ref.offset >> 2] = out_tag; - AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line); + PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size); + uint out_tag = tag == Element_FillLine ? 
PathSeg_FillLine : PathSeg_StrokeLine; + pathseg[path_out_ref.offset >> 2] = out_tag; + PathStrokeLine_write(PathStrokeLineRef(path_out_ref.offset + 4), path_line); break; case Element_Stroke: Stroke stroke = Element_Stroke_read(this_ref); @@ -311,6 +338,7 @@ void main() { vec2 lw = get_linewidth(st); anno_stroke.bbox = st.bbox + vec4(-lw, lw); anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z); + AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size); Annotated_Stroke_write(out_ref, anno_stroke); break; case Element_Fill: @@ -318,11 +346,9 @@ void main() { AnnoFill anno_fill; anno_fill.rgba_color = fill.rgba_color; anno_fill.bbox = st.bbox; + out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size); Annotated_Fill_write(out_ref, anno_fill); break; - default: - Annotated_Nop_write(out_ref); - break; } } } diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index a2d439c..18f4dc5 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp new file mode 100644 index 0000000..ff79925 --- /dev/null +++ b/piet-gpu/shader/path_coarse.comp @@ -0,0 +1,107 @@ +// Coarse rasterization of path segments. + +// Allocates tile segments for each path segment and links them into the per-tile lists. 
+ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "setup.h" + +#define TILE_ALLOC_WG 32 + +layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in; + +layout(set = 0, binding = 0) buffer PathSegBuf { + uint[] pathseg; +}; + +layout(set = 0, binding = 1) buffer AllocBuf { + uint n_paths; + uint n_pathseg; + uint alloc; +}; + +layout(set = 0, binding = 2) buffer TileBuf { + uint[] tile; +}; + +#include "pathseg.h" +#include "tile.h" + +// scale factors useful for converting coordinates to tiles +#define SX (1.0 / float(TILE_WIDTH_PX)) +#define SY (1.0 / float(TILE_HEIGHT_PX)) + +void main() { + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef ref = PathSegRef(element_ix * PathSeg_size); + + uint tag = PathSeg_Nop; + if (element_ix < n_pathseg) { + tag = PathSeg_tag(ref); + } + // Setup for coverage algorithm. + float a, b, c; + // Bounding box of element in pixel coordinates. + float xmin, xmax, ymin, ymax; + PathStrokeLine line; + switch (tag) { + case PathSeg_FillLine: + case PathSeg_StrokeLine: + line = PathSeg_StrokeLine_read(ref); + xmin = min(line.p0.x, line.p1.x) - line.stroke.x; + xmax = max(line.p0.x, line.p1.x) + line.stroke.x; + ymin = min(line.p0.y, line.p1.y) - line.stroke.y; + ymax = max(line.p0.y, line.p1.y) + line.stroke.y; + float dx = line.p1.x - line.p0.x; + float dy = line.p1.y - line.p0.y; + // Set up for per-scanline coverage formula, below. + float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy; + c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX; + b = invslope; // Note: assumes square tiles, otherwise scale. 
+ a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; + break; + } + int x0 = int(floor((xmin) * SX)); + int x1 = int(ceil((xmax) * SX)); + int y0 = int(floor((ymin) * SY)); + int y1 = int(ceil((ymax) * SY)); + + uint path_ix = line.path_ix; + Path path = Path_read(PathRef(path_ix * Path_size)); + ivec4 bbox = ivec4(path.bbox); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float t = a + b * float(y0); + int stride = bbox.z - bbox.x; + int base = (y0 - bbox.y) * stride - bbox.x; + // TODO: can be tighter, use c to bound width + uint n_tile_alloc = uint(stride * (bbox.w - bbox.y)); + // Consider using subgroups to aggregate atomic add. + uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size); + TileSeg tile_seg; + tile_seg.start = line.p0; + tile_seg.end = line.p1; + for (int y = y0; y < y1; y++) { + int xx0 = clamp(int(floor(t - c)), x0, x1); + int xx1 = clamp(int(ceil(t + c)), x0, x1); + for (int x = xx0; x < xx1; x++) { + TileRef tile_ref = Tile_index(path.tiles, uint(base + x)); + uint tile_el = tile_ref.offset >> 2; + uint old; + uint actual; + do { + old = tile[tile_el]; + actual = atomicCompSwap(tile[tile_el], old, tile_offset); + } while (actual != old); + tile_seg.next.offset = old; + TileSeg_write(TileSegRef(tile_offset), tile_seg); + tile_offset += TileSeg_size; + } + // TODO for fills: backdrop + t += b; + base += stride; + } +} diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv new file mode 100644 index 0000000..ed212d7 Binary files /dev/null and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/pathseg.h b/piet-gpu/shader/pathseg.h new file mode 100644 index 0000000..dc36d7e --- /dev/null +++ b/piet-gpu/shader/pathseg.h @@ -0,0 +1,125 @@ +// Code auto-generated by piet-gpu-derive + +struct PathFillLineRef { + uint offset; +}; + +struct PathStrokeLineRef { + uint offset; +}; + +struct 
PathSegRef { + uint offset; +}; + +struct PathFillLine { + vec2 p0; + vec2 p1; + uint path_ix; +}; + +#define PathFillLine_size 20 + +PathFillLineRef PathFillLine_index(PathFillLineRef ref, uint index) { + return PathFillLineRef(ref.offset + index * PathFillLine_size); +} + +struct PathStrokeLine { + vec2 p0; + vec2 p1; + uint path_ix; + vec2 stroke; +}; + +#define PathStrokeLine_size 28 + +PathStrokeLineRef PathStrokeLine_index(PathStrokeLineRef ref, uint index) { + return PathStrokeLineRef(ref.offset + index * PathStrokeLine_size); +} + +#define PathSeg_Nop 0 +#define PathSeg_FillLine 1 +#define PathSeg_StrokeLine 2 +#define PathSeg_size 32 + +PathSegRef PathSeg_index(PathSegRef ref, uint index) { + return PathSegRef(ref.offset + index * PathSeg_size); +} + +PathFillLine PathFillLine_read(PathFillLineRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = pathseg[ix + 0]; + uint raw1 = pathseg[ix + 1]; + uint raw2 = pathseg[ix + 2]; + uint raw3 = pathseg[ix + 3]; + uint raw4 = pathseg[ix + 4]; + PathFillLine s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.path_ix = raw4; + return s; +} + +void PathFillLine_write(PathFillLineRef ref, PathFillLine s) { + uint ix = ref.offset >> 2; + pathseg[ix + 0] = floatBitsToUint(s.p0.x); + pathseg[ix + 1] = floatBitsToUint(s.p0.y); + pathseg[ix + 2] = floatBitsToUint(s.p1.x); + pathseg[ix + 3] = floatBitsToUint(s.p1.y); + pathseg[ix + 4] = s.path_ix; +} + +PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = pathseg[ix + 0]; + uint raw1 = pathseg[ix + 1]; + uint raw2 = pathseg[ix + 2]; + uint raw3 = pathseg[ix + 3]; + uint raw4 = pathseg[ix + 4]; + uint raw5 = pathseg[ix + 5]; + uint raw6 = pathseg[ix + 6]; + PathStrokeLine s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.path_ix = raw4; + s.stroke = 
vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6)); + return s; +} + +void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) { + uint ix = ref.offset >> 2; + pathseg[ix + 0] = floatBitsToUint(s.p0.x); + pathseg[ix + 1] = floatBitsToUint(s.p0.y); + pathseg[ix + 2] = floatBitsToUint(s.p1.x); + pathseg[ix + 3] = floatBitsToUint(s.p1.y); + pathseg[ix + 4] = s.path_ix; + pathseg[ix + 5] = floatBitsToUint(s.stroke.x); + pathseg[ix + 6] = floatBitsToUint(s.stroke.y); +} + +uint PathSeg_tag(PathSegRef ref) { + return pathseg[ref.offset >> 2]; +} + +PathFillLine PathSeg_FillLine_read(PathSegRef ref) { + return PathFillLine_read(PathFillLineRef(ref.offset + 4)); +} + +PathStrokeLine PathSeg_StrokeLine_read(PathSegRef ref) { + return PathStrokeLine_read(PathStrokeLineRef(ref.offset + 4)); +} + +void PathSeg_Nop_write(PathSegRef ref) { + pathseg[ref.offset >> 2] = PathSeg_Nop; +} + +void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) { + pathseg[ref.offset >> 2] = PathSeg_FillLine; + PathFillLine_write(PathFillLineRef(ref.offset + 4), s); +} + +void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) { + pathseg[ref.offset >> 2] = PathSeg_StrokeLine; + PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s); +} + diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index b913086..03b3353 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -31,6 +31,7 @@ // TODO: compute all these #define WIDTH_IN_TILES 128 +#define HEIGHT_IN_TILES 96 #define TILEGROUP_WIDTH_TILES 32 #define TILE_WIDTH_PX 16 #define TILE_HEIGHT_PX 16 diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h index 2547b93..eacab52 100644 --- a/piet-gpu/shader/state.h +++ b/piet-gpu/shader/state.h @@ -10,9 +10,11 @@ struct State { vec4 bbox; float linewidth; uint flags; + uint path_count; + uint pathseg_count; }; -#define State_size 48 +#define State_size 56 StateRef State_index(StateRef ref, uint index) { return StateRef(ref.offset + index * 
State_size); @@ -32,12 +34,16 @@ State State_read(StateRef ref) { uint raw9 = state[ix + 9]; uint raw10 = state[ix + 10]; uint raw11 = state[ix + 11]; + uint raw12 = state[ix + 12]; + uint raw13 = state[ix + 13]; State s; s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9)); s.linewidth = uintBitsToFloat(raw10); s.flags = raw11; + s.path_count = raw12; + s.pathseg_count = raw13; return s; } @@ -55,5 +61,7 @@ void State_write(StateRef ref, State s) { state[ix + 9] = floatBitsToUint(s.bbox.w); state[ix + 10] = floatBitsToUint(s.linewidth); state[ix + 11] = s.flags; + state[ix + 12] = s.path_count; + state[ix + 13] = s.pathseg_count; } diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h new file mode 100644 index 0000000..b4a8c9b --- /dev/null +++ b/piet-gpu/shader/tile.h @@ -0,0 +1,105 @@ +// Code auto-generated by piet-gpu-derive + +struct PathRef { + uint offset; +}; + +struct TileRef { + uint offset; +}; + +struct TileSegRef { + uint offset; +}; + +struct Path { + uvec4 bbox; + TileRef tiles; +}; + +#define Path_size 12 + +PathRef Path_index(PathRef ref, uint index) { + return PathRef(ref.offset + index * Path_size); +} + +struct Tile { + TileSegRef tile; + int backdrop; +}; + +#define Tile_size 8 + +TileRef Tile_index(TileRef ref, uint index) { + return TileRef(ref.offset + index * Tile_size); +} + +struct TileSeg { + vec2 start; + vec2 end; + TileSegRef next; +}; + +#define TileSeg_size 20 + +TileSegRef TileSeg_index(TileSegRef ref, uint index) { + return TileSegRef(ref.offset + index * TileSeg_size); +} + +Path Path_read(PathRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = tile[ix + 0]; + uint raw1 = tile[ix + 1]; + uint raw2 = tile[ix + 2]; + Path s; + s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16); + 
s.tiles = TileRef(raw2); + return s; +} + +void Path_write(PathRef ref, Path s) { + uint ix = ref.offset >> 2; + tile[ix + 0] = s.bbox.x | (s.bbox.y << 16); + tile[ix + 1] = s.bbox.z | (s.bbox.w << 16); + tile[ix + 2] = s.tiles.offset; +} + +Tile Tile_read(TileRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = tile[ix + 0]; + uint raw1 = tile[ix + 1]; + Tile s; + s.tile = TileSegRef(raw0); + s.backdrop = int(raw1); + return s; +} + +void Tile_write(TileRef ref, Tile s) { + uint ix = ref.offset >> 2; + tile[ix + 0] = s.tile.offset; + tile[ix + 1] = uint(s.backdrop); +} + +TileSeg TileSeg_read(TileSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = tile[ix + 0]; + uint raw1 = tile[ix + 1]; + uint raw2 = tile[ix + 2]; + uint raw3 = tile[ix + 3]; + uint raw4 = tile[ix + 4]; + TileSeg s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.next = TileSegRef(raw4); + return s; +} + +void TileSeg_write(TileSegRef ref, TileSeg s) { + uint ix = ref.offset >> 2; + tile[ix + 0] = floatBitsToUint(s.start.x); + tile[ix + 1] = floatBitsToUint(s.start.y); + tile[ix + 2] = floatBitsToUint(s.end.x); + tile[ix + 3] = floatBitsToUint(s.end.y); + tile[ix + 4] = s.next.offset; +} + diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp new file mode 100644 index 0000000..d8b1eb9 --- /dev/null +++ b/piet-gpu/shader/tile_alloc.comp @@ -0,0 +1,73 @@ +// Allocation and initialization of tiles for paths. 
+ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "setup.h" + +#define TILE_ALLOC_WG 32 + +layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in; + +layout(set = 0, binding = 0) buffer AnnotatedBuf { + uint[] annotated; +}; + +layout(set = 0, binding = 1) buffer AllocBuf { + uint n_elements; + uint n_pathseg; + uint alloc; +}; + +layout(set = 0, binding = 2) buffer TileBuf { + uint[] tile; +}; + +#include "annotated.h" +#include "tile.h" + +// scale factors useful for converting coordinates to tiles +#define SX (1.0 / float(TILE_WIDTH_PX)) +#define SY (1.0 / float(TILE_HEIGHT_PX)) + +void main() { + uint element_ix = gl_GlobalInvocationID.x; + PathRef path_ref = PathRef(element_ix * Path_size); + AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + + uint tag = Annotated_Nop; + if (element_ix < n_elements) { + tag = Annotated_tag(ref); + } + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + switch (tag) { + case Annotated_Fill: + case Annotated_Stroke: + // Note: we take advantage of the fact that fills and strokes + // have compatible layout. 
+ AnnoFill fill = Annotated_Fill_read(ref); + x0 = int(floor(fill.bbox.x * SX)); + y0 = int(floor(fill.bbox.y * SY)); + x1 = int(ceil(fill.bbox.z * SX)); + y1 = int(ceil(fill.bbox.w * SY)); + break; + } + x0 = clamp(x0, 0, WIDTH_IN_TILES); + y0 = clamp(y0, 0, HEIGHT_IN_TILES); + x1 = clamp(x1, 0, WIDTH_IN_TILES); + y1 = clamp(y1, 0, HEIGHT_IN_TILES); + + Path path; + path.bbox = uvec4(x0, y0, x1, y1); + uint n_tiles = (x1 - x0) * (y1 - y0); + path.tiles = TileRef(0); + if (n_tiles > 0) { + path.tiles.offset = atomicAdd(alloc, n_tiles * Tile_size); + Tile init_tile = Tile(TileSegRef(0), 0); + // TODO: improve load balancing + for (uint i = 0; i < n_tiles; i++) { + Tile_write(Tile_index(path.tiles, i), init_tile); + } + } + Path_write(path_ref, path); +} diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv new file mode 100644 index 0000000..0835903 Binary files /dev/null and b/piet-gpu/shader/tile_alloc.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 3ec7e1d..19e9b43 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -129,12 +129,23 @@ pub struct Renderer { pub state_buf: D::Buffer, pub anno_buf: D::Buffer, + pub pathseg_buf: D::Buffer, + pub tile_buf: D::Buffer, pub bin_buf: D::Buffer, pub ptcl_buf: D::Buffer, el_pipeline: D::Pipeline, el_ds: D::DescriptorSet, + tile_pipeline: D::Pipeline, + tile_ds: D::DescriptorSet, + + path_pipeline: D::Pipeline, + path_ds: D::DescriptorSet, + + tile_alloc_buf_host: D::Buffer, + tile_alloc_buf_dev: D::Buffer, + bin_pipeline: D::Pipeline, bin_ds: D::DescriptorSet, @@ -151,10 +162,12 @@ pub struct Renderer { k4_ds: D::DescriptorSet, n_elements: usize, + n_paths: usize, + n_pathseg: usize, } impl Renderer { - pub unsafe fn new(device: &D, scene: &[u8]) -> Result { + pub unsafe fn new(device: &D, scene: &[u8], n_paths: usize, n_pathseg: usize) -> Result { let host = MemFlags::host_coherent(); let dev = MemFlags::device_local(); @@ -170,16 +183,44 @@ impl Renderer { 
device.write_buffer(&scene_buf, &scene)?; let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?; - let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?; + let anno_buf = device.create_buffer(64 * 1024 * 1024, host)?; + let pathseg_buf = device.create_buffer(64 * 1024 * 1024, host)?; + let tile_buf = device.create_buffer(64 * 1024 * 1024, host)?; let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?; let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; let el_code = include_bytes!("../shader/elements.spv"); - let el_pipeline = device.create_simple_compute_pipeline(el_code, 3, 0)?; + let el_pipeline = device.create_simple_compute_pipeline(el_code, 4, 0)?; let el_ds = device.create_descriptor_set( &el_pipeline, - &[&scene_dev, &state_buf, &anno_buf], + &[&scene_dev, &state_buf, &anno_buf, &pathseg_buf], + &[], + )?; + + let tile_alloc_buf_host = device.create_buffer(12, host)?; + let tile_alloc_buf_dev = device.create_buffer(12, dev)?; + + // TODO: constants + const PATH_SIZE: usize = 12; + let tile_alloc_start = ((n_paths + 31) & !31) * PATH_SIZE; + device.write_buffer( + &tile_alloc_buf_host, + &[n_paths as u32, n_pathseg as u32, tile_alloc_start as u32], + )?; + let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv"); + let tile_pipeline = device.create_simple_compute_pipeline(tile_alloc_code, 3, 0)?; + let tile_ds = device.create_descriptor_set( + &tile_pipeline, + &[&anno_buf, &tile_alloc_buf_dev, &tile_buf], + &[], + )?; + + let path_alloc_code = include_bytes!("../shader/path_coarse.spv"); + let path_pipeline = device.create_simple_compute_pipeline(path_alloc_code, 3, 0)?; + let path_ds = device.create_descriptor_set( + &path_pipeline, + &[&pathseg_buf, &tile_alloc_buf_dev, &tile_buf], &[], )?; @@ -226,6 +267,10 @@ impl Renderer { image_dev, el_pipeline, el_ds, + tile_pipeline, + tile_ds, + path_pipeline, + path_ds, bin_pipeline, bin_ds, coarse_pipeline, 
@@ -234,18 +279,25 @@ impl Renderer { k4_ds, state_buf, anno_buf, + pathseg_buf, + tile_buf, bin_buf, ptcl_buf, + tile_alloc_buf_host, + tile_alloc_buf_dev, bin_alloc_buf_host, bin_alloc_buf_dev, coarse_alloc_buf_host, coarse_alloc_buf_dev, n_elements, + n_paths, + n_pathseg, }) } pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf, query_pool: &D::QueryPool) { cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev); + cmd_buf.copy_buffer(&self.tile_alloc_buf_host, &self.tile_alloc_buf_dev); cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev); cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev); cmd_buf.clear_buffer(&self.state_buf); @@ -264,26 +316,44 @@ impl Renderer { ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &self.tile_pipeline, + &self.tile_ds, + (((self.n_paths + 31) / 32) as u32, 1, 1), + ); + cmd_buf.write_timestamp(&query_pool, 2); + cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &self.path_pipeline, + &self.path_ds, + (((self.n_pathseg + 31) / 32) as u32, 1, 1), + ); + /* cmd_buf.dispatch( &self.bin_pipeline, &self.bin_ds, (((self.n_elements + 255) / 256) as u32, 1, 1), ); - cmd_buf.write_timestamp(&query_pool, 2); + */ + cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); + /* cmd_buf.dispatch( &self.coarse_pipeline, &self.coarse_ds, (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1), ); - cmd_buf.write_timestamp(&query_pool, 3); + */ + cmd_buf.write_timestamp(&query_pool, 4); cmd_buf.memory_barrier(); + /* cmd_buf.dispatch( &self.k4_pipeline, &self.k4_ds, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); - cmd_buf.write_timestamp(&query_pool, 4); + cmd_buf.write_timestamp(&query_pool, 5); + */ cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index da234de..7908ff2 100644 --- a/piet-gpu/src/render_ctx.rs +++ 
b/piet-gpu/src/render_ctx.rs @@ -35,6 +35,10 @@ pub struct PietGpuRenderContext { // Will probably need direct accesss to hal Device to create images etc. inner_text: PietGpuText, stroke_width: f32, + // We're tallying these cpu-side for expedience, but will probably + // move this to some kind of readback from element processing. + path_count: usize, + pathseg_count: usize, } #[derive(Clone)] @@ -56,6 +60,8 @@ impl PietGpuRenderContext { elements, inner_text, stroke_width, + path_count: 0, + pathseg_count: 0, } } @@ -63,6 +69,14 @@ impl PietGpuRenderContext { self.elements.encode(&mut self.encoder); self.encoder.buf() } + + pub fn path_count(&self) -> usize { + self.path_count + } + + pub fn pathseg_count(&self) -> usize { + self.pathseg_count + } } impl RenderContext for PietGpuRenderContext { @@ -99,6 +113,7 @@ impl RenderContext for PietGpuRenderContext { PietGpuBrush::Solid(rgba_color) => { let stroke = Stroke { rgba_color }; self.elements.push(Element::Stroke(stroke)); + self.path_count += 1; } _ => (), } @@ -121,6 +136,7 @@ impl RenderContext for PietGpuRenderContext { PietGpuBrush::Solid(rgba_color) => { let fill = Fill { rgba_color }; self.elements.push(Element::Fill(fill)); + self.path_count += 1; } _ => (), } @@ -204,6 +220,7 @@ impl PietGpuRenderContext { } else { self.elements.push(Element::StrokeLine(seg)); } + self.pathseg_count += 1; } fn encode_path(&mut self, path: impl Iterator, is_fill: bool) {