diff --git a/Cargo.lock b/Cargo.lock index 4a0a133..2755d5d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,15 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "000444226fcff248f2bc4c7625be32c63caccfecc2723a2b9f78a7487a49c407" +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi 0.3.8", +] + [[package]] name = "approx" version = "0.3.2" @@ -59,6 +68,17 @@ dependencies = [ "raw-window-handle", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi 0.3.8", +] + [[package]] name = "autocfg" version = "1.0.0" @@ -106,6 +126,21 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +[[package]] +name = "clap" +version = "2.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + [[package]] name = "cloudabi" version = "0.0.3" @@ -259,6 +294,15 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f36b5f248235f45773d4944f555f83ea61fe07b18b561ccf99d7483d7381e54d" +[[package]] +name = "hermit-abi" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71" +dependencies = [ + "libc", +] + [[package]] name = "inflate" version = "0.4.5" @@ -525,6 +569,7 @@ dependencies = [ name = "piet-gpu" version = "0.1.0" dependencies = [ + "clap", "piet", "piet-gpu-hal", "piet-gpu-types", @@ -758,6 +803,12 @@ dependencies = [ "byteorder", ] +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + [[package]] name = "syn" version = "1.0.17" @@ -769,6 +820,21 @@ dependencies = [ "unicode-xid 0.2.0", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "unicode-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" + [[package]] name = "unicode-xid" version = "0.1.0" @@ -781,6 +847,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "void" version = "1.0.2" diff --git a/piet-gpu-types/src/annotated.rs b/piet-gpu-types/src/annotated.rs new file mode 100644 index 0000000..f7a6ad6 --- /dev/null +++ b/piet-gpu-types/src/annotated.rs @@ -0,0 +1,53 @@ +use 
piet_gpu_derive::piet_gpu; + +piet_gpu! { + #[gpu_write] + mod annotated { + struct AnnoFillLineSeg { + p0: [f32; 2], + p1: [f32; 2], + // A note: the layout of this struct is shared with + // AnnoStrokeLineSeg. In that case, we actually write + // [0.0, 0.0] as the stroke field, to minimize divergence. + } + struct AnnoStrokeLineSeg { + p0: [f32; 2], + p1: [f32; 2], + // halfwidth in both x and y for binning + stroke: [f32; 2], + } + struct AnnoQuadSeg { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + stroke: [f32; 2], + } + struct AnnoCubicSeg { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + p3: [f32; 2], + stroke: [f32; 2], + } + struct AnnoFill { + rgba_color: u32, + bbox: [f32; 4], + } + struct AnnoStroke { + rgba_color: u32, + bbox: [f32; 4], + // For the nonuniform scale case, this needs to be a 2x2 matrix. + // That's expected to be uncommon, so we could special-case it. + linewidth: f32, + } + enum Annotated { + Nop, + FillLine(AnnoFillLineSeg), + StrokeLine(AnnoStrokeLineSeg), + Quad(AnnoQuadSeg), + Cubic(AnnoCubicSeg), + Stroke(AnnoStroke), + Fill(AnnoFill), + } + } +} diff --git a/piet-gpu-types/src/bins.rs b/piet-gpu-types/src/bins.rs new file mode 100644 index 0000000..1ac2413 --- /dev/null +++ b/piet-gpu-types/src/bins.rs @@ -0,0 +1,22 @@ +use piet_gpu_derive::piet_gpu; + +// The output of the binning stage, organized as a linked list of chunks. + +piet_gpu! { + #[gpu_write] + mod bins { + struct BinInstance { + element_ix: u32, + // Right edge of the bounding box of the associated fill + // element; used in backdrop computation. + right_edge: f32, + } + + struct BinChunk { + // First chunk can have n = 0, subsequent ones not. + n: u32, + next: Ref, + // Instances follow + } + } +} diff --git a/piet-gpu-types/src/fill_seg.rs b/piet-gpu-types/src/fill_seg.rs deleted file mode 100644 index 2242a84..0000000 --- a/piet-gpu-types/src/fill_seg.rs +++ /dev/null @@ -1,37 +0,0 @@ -use piet_gpu_derive::piet_gpu; - -// Structures representing segments for fill items. - -// There is some cut'n'paste here from stroke segments, which can be -// traced to the fact that buffers in GLSL are basically global. -// Maybe there's a way to address that, but in the meantime living -// with the duplication is easiest. - -piet_gpu! { - #[gpu_write] - mod fill_seg { - struct FillTileHeader { - n: u32, - items: Ref, - } - - struct FillItemHeader { - backdrop: i32, - segments: Ref, - } - - // TODO: strongly consider using f16. If so, these would be - // relative to the tile. We're doing f32 for now to minimize - // divergence from piet-metal originals. - struct FillSegment { - start: [f32; 2], - end: [f32; 2], - } - - struct FillSegChunk { - n: u32, - next: Ref, - // Segments follow (could represent this as a variable sized array). - } - } -} diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs index d85df70..75a7731 100644 --- a/piet-gpu-types/src/lib.rs +++ b/piet-gpu-types/src/lib.rs @@ -1,7 +1,10 @@ +// Structures used only internally probably don't need to be pub. 
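// Editor's note: these modules are consumed by main.rs (below), which picks a
// module by name and prints the generated GLSL for redirection into
// piet-gpu/shader/*.h. A minimal sketch, assuming gen_gpu_annotated() returns
// the generated source as a String (as its use with print! suggests):
//
//     let glsl = piet_gpu_types::annotated::gen_gpu_annotated();
//     print!("{}", glsl); // e.g. redirect into piet-gpu/shader/annotated.h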
+ +pub mod annotated; +pub mod bins; pub mod encoder; -pub mod fill_seg; pub mod ptcl; pub mod scene; -pub mod segment; +pub mod state; pub mod test; pub mod tilegroup; diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs index c0b9d7e..9c40051 100644 --- a/piet-gpu-types/src/main.rs +++ b/piet-gpu-types/src/main.rs @@ -5,9 +5,10 @@ fn main() { .expect("provide a module name"); match mod_name.as_str() { "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), + "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()), + "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()), + "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()), "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), - "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()), - "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()), "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()), "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()), _ => println!("Oops, unknown module name"), diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs index 911f2c8..bdf342b 100644 --- a/piet-gpu-types/src/ptcl.rs +++ b/piet-gpu-types/src/ptcl.rs @@ -13,14 +13,13 @@ piet_gpu! { end: [f32; 2], } struct CmdStroke { - // Should be Ref if we had cross-module references. - seg_ref: u32, + // Consider a specialization to one segment. + seg_ref: Ref, half_width: f32, rgba_color: u32, } struct CmdFill { - // Should be Ref if we had cross-module references. - seg_ref: u32, + seg_ref: Ref, backdrop: i32, rgba_color: u32, } @@ -51,5 +50,24 @@ piet_gpu! { Jump(CmdJump), Bail, } + + // TODO: strongly consider using f16. If so, these would be + // relative to the tile. We're doing f32 for now to minimize + // divergence from piet-metal originals. + struct Segment { + start: [f32; 2], + end: [f32; 2], + + // This is used for fills only, but we're including it in + // the general structure for simplicity. + y_edge: f32, + } + + struct SegChunk { + n: u32, + next: Ref, + // Actually a reference to a variable-sized slice. + segs: Ref, + } } } diff --git a/piet-gpu-types/src/scene.rs b/piet-gpu-types/src/scene.rs index 5f95c40..5792c94 100644 --- a/piet-gpu-types/src/scene.rs +++ b/piet-gpu-types/src/scene.rs @@ -4,6 +4,8 @@ pub use self::scene::{ Bbox, PietCircle, PietFill, PietItem, PietStrokeLine, PietStrokePolyLine, Point, SimpleGroup, }; +pub use self::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform}; + piet_gpu! { #[rust_encode] mod scene { @@ -51,5 +53,53 @@ piet_gpu! { Fill(PietFill), Poly(PietStrokePolyLine), } + + // New approach follows (above to be deleted) + struct LineSeg { + p0: [f32; 2], + p1: [f32; 2], + } + struct QuadSeg { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + } + struct CubicSeg { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + p3: [f32; 2], + } + struct Fill { + rgba_color: u32, + } + struct Stroke { + rgba_color: u32, + } + struct SetLineWidth { + width: f32, + } + struct Transform { + mat: [f32; 4], + translate: [f32; 2], + } + enum Element { + Nop, + // Another approach to encoding would be to use a single + // variant but have a bool for fill/stroke. This could be + // packed into the tag, so the on-the-wire representation + // would be very similar to what's here. + StrokeLine(LineSeg), + FillLine(LineSeg), + + // Note: we'll need to handle the stroke/fill distinction + // for these as well, when we do flattening on the GPU. 
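        // Editor's note: a sketch of the packed-tag alternative described
        // above, with a hypothetical encoding (not what this commit does):
        //
        //     // low bit distinguishes fill from stroke, rest is the variant
        //     let tag: u32 = (variant_ix << 1) | (is_fill as u32);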
+ Quad(QuadSeg), + Cubic(CubicSeg), + Stroke(Stroke), + Fill(Fill), + SetLineWidth(SetLineWidth), + Transform(Transform), + } } } diff --git a/piet-gpu-types/src/segment.rs b/piet-gpu-types/src/segment.rs deleted file mode 100644 index 0b18ab8..0000000 --- a/piet-gpu-types/src/segment.rs +++ /dev/null @@ -1,32 +0,0 @@ -use piet_gpu_derive::piet_gpu; - -// Structures representing segments for stroke/fill items. - -piet_gpu! { - #[gpu_write] - mod segment { - struct TileHeader { - n: u32, - items: Ref, - } - - // Note: this is only suitable for strokes, fills require backdrop. - struct ItemHeader { - segments: Ref, - } - - // TODO: strongly consider using f16. If so, these would be - // relative to the tile. We're doing f32 for now to minimize - // divergence from piet-metal originals. - struct Segment { - start: [f32; 2], - end: [f32; 2], - } - - struct SegChunk { - n: u32, - next: Ref, - // Segments follow (could represent this as a variable sized array). - } - } -} diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs new file mode 100644 index 0000000..35076f0 --- /dev/null +++ b/piet-gpu-types/src/state.rs @@ -0,0 +1,14 @@ +use piet_gpu_derive::piet_gpu; + +piet_gpu! { + #[gpu_write] + mod state { + struct State { + mat: [f32; 4], + translate: [f32; 2], + bbox: [f32; 4], + linewidth: f32, + flags: u32, + } + } +} diff --git a/piet-gpu/Cargo.toml b/piet-gpu/Cargo.toml index 6bdf178..a338d76 100644 --- a/piet-gpu/Cargo.toml +++ b/piet-gpu/Cargo.toml @@ -26,3 +26,4 @@ png = "0.16.2" rand = "0.7.3" roxmltree = "0.11" winit = "0.22" +clap = "2.33" diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 839c262..31024aa 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -2,10 +2,12 @@ use std::fs::File; use std::io::BufWriter; use std::path::Path; +use clap::{Arg, App}; + use piet_gpu_hal::vulkan::VkInstance; use piet_gpu_hal::{CmdBuf, Device, Error, MemFlags}; -use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT}; +use piet_gpu::{render_scene, render_svg, PietGpuRenderContext, Renderer, HEIGHT, WIDTH}; #[allow(unused)] fn dump_scene(buf: &[u8]) { @@ -16,22 +18,179 @@ fn dump_scene(buf: &[u8]) { } } +#[allow(unused)] +fn dump_state(buf: &[u8]) { + for i in 0..(buf.len() / 48) { + let j = i * 48; + let floats = (0..11).map(|k| { + let mut buf_f32 = [0u8; 4]; + buf_f32.copy_from_slice(&buf[j + k * 4..j + k * 4 + 4]); + f32::from_le_bytes(buf_f32) + }).collect::>(); + println!("{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}", + i, + floats[0], floats[1], floats[2], floats[3], floats[4], floats[5], + floats[6], floats[7], floats[8], floats[9], + floats[10], buf[j + 44]); + } + +} + +/// Interpret the output of the binning stage, for diagnostic purposes. 
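// Each bin owns 16 chunk lists (one per producing slot), the i-th starting at
// byte offset (bin * 16 + i) * 64. Within a chunk, word 0 is the element
// count, word 1 the byte offset of the next chunk (0 terminates), and payload
// words follow from word 2; the loop below merges the lists in ascending
// order of each chunk's first payload word. A minimal traversal sketch:
//
//     let mut st = start;                 // byte offset of first chunk
//     loop {
//         let n = buf[st / 4] as usize;
//         for j in 0..n { let _w = buf[st / 4 + 2 + j]; }
//         st = buf[st / 4 + 1] as usize;  // follow the next link
//         if st == 0 { break; }
//     }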
+#[allow(unused)] +fn trace_merge(buf: &[u32]) { + for bin in 0..256 { + println!("bin {}:", bin); + let mut starts = (0..16).map(|i| Some((bin * 16 + i) * 64)).collect::>>(); + loop { + let min_start = starts.iter().map(|st| + st.map(|st| + if buf[st / 4] == 0 { + !0 + } else { + buf[st / 4 + 2] + }).unwrap_or(!0)).min().unwrap(); + if min_start == !0 { + break; + } + let mut selected = !0; + for i in 0..16 { + if let Some(st) = starts[i] { + if buf[st/4] != 0 && buf[st/4 + 2] == min_start { + selected = i; + break; + } + } + } + let st = starts[selected].unwrap(); + println!("selected {}, start {:x}", selected, st); + for j in 0..buf[st/4] { + println!("{:x}", buf[st/4 + 2 + j as usize]) + } + if buf[st/4 + 1] == 0 { + starts[selected] = None; + } else { + starts[selected] = Some(buf[st/4 + 1] as usize); + } + } + + } +} + +/// Interpret the output of the coarse raster stage, for diagnostic purposes. +#[allow(unused)] +fn trace_ptcl(buf: &[u32]) { + for y in 0..96 { + for x in 0..128 { + let tile_ix = y * 128 + x; + println!("tile {} @({}, {})", tile_ix, x, y); + let mut tile_offset = tile_ix * 1024; + loop { + let tag = buf[tile_offset / 4]; + match tag { + 0 => break, + 3 => { + let backdrop = buf[tile_offset / 4 + 2]; + let rgba_color = buf[tile_offset / 4 + 3]; + println!(" {:x}: fill {:x} {}", tile_offset, rgba_color, backdrop); + let mut seg_chunk = buf[tile_offset / 4 + 1] as usize; + let n = buf[seg_chunk / 4] as usize; + let segs = buf[seg_chunk / 4 + 2] as usize; + println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs); + for i in 0..n { + let x0 = f32::from_bits(buf[segs / 4 + i * 5]); + let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]); + let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]); + let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]); + let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]); + println!(" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge); + } + loop { + seg_chunk = buf[seg_chunk / 4 + 1] as usize; + if seg_chunk == 0 { + break; + } + } + } + 4 => { + let line_width = f32::from_bits(buf[tile_offset / 4 + 2]); + let rgba_color = buf[tile_offset / 4 + 3]; + println!(" {:x}: stroke {:x} {}", tile_offset, rgba_color, line_width); + let mut seg_chunk = buf[tile_offset / 4 + 1] as usize; + let n = buf[seg_chunk / 4] as usize; + let segs = buf[seg_chunk / 4 + 2] as usize; + println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs); + for i in 0..n { + let x0 = f32::from_bits(buf[segs / 4 + i * 5]); + let y0 = f32::from_bits(buf[segs / 4 + i * 5 + 1]); + let x1 = f32::from_bits(buf[segs / 4 + i * 5 + 2]); + let y1 = f32::from_bits(buf[segs / 4 + i * 5 + 3]); + let y_edge = f32::from_bits(buf[segs / 4 + i * 5 + 4]); + println!(" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge); + } + loop { + seg_chunk = buf[seg_chunk / 4 + 1] as usize; + if seg_chunk == 0 { + break; + } + } + } + _ => { + println!("{:x}: {}", tile_offset, tag); + } + } + if tag == 0 { + break; + } + if tag == 8 { + tile_offset = buf[tile_offset / 4 + 1] as usize; + } else { + tile_offset += 20; + } + } + } + } +} + + fn main() -> Result<(), Error> { + let matches = App::new("piet-gpu test") + .arg(Arg::with_name("INPUT") + .index(1)) + .arg(Arg::with_name("flip") + .short("f") + .long("flip")) + .arg(Arg::with_name("scale") + .short("s") + .long("scale") + .takes_value(true)) + .get_matches(); let (instance, _) = VkInstance::new(None)?; unsafe { let device = instance.device(None)?; let fence = device.create_fence(false)?; let mut cmd_buf = 
device.create_cmd_buf()?; - let query_pool = device.create_query_pool(6)?; + let query_pool = device.create_query_pool(5)?; let mut ctx = PietGpuRenderContext::new(); - render_scene(&mut ctx); + if let Some(input) = matches.value_of("INPUT") { + let mut scale = matches.value_of("scale") + .map(|scale| scale.parse().unwrap()) + .unwrap_or(8.0); + if matches.is_present("flip") { + scale = -scale; + } + render_svg(&mut ctx, input, scale); + } else { + render_scene(&mut ctx); + } let scene = ctx.get_scene_buf(); //dump_scene(&scene); let renderer = Renderer::new(&device, scene)?; - let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?; + let image_buf = + device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?; cmd_buf.begin(); renderer.record(&mut cmd_buf, &query_pool); @@ -39,29 +198,17 @@ fn main() -> Result<(), Error> { cmd_buf.finish(); device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?; device.wait_and_reset(&[fence])?; - let timestamps = device.reap_query_pool(&query_pool).unwrap(); - println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); - println!( - "Kernel 2s time: {:.3}ms", - (timestamps[1] - timestamps[0]) * 1e3 - ); - println!( - "Kernel 2f time: {:.3}ms", - (timestamps[2] - timestamps[1]) * 1e3 - ); - println!( - "Kernel 3 time: {:.3}ms", - (timestamps[3] - timestamps[2]) * 1e3 - ); - println!( - "Render time: {:.3}ms", - (timestamps[4] - timestamps[3]) * 1e3 - ); + let ts = device.reap_query_pool(&query_pool).unwrap(); + println!("Element kernel time: {:.3}ms", ts[0] * 1e3); + println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); + println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); + println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); /* - let mut k1_data: Vec = Default::default(); - device.read_buffer(&segment_buf, &mut k1_data).unwrap(); - dump_k1_data(&k1_data); + let mut data: Vec = Default::default(); + device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap(); + piet_gpu::dump_k1_data(&data); + //trace_ptcl(&data); */ let mut img_data: Vec = Default::default(); diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs index e5f174a..fd30fa3 100644 --- a/piet-gpu/bin/winit.rs +++ b/piet-gpu/bin/winit.rs @@ -1,7 +1,7 @@ use piet_gpu_hal::vulkan::VkInstance; use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout}; -use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT}; +use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH}; use winit::{ event::{Event, WindowEvent}, @@ -37,7 +37,7 @@ fn main() -> Result<(), Error> { .map(|_| device.create_cmd_buf()) .collect::, Error>>()?; let query_pools = (0..NUM_FRAMES) - .map(|_| device.create_query_pool(6)) + .map(|_| device.create_query_pool(5)) .collect::, Error>>()?; let mut ctx = PietGpuRenderContext::new(); @@ -69,12 +69,12 @@ fn main() -> Result<(), Error> { device.wait_and_reset(&[frame_fences[frame_idx]]).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap(); - window.set_title(&format!("k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms", + window.set_title(&format!( + "e: {:.3}ms, b: {:.3}ms, c: {:.3}ms, f: {:.3}ms", timestamps[0] * 1e3, (timestamps[1] - timestamps[0]) * 1e3, (timestamps[2] - timestamps[1]) * 1e3, (timestamps[3] - timestamps[2]) * 1e3, - (timestamps[4] - timestamps[3]) * 1e3, )); } @@ -93,11 +93,7 @@ fn main() -> Result<(), Error> { ImageLayout::BlitDst, ); cmd_buf.blit_image(&renderer.image_dev, &swap_image); - cmd_buf.image_barrier( 
- &swap_image, - ImageLayout::BlitDst, - ImageLayout::Present, - ); + cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present); cmd_buf.finish(); device diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h new file mode 100644 index 0000000..9812264 --- /dev/null +++ b/piet-gpu/shader/annotated.h @@ -0,0 +1,335 @@ +// Code auto-generated by piet-gpu-derive + +struct AnnoFillLineSegRef { + uint offset; +}; + +struct AnnoStrokeLineSegRef { + uint offset; +}; + +struct AnnoQuadSegRef { + uint offset; +}; + +struct AnnoCubicSegRef { + uint offset; +}; + +struct AnnoFillRef { + uint offset; +}; + +struct AnnoStrokeRef { + uint offset; +}; + +struct AnnotatedRef { + uint offset; +}; + +struct AnnoFillLineSeg { + vec2 p0; + vec2 p1; +}; + +#define AnnoFillLineSeg_size 16 + +AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) { + return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size); +} + +struct AnnoStrokeLineSeg { + vec2 p0; + vec2 p1; + vec2 stroke; +}; + +#define AnnoStrokeLineSeg_size 24 + +AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) { + return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size); +} + +struct AnnoQuadSeg { + vec2 p0; + vec2 p1; + vec2 p2; + vec2 stroke; +}; + +#define AnnoQuadSeg_size 32 + +AnnoQuadSegRef AnnoQuadSeg_index(AnnoQuadSegRef ref, uint index) { + return AnnoQuadSegRef(ref.offset + index * AnnoQuadSeg_size); +} + +struct AnnoCubicSeg { + vec2 p0; + vec2 p1; + vec2 p2; + vec2 p3; + vec2 stroke; +}; + +#define AnnoCubicSeg_size 40 + +AnnoCubicSegRef AnnoCubicSeg_index(AnnoCubicSegRef ref, uint index) { + return AnnoCubicSegRef(ref.offset + index * AnnoCubicSeg_size); +} + +struct AnnoFill { + uint rgba_color; + vec4 bbox; +}; + +#define AnnoFill_size 20 + +AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) { + return AnnoFillRef(ref.offset + index * AnnoFill_size); +} + +struct AnnoStroke { + uint rgba_color; + vec4 bbox; + float linewidth; +}; + +#define AnnoStroke_size 24 + +AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) { + return AnnoStrokeRef(ref.offset + index * AnnoStroke_size); +} + +#define Annotated_Nop 0 +#define Annotated_FillLine 1 +#define Annotated_StrokeLine 2 +#define Annotated_Quad 3 +#define Annotated_Cubic 4 +#define Annotated_Stroke 5 +#define Annotated_Fill 6 +#define Annotated_size 44 + +AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) { + return AnnotatedRef(ref.offset + index * Annotated_size); +} + +AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = annotated[ix + 0]; + uint raw1 = annotated[ix + 1]; + uint raw2 = annotated[ix + 2]; + uint raw3 = annotated[ix + 3]; + AnnoFillLineSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) { + uint ix = ref.offset >> 2; + annotated[ix + 0] = floatBitsToUint(s.p0.x); + annotated[ix + 1] = floatBitsToUint(s.p0.y); + annotated[ix + 2] = floatBitsToUint(s.p1.x); + annotated[ix + 3] = floatBitsToUint(s.p1.y); +} + +AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = annotated[ix + 0]; + uint raw1 = annotated[ix + 1]; + uint raw2 = annotated[ix + 2]; + uint raw3 = annotated[ix + 3]; + uint raw4 = annotated[ix + 4]; + uint raw5 = annotated[ix + 5]; + AnnoStrokeLineSeg s; + 
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.stroke = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + return s; +} + +void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) { + uint ix = ref.offset >> 2; + annotated[ix + 0] = floatBitsToUint(s.p0.x); + annotated[ix + 1] = floatBitsToUint(s.p0.y); + annotated[ix + 2] = floatBitsToUint(s.p1.x); + annotated[ix + 3] = floatBitsToUint(s.p1.y); + annotated[ix + 4] = floatBitsToUint(s.stroke.x); + annotated[ix + 5] = floatBitsToUint(s.stroke.y); +} + +AnnoQuadSeg AnnoQuadSeg_read(AnnoQuadSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = annotated[ix + 0]; + uint raw1 = annotated[ix + 1]; + uint raw2 = annotated[ix + 2]; + uint raw3 = annotated[ix + 3]; + uint raw4 = annotated[ix + 4]; + uint raw5 = annotated[ix + 5]; + uint raw6 = annotated[ix + 6]; + uint raw7 = annotated[ix + 7]; + AnnoQuadSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.stroke = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + return s; +} + +void AnnoQuadSeg_write(AnnoQuadSegRef ref, AnnoQuadSeg s) { + uint ix = ref.offset >> 2; + annotated[ix + 0] = floatBitsToUint(s.p0.x); + annotated[ix + 1] = floatBitsToUint(s.p0.y); + annotated[ix + 2] = floatBitsToUint(s.p1.x); + annotated[ix + 3] = floatBitsToUint(s.p1.y); + annotated[ix + 4] = floatBitsToUint(s.p2.x); + annotated[ix + 5] = floatBitsToUint(s.p2.y); + annotated[ix + 6] = floatBitsToUint(s.stroke.x); + annotated[ix + 7] = floatBitsToUint(s.stroke.y); +} + +AnnoCubicSeg AnnoCubicSeg_read(AnnoCubicSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = annotated[ix + 0]; + uint raw1 = annotated[ix + 1]; + uint raw2 = annotated[ix + 2]; + uint raw3 = annotated[ix + 3]; + uint raw4 = annotated[ix + 4]; + uint raw5 = annotated[ix + 5]; + uint raw6 = annotated[ix + 6]; + uint raw7 = annotated[ix + 7]; + uint raw8 = annotated[ix + 8]; + uint raw9 = annotated[ix + 9]; + AnnoCubicSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + s.stroke = vec2(uintBitsToFloat(raw8), uintBitsToFloat(raw9)); + return s; +} + +void AnnoCubicSeg_write(AnnoCubicSegRef ref, AnnoCubicSeg s) { + uint ix = ref.offset >> 2; + annotated[ix + 0] = floatBitsToUint(s.p0.x); + annotated[ix + 1] = floatBitsToUint(s.p0.y); + annotated[ix + 2] = floatBitsToUint(s.p1.x); + annotated[ix + 3] = floatBitsToUint(s.p1.y); + annotated[ix + 4] = floatBitsToUint(s.p2.x); + annotated[ix + 5] = floatBitsToUint(s.p2.y); + annotated[ix + 6] = floatBitsToUint(s.p3.x); + annotated[ix + 7] = floatBitsToUint(s.p3.y); + annotated[ix + 8] = floatBitsToUint(s.stroke.x); + annotated[ix + 9] = floatBitsToUint(s.stroke.y); +} + +AnnoFill AnnoFill_read(AnnoFillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = annotated[ix + 0]; + uint raw1 = annotated[ix + 1]; + uint raw2 = annotated[ix + 2]; + uint raw3 = annotated[ix + 3]; + uint raw4 = annotated[ix + 4]; + AnnoFill s; + s.rgba_color = raw0; + s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4)); + return s; +} + +void AnnoFill_write(AnnoFillRef ref, AnnoFill s) { + uint ix = ref.offset >> 2; + annotated[ix + 0] = 
s.rgba_color; + annotated[ix + 1] = floatBitsToUint(s.bbox.x); + annotated[ix + 2] = floatBitsToUint(s.bbox.y); + annotated[ix + 3] = floatBitsToUint(s.bbox.z); + annotated[ix + 4] = floatBitsToUint(s.bbox.w); +} + +AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = annotated[ix + 0]; + uint raw1 = annotated[ix + 1]; + uint raw2 = annotated[ix + 2]; + uint raw3 = annotated[ix + 3]; + uint raw4 = annotated[ix + 4]; + uint raw5 = annotated[ix + 5]; + AnnoStroke s; + s.rgba_color = raw0; + s.bbox = vec4(uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4)); + s.linewidth = uintBitsToFloat(raw5); + return s; +} + +void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) { + uint ix = ref.offset >> 2; + annotated[ix + 0] = s.rgba_color; + annotated[ix + 1] = floatBitsToUint(s.bbox.x); + annotated[ix + 2] = floatBitsToUint(s.bbox.y); + annotated[ix + 3] = floatBitsToUint(s.bbox.z); + annotated[ix + 4] = floatBitsToUint(s.bbox.w); + annotated[ix + 5] = floatBitsToUint(s.linewidth); +} + +uint Annotated_tag(AnnotatedRef ref) { + return annotated[ref.offset >> 2]; +} + +AnnoFillLineSeg Annotated_FillLine_read(AnnotatedRef ref) { + return AnnoFillLineSeg_read(AnnoFillLineSegRef(ref.offset + 4)); +} + +AnnoStrokeLineSeg Annotated_StrokeLine_read(AnnotatedRef ref) { + return AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef(ref.offset + 4)); +} + +AnnoQuadSeg Annotated_Quad_read(AnnotatedRef ref) { + return AnnoQuadSeg_read(AnnoQuadSegRef(ref.offset + 4)); +} + +AnnoCubicSeg Annotated_Cubic_read(AnnotatedRef ref) { + return AnnoCubicSeg_read(AnnoCubicSegRef(ref.offset + 4)); +} + +AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) { + return AnnoStroke_read(AnnoStrokeRef(ref.offset + 4)); +} + +AnnoFill Annotated_Fill_read(AnnotatedRef ref) { + return AnnoFill_read(AnnoFillRef(ref.offset + 4)); +} + +void Annotated_Nop_write(AnnotatedRef ref) { + annotated[ref.offset >> 2] = Annotated_Nop; +} + +void Annotated_FillLine_write(AnnotatedRef ref, AnnoFillLineSeg s) { + annotated[ref.offset >> 2] = Annotated_FillLine; + AnnoFillLineSeg_write(AnnoFillLineSegRef(ref.offset + 4), s); +} + +void Annotated_StrokeLine_write(AnnotatedRef ref, AnnoStrokeLineSeg s) { + annotated[ref.offset >> 2] = Annotated_StrokeLine; + AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(ref.offset + 4), s); +} + +void Annotated_Quad_write(AnnotatedRef ref, AnnoQuadSeg s) { + annotated[ref.offset >> 2] = Annotated_Quad; + AnnoQuadSeg_write(AnnoQuadSegRef(ref.offset + 4), s); +} + +void Annotated_Cubic_write(AnnotatedRef ref, AnnoCubicSeg s) { + annotated[ref.offset >> 2] = Annotated_Cubic; + AnnoCubicSeg_write(AnnoCubicSegRef(ref.offset + 4), s); +} + +void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) { + annotated[ref.offset >> 2] = Annotated_Stroke; + AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s); +} + +void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) { + annotated[ref.offset >> 2] = Annotated_Fill; + AnnoFill_write(AnnoFillRef(ref.offset + 4), s); +} + diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp new file mode 100644 index 0000000..d35c2d9 --- /dev/null +++ b/piet-gpu/shader/binning.comp @@ -0,0 +1,193 @@ +// The binning stage of the pipeline. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "setup.h" + +layout(local_size_x = N_TILE, local_size_y = 1) in; + +layout(set = 0, binding = 0) buffer AnnotatedBuf { + uint[] annotated; +}; + +// This is for scanning forward for right_edge data. 
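// Editor's note: this is the same buffer elements.comp writes. Each partition
// record is StateBuf_stride = 8 + 2 * State_size bytes: a flag word, a
// right-edge word, then the aggregate and prefix States. The word at
// state_right_edge_index(i) holds the right edge of the first fill bbox in
// partition i (+inf if the partition has no fill), e.g.:
//
//     float re = uintBitsToFloat(state[state_right_edge_index(i)]);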
+layout(set = 0, binding = 1) buffer StateBuf { + uint[] state; +}; + +layout(set = 0, binding = 2) buffer AllocBuf { + uint n_elements; + // Will be incremented atomically to claim tiles + uint tile_ix; + uint alloc; +}; + +layout(set = 0, binding = 3) buffer BinsBuf { + uint[] bins; +}; + +#include "annotated.h" +#include "state.h" +#include "bins.h" + +// scale factors useful for converting coordinates to bins +#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) +#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX)) + +#define TSY (1.0 / float(TILE_HEIGHT_PX)) + +// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000) +#define INFINITY (1.0 / 0.0) + +// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. +shared uint bitmaps[N_SLICE][N_TILE]; +shared uint count[N_SLICE][N_TILE]; +shared uint sh_chunk_start[N_TILE]; + +shared float sh_right_edge[N_TILE]; + +#define StateBuf_stride (8 + 2 * State_size) + +uint state_right_edge_index(uint partition_ix) { + return 2 + partition_ix * (StateBuf_stride / 4); +} + +void main() { + uint chunk_n = 0; + uint my_n_elements = n_elements; + uint my_partition = gl_WorkGroupID.x; + + for (uint i = 0; i < N_SLICE; i++) { + bitmaps[i][gl_LocalInvocationID.x] = 0; + } + barrier(); + + // Read inputs and determine coverage of bins + uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + uint tag = Annotated_Nop; + if (element_ix < my_n_elements) { + tag = Annotated_tag(ref); + } + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + float my_right_edge = INFINITY; + bool crosses_edge = false; + switch (tag) { + case Annotated_FillLine: + case Annotated_StrokeLine: + AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); + x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX)); + y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY)); + x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX)); + y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY)); + crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY); + break; + case Annotated_Fill: + case Annotated_Stroke: + // Note: we take advantage of the fact that fills and strokes + // have compatible layout. + AnnoFill fill = Annotated_Fill_read(ref); + x0 = int(floor(fill.bbox.x * SX)); + y0 = int(floor(fill.bbox.y * SY)); + x1 = int(ceil(fill.bbox.z * SX)); + y1 = int(ceil(fill.bbox.w * SY)); + // It probably makes more sense to track x1, to avoid having to redo + // the rounding to tile coords. + my_right_edge = fill.bbox.z; + break; + } + + // If the last element in this partition is a fill edge, then we need to do a + // look-forward to find the right edge of its corresponding fill. That data is + // recorded in aggregates computed in the element processing pass. + if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) { + uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO; + // This is sequential but the expectation is that the amount of + // look-forward is small (performance may degrade in the case + // of massively complex paths). + do { + my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]); + aggregate_ix++; + } while (isinf(my_right_edge)); + } + + // Now propagate right_edge backward, from fill to segment. 
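    // Editor's note: the loop below is a Hillis-Steele style backward scan:
    // in round i, a thread still holding +inf pulls the value (1 << i) slots
    // to its right, so after LG_N_TILE rounds each thread holds the first
    // finite right_edge at or after its own position -- i.e. each fill
    // segment inherits the right edge of its enclosing fill element.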
+ for (uint i = 0; i < LG_N_TILE; i++) { + // Note: we could try to cut down on write bandwidth here if the value hasn't + // changed, but not sure it's worth the complexity to track. + sh_right_edge[gl_LocalInvocationID.x] = my_right_edge; + barrier(); + if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) { + my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)]; + } + barrier(); + } + if (crosses_edge) { + x1 = int(ceil(my_right_edge * SX)); + } + + // At this point, we run an iterator over the coverage area, + // trying to keep divergence low. + // Right now, it's just a bbox, but we'll get finer with + // segments. + x0 = clamp(x0, 0, N_TILE_X); + x1 = clamp(x1, x0, N_TILE_X); + y0 = clamp(y0, 0, N_TILE_Y); + y1 = clamp(y1, y0, N_TILE_Y); + if (x0 == x1) y1 = y0; + int x = x0, y = y0; + uint my_slice = gl_LocalInvocationID.x / 32; + uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + while (y < y1) { + atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask); + x++; + if (x == x1) { + x = x0; + y++; + } + } + + barrier(); + // Allocate output segments. + uint element_count = 0; + for (uint i = 0; i < N_SLICE; i++) { + element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); + count[i][gl_LocalInvocationID.x] = element_count; + } + // element_count is number of elements covering bin for this invocation. + uint chunk_start = 0; + if (element_count != 0) { + // TODO: aggregate atomic adds (subgroup is probably fastest) + chunk_start = atomicAdd(alloc, element_count * BinInstance_size); + sh_chunk_start[gl_LocalInvocationID.x] = chunk_start; + } + // Note: it might be more efficient for reading to do this in the + // other order (each bin is a contiguous sequence of partitions) + uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; + bins[out_ix] = element_count; + bins[out_ix + 1] = chunk_start; + + barrier(); + // Use similar strategy as Laine & Karras paper; loop over bbox of bins + // touched by this element + x = x0; + y = y0; + while (y < y1) { + uint bin_ix = y * N_TILE_X + x; + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0) { + uint idx = bitCount(out_mask & (my_mask - 1)); + if (my_slice > 0) { + idx += count[my_slice - 1][bin_ix]; + } + uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size; + BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge)); + } + x++; + if (x == x1) { + x = x0; + y++; + } + } +} diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv new file mode 100644 index 0000000..6ea0877 Binary files /dev/null and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h new file mode 100644 index 0000000..85f7536 --- /dev/null +++ b/piet-gpu/shader/bins.h @@ -0,0 +1,64 @@ +// Code auto-generated by piet-gpu-derive + +struct BinInstanceRef { + uint offset; +}; + +struct BinChunkRef { + uint offset; +}; + +struct BinInstance { + uint element_ix; + float right_edge; +}; + +#define BinInstance_size 8 + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { + return BinInstanceRef(ref.offset + index * BinInstance_size); +} + +struct BinChunk { + uint n; + BinChunkRef next; +}; + +#define BinChunk_size 8 + +BinChunkRef BinChunk_index(BinChunkRef ref, uint index) { + return BinChunkRef(ref.offset + index * BinChunk_size); +} + +BinInstance BinInstance_read(BinInstanceRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = bins[ix + 0]; + uint raw1 = bins[ix + 1]; + BinInstance s; + 
s.element_ix = raw0; + s.right_edge = uintBitsToFloat(raw1); + return s; +} + +void BinInstance_write(BinInstanceRef ref, BinInstance s) { + uint ix = ref.offset >> 2; + bins[ix + 0] = s.element_ix; + bins[ix + 1] = floatBitsToUint(s.right_edge); +} + +BinChunk BinChunk_read(BinChunkRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = bins[ix + 0]; + uint raw1 = bins[ix + 1]; + BinChunk s; + s.n = raw0; + s.next = BinChunkRef(raw1); + return s; +} + +void BinChunk_write(BinChunkRef ref, BinChunk s) { + uint ix = ref.offset >> 2; + bins[ix + 0] = s.n; + bins[ix + 1] = s.next.offset; +} + diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 0aaecae..14c72aa 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -9,12 +9,11 @@ rule glsl build image.spv: glsl image.comp | scene.h -build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h -build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h +build elements.spv: glsl elements.comp | scene.h state.h annotated.h -build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h +build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h -build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h +build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h -build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h +build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp new file mode 100644 index 0000000..3656f77 --- /dev/null +++ b/piet-gpu/shader/coarse.comp @@ -0,0 +1,526 @@ +// The coarse rasterizer stage of the pipeline. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "setup.h" + +layout(local_size_x = N_TILE, local_size_y = 1) in; + +layout(set = 0, binding = 0) buffer AnnotatedBuf { + uint[] annotated; +}; + +layout(set = 0, binding = 1) buffer BinsBuf { + uint[] bins; +}; + +layout(set = 0, binding = 2) buffer AllocBuf { + uint n_elements; + uint alloc; +}; + +layout(set = 0, binding = 3) buffer PtclBuf { + uint[] ptcl; +}; + +#include "annotated.h" +#include "bins.h" +#include "ptcl.h" + +#define LG_N_PART_READ 8 +#define N_PART_READ (1 << LG_N_PART_READ) + +shared uint sh_elements[N_TILE]; +shared float sh_right_edge[N_TILE]; + +// Number of elements in the partition; prefix sum. +shared uint sh_part_count[N_PART_READ]; +shared uint sh_part_elements[N_PART_READ]; + +shared uint sh_bitmaps[N_SLICE][N_TILE]; +shared uint sh_backdrop[N_SLICE][N_TILE]; +shared uint sh_bd_sign[N_SLICE]; +shared uint sh_is_segment[N_SLICE]; + +// Shared state for parallel segment output stage + +// Count of total number of segments in each tile, then +// inclusive prefix sum of same. +shared uint sh_seg_count[N_TILE]; +shared uint sh_seg_alloc; + +// scale factors useful for converting coordinates to tiles +#define SX (1.0 / float(TILE_WIDTH_PX)) +#define SY (1.0 / float(TILE_HEIGHT_PX)) + +// Perhaps cmd_limit should be a global? This is a style question. 
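// Editor's note: each tile's command list starts as one PTCL_INITIAL_ALLOC
// slice; cmd_limit reserves 2 * Cmd_size at the end so there is always room
// for a CmdJump (or the final Cmd_End). When the cursor passes the limit,
// alloc_cmd atomically grabs a fresh slice and links it with a jump, which
// consumers (kernel4, and trace_ptcl in cli.rs) follow until Cmd_End.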
+void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) { + if (cmd_ref.offset > cmd_limit) { + uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC); + CmdJump jump = CmdJump(new_cmd); + Cmd_Jump_write(cmd_ref, jump); + cmd_ref = CmdRef(new_cmd); + cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + } +} + +#define CHUNK_ALLOC_SLAB 16 + +uint alloc_chunk_remaining; +uint alloc_chunk_offset; + +SegChunkRef alloc_seg_chunk() { + if (alloc_chunk_remaining == 0) { + alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size); + alloc_chunk_remaining = CHUNK_ALLOC_SLAB; + } + uint offset = alloc_chunk_offset; + alloc_chunk_offset += SegChunk_size; + alloc_chunk_remaining--; + return SegChunkRef(offset); +} + +// Accumulate delta to backdrop. +// +// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each +// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1. +int count_backdrop(uint bd_bitmap, uint bd_sign) { + return bitCount(bd_bitmap & bd_sign) - bitCount(bd_bitmap & ~bd_sign); +} + +void main() { + // Could use either linear or 2d layouts for both dispatch and + // invocations within the workgroup. We'll use variables to abstract. + uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x; + uint partition_ix = 0; + uint n_partitions = (n_elements + N_TILE - 1) / N_TILE; + // Top left coordinates of this bin. + vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y); + uint th_ix = gl_LocalInvocationID.x; + + uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X; + uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X; + uint this_tile_ix = tile_y * WIDTH_IN_TILES + tile_x; + CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC); + uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + + // Allocation and management of segment output + SegChunkRef first_seg_chunk = SegChunkRef(0); + SegChunkRef last_chunk_ref = SegChunkRef(0); + uint last_chunk_n = 0; + SegmentRef last_chunk_segs = SegmentRef(0); + alloc_chunk_remaining = 0; + + // I'm sure we can figure out how to do this with at least one fewer register... 
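    // Editor's note: invariant for the merge loop below:
    // rd_ix <= wr_ix <= ready_ix, with at most N_TILE elements staged in
    // sh_elements at once; sh_part_count is an inclusive prefix sum of
    // per-partition counts, so a binary search over it maps a global element
    // index back to its source partition.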
+ // Items up to rd_ix have been read from sh_elements + uint rd_ix = 0; + // Items up to wr_ix have been written into sh_elements + uint wr_ix = 0; + // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements + uint part_start_ix = 0; + uint ready_ix = 0; + if (th_ix < N_SLICE) { + sh_bd_sign[th_ix] = 0; + } + int backdrop = 0; + while (true) { + for (uint i = 0; i < N_SLICE; i++) { + sh_bitmaps[i][th_ix] = 0; + sh_backdrop[i][th_ix] = 0; + } + if (th_ix < N_SLICE) { + sh_is_segment[th_ix] = 0; + } + + // parallel read of input partitions + do { + if (ready_ix == wr_ix && partition_ix < n_partitions) { + part_start_ix = ready_ix; + uint count = 0; + if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) { + uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; + count = bins[in_ix]; + sh_part_elements[th_ix] = bins[in_ix + 1]; + } + // prefix sum of counts + for (uint i = 0; i < LG_N_PART_READ; i++) { + if (th_ix < N_PART_READ) { + sh_part_count[th_ix] = count; + } + barrier(); + if (th_ix < N_PART_READ) { + if (th_ix >= (1 << i)) { + count += sh_part_count[th_ix - (1 << i)]; + } + } + barrier(); + } + if (th_ix < N_PART_READ) { + sh_part_count[th_ix] = part_start_ix + count; + } + barrier(); + ready_ix = sh_part_count[N_PART_READ - 1]; + partition_ix += N_PART_READ; + } + // use binary search to find element to read + uint ix = rd_ix + th_ix; + if (ix >= wr_ix && ix < ready_ix) { + uint part_ix = 0; + for (uint i = 0; i < LG_N_PART_READ; i++) { + uint probe = part_ix + ((N_PART_READ / 2) >> i); + if (ix >= sh_part_count[probe - 1]) { + part_ix = probe; + } + } + ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix; + BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]); + BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix)); + sh_elements[th_ix] = inst.element_ix; + sh_right_edge[th_ix] = inst.right_edge; + } + barrier(); + + wr_ix = min(rd_ix + N_TILE, ready_ix); + } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions)); + + // We've done the merge and filled the buffer. + + // Read one element, compute coverage. + uint tag = Annotated_Nop; + AnnotatedRef ref; + float right_edge = 0.0; + if (th_ix + rd_ix < wr_ix) { + uint element_ix = sh_elements[th_ix]; + right_edge = sh_right_edge[th_ix]; + ref = AnnotatedRef(element_ix * Annotated_size); + tag = Annotated_tag(ref); + } + + // Setup for coverage algorithm. + float a, b, c; + // Bounding box of element in pixel coordinates. + float xmin, xmax, ymin, ymax; + uint my_slice = th_ix / 32; + uint my_mask = 1 << (th_ix & 31); + switch (tag) { + case Annotated_FillLine: + case Annotated_StrokeLine: + AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); + xmin = min(line.p0.x, line.p1.x) - line.stroke.x; + xmax = max(line.p0.x, line.p1.x) + line.stroke.x; + ymin = min(line.p0.y, line.p1.y) - line.stroke.y; + ymax = max(line.p0.y, line.p1.y) + line.stroke.y; + float dx = line.p1.x - line.p0.x; + float dy = line.p1.y - line.p0.y; + if (tag == Annotated_FillLine) { + // Set bit for backdrop sign calculation, 1 is +1, 0 is -1. + if (dy < 0) { + atomicOr(sh_bd_sign[my_slice], my_mask); + } else { + atomicAnd(sh_bd_sign[my_slice], ~my_mask); + } + } + atomicOr(sh_is_segment[my_slice], my_mask); + // Set up for per-scanline coverage formula, below. + float invslope = abs(dy) < 1e-9 ? 
1e9 : dx / dy; + c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX; + b = invslope; // Note: assumes square tiles, otherwise scale. + a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX; + break; + case Annotated_Fill: + case Annotated_Stroke: + // Note: we take advantage of the fact that fills and strokes + // have compatible layout. + AnnoFill fill = Annotated_Fill_read(ref); + xmin = fill.bbox.x; + xmax = fill.bbox.z; + ymin = fill.bbox.y; + ymax = fill.bbox.w; + // Just let the clamping to xmin and xmax determine the bounds. + a = 0.0; + b = 0.0; + c = 1e9; + break; + default: + ymin = 0; + ymax = 0; + break; + } + + // Draw the coverage area into the bitmasks. This uses an algorithm + // that computes the coverage of a span for given scanline. + + // Compute bounding box in tiles and clip to this bin. + int x0 = int(floor((xmin - xy0.x) * SX)); + int x1 = int(ceil((xmax - xy0.x) * SX)); + int xr = int(ceil((right_edge - xy0.x) * SX)); + int y0 = int(floor((ymin - xy0.y) * SY)); + int y1 = int(ceil((ymax - xy0.y) * SY)); + x0 = clamp(x0, 0, N_TILE_X); + x1 = clamp(x1, x0, N_TILE_X); + xr = clamp(xr, 0, N_TILE_X); + y0 = clamp(y0, 0, N_TILE_Y); + y1 = clamp(y1, y0, N_TILE_Y); + float t = a + b * float(y0); + for (uint y = y0; y < y1; y++) { + uint xx0 = clamp(int(floor(t - c)), x0, x1); + uint xx1 = clamp(int(ceil(t + c)), x0, x1); + for (uint x = xx0; x < xx1; x++) { + atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask); + } + if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) { + // Assign backdrop to all tiles to the right of the ray crossing the + // top edge of this tile, up to the right edge of the fill bbox. + float xray = t - 0.5 * b; + xx0 = max(int(ceil(xray)), 0); + for (uint x = xx0; x < xr; x++) { + atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask); + } + } + t += b; + } + barrier(); + + // We've computed coverage and other info for each element in the input, now for + // the output stage. We'll do segments first using a more parallel algorithm. + + uint seg_count = 0; + for (uint i = 0; i < N_SLICE; i++) { + seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]); + } + sh_seg_count[th_ix] = seg_count; + // Prefix sum of sh_seg_count + for (uint i = 0; i < LG_N_TILE; i++) { + barrier(); + if (th_ix >= (1 << i)) { + seg_count += sh_seg_count[th_ix - (1 << i)]; + } + barrier(); + sh_seg_count[th_ix] = seg_count; + } + if (th_ix == N_TILE - 1) { + sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size); + } + barrier(); + uint total_seg_count = sh_seg_count[N_TILE - 1]; + uint seg_alloc = sh_seg_alloc; + + // Output buffer is allocated as segments for each tile laid end-to-end. + + for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) { + // Find the work item; this thread is now not bound to an element or tile. + // First find the tile (by binary search) + uint tile_ix = 0; + for (uint i = 0; i < LG_N_TILE; i++) { + uint probe = tile_ix + ((N_TILE / 2) >> i); + if (ix >= sh_seg_count[probe - 1]) { + tile_ix = probe; + } + } + // Now, sh_seg_count[tile_ix - 1] <= ix < sh_seg_count[tile_ix]. + // (considering sh_seg_count[-1] == 0) + + // Index of segment within tile's segments + uint seq_ix = ix; + // Maybe consider a sentinel value to avoid the conditional? + if (tile_ix > 0) { + seq_ix -= sh_seg_count[tile_ix - 1]; + } + // Find the segment. This is done by linear scan through the bitmaps of the + // tile, accelerated by bit counting. 
Binary search might help, maybe not. + uint slice_ix = 0; + uint seq_bits; + + while (true) { + seq_bits = sh_bitmaps[slice_ix][tile_ix] & sh_is_segment[slice_ix]; + uint this_count = bitCount(seq_bits); + if (this_count > seq_ix) { + break; + } + seq_ix -= this_count; + slice_ix++; + } + // Now find position of nth bit set (n = seq_ix) in seq_bits; binary search + uint bit_ix = 0; + for (int i = 0; i < 5; i++) { + uint probe = bit_ix + (16 >> i); + if (seq_ix >= bitCount(seq_bits & ((1 << probe) - 1))) { + bit_ix = probe; + } + } + uint out_offset = seg_alloc + Segment_size * ix + SegChunk_size; + uint rd_el_ix = slice_ix * 32 + bit_ix; + uint element_ix = sh_elements[rd_el_ix]; + ref = AnnotatedRef(element_ix * Annotated_size); + AnnoFillLineSeg line = Annotated_FillLine_read(ref); + float y_edge = 0.0; + // This is basically the same logic as piet-metal, but should be made numerically robust. + if (Annotated_tag(ref) == Annotated_FillLine) { + vec2 tile_xy = xy0 + vec2((tile_ix % N_TILE_X) * TILE_WIDTH_PX, (tile_ix / N_TILE_X) * TILE_HEIGHT_PX); + y_edge = mix(line.p0.y, line.p1.y, (tile_xy.x - line.p0.x) / (line.p1.x - line.p0.x)); + if (min(line.p0.x, line.p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) { + if (line.p0.x > line.p1.x) { + line.p1 = vec2(tile_xy.x, y_edge); + } else { + line.p0 = vec2(tile_xy.x, y_edge); + } + } else { + y_edge = 1e9; + } + } + Segment seg = Segment(line.p0, line.p1, y_edge); + Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg); + } + + // Output non-segment elements for this tile. The thread does a sequential walk + // through the non-segment elements, and for segments, count and backdrop are + // aggregated using bit counting. + uint slice_ix = 0; + uint bitmap = sh_bitmaps[0][th_ix]; + uint bd_bitmap = sh_backdrop[0][th_ix]; + uint bd_sign = sh_bd_sign[0]; + uint is_segment = sh_is_segment[0]; + uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1]; + seg_count = 0; + while (true) { + uint nonseg_bitmap = bitmap & ~is_segment; + if (nonseg_bitmap == 0) { + backdrop += count_backdrop(bd_bitmap, bd_sign); + seg_count += bitCount(bitmap & is_segment); + slice_ix++; + if (slice_ix == N_SLICE) { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + bd_bitmap = sh_backdrop[slice_ix][th_ix]; + bd_sign = sh_bd_sign[slice_ix]; + is_segment = sh_is_segment[slice_ix]; + nonseg_bitmap = bitmap & ~is_segment; + if (nonseg_bitmap == 0) { + continue; + } + } + uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap); + uint element_ix = sh_elements[element_ref_ix]; + + // Bits up to and including the lsb + uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap; + backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign); + seg_count += bitCount(bitmap & bd_mask & is_segment); + // Clear bits that have been consumed. + bd_bitmap &= ~bd_mask; + bitmap &= ~bd_mask; + + // At this point, we read the element again from global memory. + // If that turns out to be expensive, maybe we can pack it into + // shared memory (or perhaps just the tag). 
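            // Editor's note: a sketch of the speculation above, assuming a
            // hypothetical shared array filled during the coverage pass
            // (not implemented in this commit):
            //
            //     shared uint sh_tag[N_TILE];   // one tag per staged element
            //     ...
            //     tag = sh_tag[element_ref_ix]; // replaces the global re-read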
+ ref = AnnotatedRef(element_ix * Annotated_size); + tag = Annotated_tag(ref); + + switch (tag) { + case Annotated_Fill: + if (last_chunk_n > 0 || seg_count > 0) { + SegChunkRef chunk_ref = SegChunkRef(0); + if (seg_count > 0) { + chunk_ref = alloc_seg_chunk(); + SegChunk chunk; + chunk.n = seg_count; + chunk.next = SegChunkRef(0); + uint seg_offset = seg_alloc + seg_start * Segment_size; + chunk.segs = SegmentRef(seg_offset); + SegChunk_write(chunk_ref, chunk); + } + if (last_chunk_n > 0) { + SegChunk chunk; + chunk.n = last_chunk_n; + chunk.next = chunk_ref; + chunk.segs = last_chunk_segs; + SegChunk_write(last_chunk_ref, chunk); + } else { + first_seg_chunk = chunk_ref; + } + + AnnoFill fill = Annotated_Fill_read(ref); + CmdFill cmd_fill; + cmd_fill.seg_ref = first_seg_chunk; + cmd_fill.backdrop = backdrop; + cmd_fill.rgba_color = fill.rgba_color; + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Fill_write(cmd_ref, cmd_fill); + cmd_ref.offset += Cmd_size; + last_chunk_n = 0; + } else if (backdrop != 0) { + AnnoFill fill = Annotated_Fill_read(ref); + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); + cmd_ref.offset += Cmd_size; + } + seg_start += seg_count; + seg_count = 0; + backdrop = 0; + break; + case Annotated_Stroke: + // TODO: reduce divergence & code duplication? Much of the + // fill and stroke processing is in common. + if (last_chunk_n > 0 || seg_count > 0) { + SegChunkRef chunk_ref = SegChunkRef(0); + if (seg_count > 0) { + chunk_ref = alloc_seg_chunk(); + SegChunk chunk; + chunk.n = seg_count; + chunk.next = SegChunkRef(0); + uint seg_offset = seg_alloc + seg_start * Segment_size; + chunk.segs = SegmentRef(seg_offset); + SegChunk_write(chunk_ref, chunk); + } + if (last_chunk_n > 0) { + SegChunk chunk; + chunk.n = last_chunk_n; + chunk.next = chunk_ref; + chunk.segs = last_chunk_segs; + SegChunk_write(last_chunk_ref, chunk); + } else { + first_seg_chunk = chunk_ref; + } + + AnnoStroke stroke = Annotated_Stroke_read(ref); + CmdStroke cmd_stroke; + cmd_stroke.seg_ref = first_seg_chunk; + cmd_stroke.half_width = 0.5 * stroke.linewidth; + cmd_stroke.rgba_color = stroke.rgba_color; + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Stroke_write(cmd_ref, cmd_stroke); + cmd_ref.offset += Cmd_size; + last_chunk_n = 0; + } + seg_start += seg_count; + seg_count = 0; + break; + default: + // This shouldn't happen, but just in case. + seg_start++; + break; + } + } + if (seg_count > 0) { + SegChunkRef chunk_ref = alloc_seg_chunk(); + if (last_chunk_n > 0) { + SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs)); + } else { + first_seg_chunk = chunk_ref; + } + // TODO: free two registers by writing count and segments ref now, + // as opposed to deferring SegChunk write until all fields are known. + last_chunk_ref = chunk_ref; + last_chunk_n = seg_count; + uint seg_offset = seg_alloc + seg_start * Segment_size; + last_chunk_segs = SegmentRef(seg_offset); + } + barrier(); + + rd_ix += N_TILE; + if (rd_ix >= ready_ix && partition_ix >= n_partitions) break; + } + Cmd_End_write(cmd_ref); +} diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv new file mode 100644 index 0000000..5a43f4a Binary files /dev/null and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp new file mode 100644 index 0000000..43bb9cc --- /dev/null +++ b/piet-gpu/shader/elements.comp @@ -0,0 +1,328 @@ +// The element processing stage, first in the pipeline. 
+// +// This stage is primarily about applying transforms and computing bounding +// boxes. It is organized as a scan over the input elements, producing +// annotated output elements. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#define N_ROWS 4 +#define WG_SIZE 32 +#define LG_WG_SIZE 5 +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(set = 0, binding = 0) readonly buffer SceneBuf { + uint[] scene; +}; + +// It would be better to use the Vulkan memory model than +// "volatile" but shooting for compatibility here rather +// than doing things right. +layout(set = 0, binding = 1) volatile buffer StateBuf { + uint[] state; +}; + +// The annotated results are stored here. +layout(set = 0, binding = 2) buffer AnnotatedBuf { + uint[] annotated; +}; + +#include "scene.h" +#include "state.h" +#include "annotated.h" + +#define StateBuf_stride (8 + 2 * State_size) + +StateRef state_aggregate_ref(uint partition_ix) { + return StateRef(12 + partition_ix * StateBuf_stride); +} + +StateRef state_prefix_ref(uint partition_ix) { + return StateRef(12 + partition_ix * StateBuf_stride + State_size); +} + +uint state_flag_index(uint partition_ix) { + return 1 + partition_ix * (StateBuf_stride / 4); +} + +// These correspond to X, A, P respectively in the prefix sum paper. +#define FLAG_NOT_READY 0 +#define FLAG_AGGREGATE_READY 1 +#define FLAG_PREFIX_READY 2 + +#define FLAG_SET_LINEWIDTH 1 +#define FLAG_SET_BBOX 2 +#define FLAG_RESET_BBOX 4 + +// This is almost like a monoid (the interaction between transformation and +// bounding boxes is approximate) +State combine_state(State a, State b) { + State c; + c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; + c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; + c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; + c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; + if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { + c.bbox = a.bbox; + } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 && + (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) + { + c.bbox.xy = min(a.bbox.xy, c.bbox.xy); + c.bbox.zw = max(a.bbox.zw, c.bbox.zw); + } + // It would be more concise to cast to matrix types; ah well. + c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y; + c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y; + c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w; + c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w; + c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x; + c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y; + c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; + c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags; + c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1; + return c; +} + +State map_element(ElementRef ref, inout bool is_fill) { + // TODO: it would *probably* be more efficient to make the memory read patterns less + // divergent, though it would be more wasted memory. 
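    // Editor's note: assuming scene elements follow the same generated layout
    // as the annotated structs (tag word first, payload from byte offset 4),
    // thread k starts at element k * N_ROWS, so neighboring threads read
    // words N_ROWS * Element_size bytes apart -- the strided, divergent
    // pattern the TODO above refers to.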
+ uint tag = Element_tag(ref); + State c; + c.bbox = vec4(0.0, 0.0, 0.0, 0.0); + c.mat = vec4(1.0, 0.0, 0.0, 1.0); + c.translate = vec2(0.0, 0.0); + c.linewidth = 1.0; // TODO should be 0.0 + c.flags = 0; + is_fill = false; + switch (tag) { + case Element_FillLine: + case Element_StrokeLine: + LineSeg line = Element_FillLine_read(ref); + c.bbox.xy = min(line.p0, line.p1); + c.bbox.zw = max(line.p0, line.p1); + break; + case Element_Quad: + QuadSeg quad = Element_Quad_read(ref); + c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2); + c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2); + break; + case Element_Cubic: + CubicSeg cubic = Element_Cubic_read(ref); + c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3)); + c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); + break; + case Element_Fill: + is_fill = true; + // fall-through + case Element_Stroke: + c.flags = FLAG_RESET_BBOX; + break; + case Element_SetLineWidth: + SetLineWidth lw = Element_SetLineWidth_read(ref); + c.linewidth = lw.width; + c.flags = FLAG_SET_LINEWIDTH; + break; + case Element_Transform: + Transform t = Element_Transform_read(ref); + c.mat = t.mat; + c.translate = t.translate; + break; + } + return c; +} + +// Get the bounding box of a circle transformed by the matrix into an ellipse. +vec2 get_linewidth(State st) { + // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm + return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw)); +} + +// We should be able to use an array of structs but the NV shader compiler +// doesn't seem to like it :/ +//shared State sh_state[WG_SIZE]; +shared vec4 sh_mat[WG_SIZE]; +shared vec2 sh_translate[WG_SIZE]; +shared vec4 sh_bbox[WG_SIZE]; +shared float sh_width[WG_SIZE]; +shared uint sh_flags[WG_SIZE]; + +shared uint sh_min_fill; + +shared uint sh_tile_ix; +shared State sh_prefix; + +void main() { + State th_state[N_ROWS]; + // Determine partition to process by atomic counter (described in Section + // 4.4 of prefix sum paper). + if (gl_LocalInvocationID.x == 0) { + sh_tile_ix = atomicAdd(state[0], 1); + sh_min_fill = ~0; + } + barrier(); + uint tile_ix = sh_tile_ix; + + uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS; + ElementRef ref = ElementRef(ix * Element_size); + + bool is_fill; + uint my_min_fill = ~0; + th_state[0] = map_element(ref, is_fill); + if (is_fill) my_min_fill = ix; + for (uint i = 1; i < N_ROWS; i++) { + // discussion question: would it be faster to load using more coherent patterns + // into thread memory? This is kinda strided. 
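+        // Sequential inclusive scan of this thread's N_ROWS elements; the
+        // resulting per-thread aggregate is combined across the workgroup
+        // by the shared-memory scan below.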
+ th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill)); + if (is_fill && my_min_fill == ~0) { + my_min_fill = ix + i; + } + } + atomicMin(sh_min_fill, my_min_fill); + State agg = th_state[N_ROWS - 1]; + sh_mat[gl_LocalInvocationID.x] = agg.mat; + sh_translate[gl_LocalInvocationID.x] = agg.translate; + sh_bbox[gl_LocalInvocationID.x] = agg.bbox; + sh_width[gl_LocalInvocationID.x] = agg.linewidth; + sh_flags[gl_LocalInvocationID.x] = agg.flags; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1 << i)) { + State other; + uint ix = gl_LocalInvocationID.x - (1 << i); + other.mat = sh_mat[ix]; + other.translate = sh_translate[ix]; + other.bbox = sh_bbox[ix]; + other.linewidth = sh_width[ix]; + other.flags = sh_flags[ix]; + agg = combine_state(other, agg); + } + barrier(); + sh_mat[gl_LocalInvocationID.x] = agg.mat; + sh_translate[gl_LocalInvocationID.x] = agg.translate; + sh_bbox[gl_LocalInvocationID.x] = agg.bbox; + sh_width[gl_LocalInvocationID.x] = agg.linewidth; + sh_flags[gl_LocalInvocationID.x] = agg.flags; + } + + State exclusive; + exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0); + exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0); + exclusive.translate = vec2(0.0, 0.0); + exclusive.linewidth = 1.0; //TODO should be 0.0 + exclusive.flags = 0; + + // Publish aggregate for this partition + if (gl_LocalInvocationID.x == WG_SIZE - 1) { + // Note: with memory model, we'd want to generate the atomic store version of this. + State_write(state_aggregate_ref(tile_ix), agg); + uint flag = FLAG_AGGREGATE_READY; + memoryBarrierBuffer(); + if (tile_ix == 0) { + State_write(state_prefix_ref(tile_ix), agg); + flag = FLAG_PREFIX_READY; + } + state[state_flag_index(tile_ix)] = flag; + if (tile_ix != 0) { + // step 4 of paper: decoupled lookback + uint look_back_ix = tile_ix - 1; + while (true) { + flag = state[state_flag_index(look_back_ix)]; + if (flag == FLAG_PREFIX_READY) { + State their_prefix = State_read(state_prefix_ref(look_back_ix)); + exclusive = combine_state(their_prefix, exclusive); + break; + } else if (flag == FLAG_AGGREGATE_READY) { + State their_agg = State_read(state_aggregate_ref(look_back_ix)); + exclusive = combine_state(their_agg, exclusive); + look_back_ix--; + } + // else spin + } + + // step 5 of paper: compute inclusive prefix + State inclusive_prefix = combine_state(exclusive, agg); + sh_prefix = exclusive; + State_write(state_prefix_ref(tile_ix), inclusive_prefix); + memoryBarrierBuffer(); + flag = FLAG_PREFIX_READY; + state[state_flag_index(tile_ix)] = flag; + } + } + barrier(); + my_min_fill = sh_min_fill; + if (tile_ix != 0) { + exclusive = sh_prefix; + } + + State row = exclusive; + if (gl_LocalInvocationID.x > 0) { + uint ix = gl_LocalInvocationID.x - 1; + State other; + other.mat = sh_mat[ix]; + other.translate = sh_translate[ix]; + other.bbox = sh_bbox[ix]; + other.linewidth = sh_width[ix]; + other.flags = sh_flags[ix]; + row = combine_state(row, other); + } + if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) { + state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity + } + for (uint i = 0; i < N_ROWS; i++) { + State st = combine_state(row, th_state[i]); + if (my_min_fill == ix + i) { + state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z); + } + // We write the state now for development purposes, but the + // actual goal is to write transformed and annotated elements. + //State_write(StateRef((ix + i) * State_size), st); + + // Here we read again from the original scene. 
There may be + // gains to be had from stashing in shared memory or possibly + // registers (though register pressure is an issue). + ElementRef this_ref = Element_index(ref, i); + AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size); + uint tag = Element_tag(this_ref); + switch (tag) { + case Element_FillLine: + case Element_StrokeLine: + LineSeg line = Element_StrokeLine_read(this_ref); + AnnoStrokeLineSeg anno_line; + anno_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate; + anno_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate; + if (tag == Element_StrokeLine) { + anno_line.stroke = get_linewidth(st); + } else { + anno_line.stroke = vec2(0.0); + } + // We do encoding a bit by hand to minimize divergence. Another approach + // would be to have a fill/stroke bool. + uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine; + annotated[out_ref.offset >> 2] = out_tag; + AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line); + break; + case Element_Stroke: + Stroke stroke = Element_Stroke_read(this_ref); + AnnoStroke anno_stroke; + anno_stroke.rgba_color = stroke.rgba_color; + vec2 lw = get_linewidth(st); + anno_stroke.bbox = st.bbox + vec4(-lw, lw); + anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z); + Annotated_Stroke_write(out_ref, anno_stroke); + break; + case Element_Fill: + Fill fill = Element_Fill_read(this_ref); + AnnoFill anno_fill; + anno_fill.rgba_color = fill.rgba_color; + anno_fill.bbox = st.bbox; + Annotated_Fill_write(out_ref, anno_fill); + break; + default: + Annotated_Nop_write(out_ref); + break; + } + } +} diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv new file mode 100644 index 0000000..a2d439c Binary files /dev/null and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/fill_seg.h b/piet-gpu/shader/fill_seg.h deleted file mode 100644 index abe199f..0000000 --- a/piet-gpu/shader/fill_seg.h +++ /dev/null @@ -1,130 +0,0 @@ -// Code auto-generated by piet-gpu-derive - -struct FillTileHeaderRef { - uint offset; -}; - -struct FillItemHeaderRef { - uint offset; -}; - -struct FillSegmentRef { - uint offset; -}; - -struct FillSegChunkRef { - uint offset; -}; - -struct FillTileHeader { - uint n; - FillItemHeaderRef items; -}; - -#define FillTileHeader_size 8 - -FillTileHeaderRef FillTileHeader_index(FillTileHeaderRef ref, uint index) { - return FillTileHeaderRef(ref.offset + index * FillTileHeader_size); -} - -struct FillItemHeader { - int backdrop; - FillSegChunkRef segments; -}; - -#define FillItemHeader_size 8 - -FillItemHeaderRef FillItemHeader_index(FillItemHeaderRef ref, uint index) { - return FillItemHeaderRef(ref.offset + index * FillItemHeader_size); -} - -struct FillSegment { - vec2 start; - vec2 end; -}; - -#define FillSegment_size 16 - -FillSegmentRef FillSegment_index(FillSegmentRef ref, uint index) { - return FillSegmentRef(ref.offset + index * FillSegment_size); -} - -struct FillSegChunk { - uint n; - FillSegChunkRef next; -}; - -#define FillSegChunk_size 8 - -FillSegChunkRef FillSegChunk_index(FillSegChunkRef ref, uint index) { - return FillSegChunkRef(ref.offset + index * FillSegChunk_size); -} - -FillTileHeader FillTileHeader_read(FillTileHeaderRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = fill_seg[ix + 0]; - uint raw1 = fill_seg[ix + 1]; - FillTileHeader s; - s.n = raw0; - s.items = FillItemHeaderRef(raw1); - return s; -} - -void FillTileHeader_write(FillTileHeaderRef ref, 
FillTileHeader s) { - uint ix = ref.offset >> 2; - fill_seg[ix + 0] = s.n; - fill_seg[ix + 1] = s.items.offset; -} - -FillItemHeader FillItemHeader_read(FillItemHeaderRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = fill_seg[ix + 0]; - uint raw1 = fill_seg[ix + 1]; - FillItemHeader s; - s.backdrop = int(raw0); - s.segments = FillSegChunkRef(raw1); - return s; -} - -void FillItemHeader_write(FillItemHeaderRef ref, FillItemHeader s) { - uint ix = ref.offset >> 2; - fill_seg[ix + 0] = uint(s.backdrop); - fill_seg[ix + 1] = s.segments.offset; -} - -FillSegment FillSegment_read(FillSegmentRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = fill_seg[ix + 0]; - uint raw1 = fill_seg[ix + 1]; - uint raw2 = fill_seg[ix + 2]; - uint raw3 = fill_seg[ix + 3]; - FillSegment s; - s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); - s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); - return s; -} - -void FillSegment_write(FillSegmentRef ref, FillSegment s) { - uint ix = ref.offset >> 2; - fill_seg[ix + 0] = floatBitsToUint(s.start.x); - fill_seg[ix + 1] = floatBitsToUint(s.start.y); - fill_seg[ix + 2] = floatBitsToUint(s.end.x); - fill_seg[ix + 3] = floatBitsToUint(s.end.y); -} - -FillSegChunk FillSegChunk_read(FillSegChunkRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = fill_seg[ix + 0]; - uint raw1 = fill_seg[ix + 1]; - FillSegChunk s; - s.n = raw0; - s.next = FillSegChunkRef(raw1); - return s; -} - -void FillSegChunk_write(FillSegChunkRef ref, FillSegChunk s) { - uint ix = ref.offset >> 2; - fill_seg[ix + 0] = s.n; - fill_seg[ix + 1] = s.next.offset; -} - diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp deleted file mode 100644 index 6b76c53..0000000 --- a/piet-gpu/shader/kernel1.comp +++ /dev/null @@ -1,161 +0,0 @@ -// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph -// and outputs "instances" (references to item + translation) for each item -// that intersects the tilegroup. -// -// This implementation is simplistic and leaves a lot of performance on the -// table. A fancier implementation would use threadgroup shared memory or -// subgroups (or possibly both) to parallelize the reading of the input and -// the computation of tilegroup intersection. -// -// In addition, there are some features currently missing, such as support -// for clipping. - -#version 450 -#extension GL_GOOGLE_include_directive : enable - -// It's possible we should lay this out with x and do our own math. -layout(local_size_x = 1, local_size_y = 32) in; - -layout(set = 0, binding = 0) readonly buffer SceneBuf { - uint[] scene; -}; - -layout(set = 0, binding = 1) buffer TilegroupBuf { - uint[] tilegroup; -}; - -layout(set = 0, binding = 2) buffer AllocBuf { - uint alloc; -}; - -#include "scene.h" -#include "tilegroup.h" - -#include "setup.h" - -#define MAX_STACK 8 - -struct StackElement { - PietItemRef group; - uint index; - vec2 offset; -}; - -void main() { - StackElement stack[MAX_STACK]; - uint stack_ix = 0; - uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x; - TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE); - uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size; - - // State for stroke references. 
- TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START); - ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4); - InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size); - uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size; - uint stroke_chunk_n = 0; - uint stroke_n = 0; - - // State for fill references. All this is a bit cut'n'paste, but making a - // proper abstraction isn't easy. - TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START); - ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4); - InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size); - uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size; - uint fill_chunk_n = 0; - uint fill_n = 0; - - vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX); - PietItemRef root = PietItemRef(0); - SimpleGroup group = PietItem_Group_read(root); - StackElement tos = StackElement(root, 0, group.offset.xy); - - while (true) { - if (tos.index < group.n_items) { - Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index)); - vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy; - bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX)) - && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX)); - bool is_group = false; - uint tag; - if (hit) { - PietItemRef item_ref = PietItem_index(group.items, tos.index); - tag = PietItem_tag(item_ref); - is_group = tag == PietItem_Group; - } - if (hit && !is_group) { - PietItemRef item_ref = PietItem_index(group.items, tos.index); - Instance ins = Instance(item_ref.offset, tos.offset); - if (tg_ref.offset > tg_limit) { - // Allocation exceeded; do atomic bump alloc. 
- uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC); - Jump jump = Jump(TileGroupRef(new_tg)); - TileGroup_Jump_write(tg_ref, jump); - tg_ref = TileGroupRef(new_tg); - tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size; - } - TileGroup_Instance_write(tg_ref, ins); - tg_ref.offset += TileGroup_size; - if (tag == PietItem_Poly) { - if (stroke_ref.offset > stroke_limit) { - uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC); - Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke))); - stroke_chunk_start = ChunkRef(new_stroke); - stroke_ref = InstanceRef(new_stroke + Chunk_size); - stroke_n += stroke_chunk_n; - stroke_chunk_n = 0; - stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size; - } - Instance_write(stroke_ref, ins); - stroke_chunk_n++; - stroke_ref.offset += Instance_size; - } else if (tag == PietItem_Fill) { - if (fill_ref.offset > fill_limit) { - uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC); - Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill))); - fill_chunk_start = ChunkRef(new_fill); - fill_ref = InstanceRef(new_fill + Chunk_size); - fill_n += fill_chunk_n; - fill_chunk_n = 0; - fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size; - } - Instance_write(fill_ref, ins); - fill_chunk_n++; - fill_ref.offset += Instance_size; - - } - } - if (is_group) { - PietItemRef item_ref = PietItem_index(group.items, tos.index); - tos.index++; - if (tos.index < group.n_items) { - stack[stack_ix++] = tos; - } - group = PietItem_Group_read(item_ref); - tos = StackElement(item_ref, 0, tos.offset + group.offset.xy); - } else { - tos.index++; - } - } else { - // processed all items in this group; pop the stack - if (stack_ix == 0) { - break; - } - tos = stack[--stack_ix]; - group = PietItem_Group_read(tos.group); - } - } - TileGroup_End_write(tg_ref); - - stroke_n += stroke_chunk_n; - if (stroke_n > 0) { - Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0))); - } - tilegroup[stroke_start.offset >> 2] = stroke_n; - - fill_n += fill_chunk_n; - if (fill_n > 0) { - Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0))); - } - tilegroup[fill_start.offset >> 2] = fill_n; -} diff --git a/piet-gpu/shader/kernel1.spv b/piet-gpu/shader/kernel1.spv deleted file mode 100644 index 358151d..0000000 Binary files a/piet-gpu/shader/kernel1.spv and /dev/null differ diff --git a/piet-gpu/shader/kernel2f.comp b/piet-gpu/shader/kernel2f.comp deleted file mode 100644 index 7ea93bd..0000000 --- a/piet-gpu/shader/kernel2f.comp +++ /dev/null @@ -1,167 +0,0 @@ -// This is "kernel 2" (fill) in a 4-kernel pipeline. It processes the fill -// (polyline) items in the scene and generates a list of segments for each, for -// each tile. - -#version 450 -#extension GL_GOOGLE_include_directive : enable - -layout(local_size_x = 32) in; - -layout(set = 0, binding = 0) readonly buffer SceneBuf { - uint[] scene; -}; - -layout(set = 0, binding = 1) buffer TilegroupBuf { - uint[] tilegroup; -}; - -layout(set = 0, binding = 2) buffer FillSegBuf { - uint[] fill_seg; -}; - -layout(set = 0, binding = 3) buffer AllocBuf { - uint alloc; -}; - -#include "scene.h" -#include "tilegroup.h" -#include "fill_seg.h" - -#include "setup.h" - -// Ensure that there is space to encode a segment. 
-void alloc_chunk(inout uint chunk_n_segs, inout FillSegChunkRef seg_chunk_ref, - inout FillSegChunkRef first_seg_chunk, inout uint seg_limit) -{ - if (chunk_n_segs == 0) { - if (seg_chunk_ref.offset + 40 > seg_limit) { - seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC); - seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - FillSegment_size; - } - first_seg_chunk = seg_chunk_ref; - } else if (seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs > seg_limit) { - uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC); - seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - FillSegment_size; - FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(new_chunk_ref))); - seg_chunk_ref.offset = new_chunk_ref; - chunk_n_segs = 0; - } - -} - -void main() { - uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x; - uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS - + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); - vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); - TileGroupRef fill_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_FILL_START); - uint fill_n = tilegroup[fill_start.offset >> 2]; - - FillTileHeaderRef tile_header_ref = FillTileHeaderRef(tile_ix * FillTileHeader_size); - if (fill_n > 0) { - ChunkRef chunk_ref = ChunkRef(fill_start.offset + 4); - Chunk chunk = Chunk_read(chunk_ref); - InstanceRef fill_ref = InstanceRef(chunk_ref.offset + Chunk_size); - FillItemHeaderRef item_header = FillItemHeaderRef(atomicAdd(alloc, fill_n * FillItemHeader_size)); - FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, item_header)); - FillSegChunkRef seg_chunk_ref = FillSegChunkRef(0); - uint seg_limit = 0; - // Iterate through items; fill_n holds count remaining. - while (true) { - if (chunk.chunk_n == 0) { - chunk_ref = chunk.next; - if (chunk_ref.offset == 0) { - break; - } - chunk = Chunk_read(chunk_ref); - fill_ref = InstanceRef(chunk_ref.offset + Chunk_size); - } - Instance ins = Instance_read(fill_ref); - PietFill fill = PietItem_Fill_read(PietItemRef(ins.item_ref)); - - // Process the fill polyline item. - uint max_n_segs = fill.n_points - 1; - uint chunk_n_segs = 0; - int backdrop = 0; - FillSegChunkRef seg_chunk_ref; - FillSegChunkRef first_seg_chunk = FillSegChunkRef(0); - vec2 start = Point_read(fill.points).xy; - for (uint j = 0; j < max_n_segs; j++) { - fill.points.offset += Point_size; - vec2 end = Point_read(fill.points).xy; - - // Process one segment. - - // TODO: I think this would go more smoothly (and be easier to - // make numerically robust) if it were based on clipping the line - // to the tile box. See: - // https://tavianator.com/fast-branchless-raybounding-box-intersections/ - vec2 xymin = min(start, end); - vec2 xymax = max(start, end); - float a = end.y - start.y; - float b = start.x - end.x; - float c = -(a * start.x + b * start.y); - vec2 xy1 = xy0 + vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); - float ytop = max(xy0.y, xymin.y); - float ybot = min(xy1.y, xymax.y); - float s00 = sign(b * ytop + a * xy0.x + c); - float s01 = sign(b * ytop + a * xy1.x + c); - float s10 = sign(b * ybot + a * xy0.x + c); - float s11 = sign(b * ybot + a * xy1.x + c); - float sTopLeft = sign(b * xy0.y + a * xy0.x + c); - if (sTopLeft == sign(a) && xymin.y <= xy0.y && xymax.y > xy0.y) { - backdrop -= int(s00); - } - - // This is adapted from piet-metal but could be improved. 
- - if (max(xymin.x, xy0.x) < min(xymax.x, xy1.x) - && ytop < ybot - && s00 * s01 + s00 * s10 + s00 * s11 < 3.0) - { - // avoid overwriting `end` so that it can be used as start - vec2 enc_end = end; - if (xymin.x < xy0.x) { - float yEdge = mix(start.y, end.y, (start.x - xy0.x) / b); - if (yEdge >= xy0.y && yEdge < xy1.y) { - // This is encoded the same as a general fill segment, but could be - // special-cased, either here or in rendering. (It was special-cased - // in piet-metal). - FillSegment edge_seg; - if (b > 0.0) { - enc_end = vec2(xy0.x, yEdge); - edge_seg.start = enc_end; - edge_seg.end = vec2(xy0.x, xy1.y); - } else { - start = vec2(xy0.x, yEdge); - edge_seg.start = vec2(xy0.x, xy1.y); - edge_seg.end = start; - } - alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit); - FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), edge_seg); - chunk_n_segs++; - } - } - alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit); - FillSegment seg = FillSegment(start, enc_end); - FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), seg); - chunk_n_segs++; - } - - start = end; - } - FillItemHeader_write(item_header, FillItemHeader(backdrop, first_seg_chunk)); - if (chunk_n_segs != 0) { - FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(0))); - seg_chunk_ref.offset += FillSegChunk_size + FillSegment_size * chunk_n_segs; - } - - fill_ref.offset += Instance_size; - chunk.chunk_n--; - item_header.offset += FillItemHeader_size; - } - } else { - // As an optimization, we could just write 0 for the size. - FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, FillItemHeaderRef(0))); - } -} diff --git a/piet-gpu/shader/kernel2f.spv b/piet-gpu/shader/kernel2f.spv deleted file mode 100644 index 75a7a39..0000000 Binary files a/piet-gpu/shader/kernel2f.spv and /dev/null differ diff --git a/piet-gpu/shader/kernel2s.comp b/piet-gpu/shader/kernel2s.comp deleted file mode 100644 index d6b1571..0000000 --- a/piet-gpu/shader/kernel2s.comp +++ /dev/null @@ -1,137 +0,0 @@ -// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke -// (polyline) items in the scene and generates a list of segments for each, for -// each tile. 
- -#version 450 -#extension GL_GOOGLE_include_directive : enable - -layout(local_size_x = 32) in; - -layout(set = 0, binding = 0) readonly buffer SceneBuf { - uint[] scene; -}; - -layout(set = 0, binding = 1) buffer TilegroupBuf { - uint[] tilegroup; -}; - -layout(set = 0, binding = 2) buffer SegmentBuf { - uint[] segment; -}; - -layout(set = 0, binding = 3) buffer AllocBuf { - uint alloc; -}; - -#include "scene.h" -#include "tilegroup.h" -#include "segment.h" - -#include "setup.h" - -void main() { - uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x; - uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS - + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); - vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); - TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START); - uint stroke_n = tilegroup[stroke_start.offset >> 2]; - - TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size); - if (stroke_n > 0) { - ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4); - Chunk chunk = Chunk_read(chunk_ref); - InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size); - ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size)); - TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header)); - SegChunkRef seg_chunk_ref = SegChunkRef(0); - uint seg_limit = 0; - // Iterate through items; stroke_n holds count remaining. - while (true) { - if (chunk.chunk_n == 0) { - chunk_ref = chunk.next; - if (chunk_ref.offset == 0) { - break; - } - chunk = Chunk_read(chunk_ref); - stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size); - } - Instance ins = Instance_read(stroke_ref); - PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref)); - - // Process the stroke polyline item. - uint max_n_segs = poly.n_points - 1; - uint chunk_n_segs = 0; - SegChunkRef seg_chunk_ref; - vec2 start = Point_read(poly.points).xy; - for (uint j = 0; j < max_n_segs; j++) { - poly.points.offset += Point_size; - vec2 end = Point_read(poly.points).xy; - - // Process one segment. - - // This logic just tests for collision. What we probably want to do - // is a clipping algorithm like Liang-Barsky, and then store coords - // relative to the tile in f16. See also: - // https://tavianator.com/fast-branchless-raybounding-box-intersections/ - - // Also note that when we go to the fancy version, we want to compute - // the (horizontal projection of) the bounding box of the intersection - // once per tilegroup, so we can assign work to individual tiles. - - float a = end.y - start.y; - float b = start.x - end.x; - float c = -(a * start.x + b * start.y); - float half_width = 0.5 * poly.width; - // Tile boundaries padded by half-width. - float xmin = xy0.x - half_width; - float ymin = xy0.y - half_width; - float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width; - float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width; - float s00 = sign(b * ymin + a * xmin + c); - float s01 = sign(b * ymin + a * xmax + c); - float s10 = sign(b * ymax + a * xmin + c); - float s11 = sign(b * ymax + a * xmax + c); - // If bounding boxes intersect and not all four corners are on the same side, hit. - // Also note: this is designed to be false on NAN input. 
- if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax) - && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax) - && s00 * s01 + s00 * s10 + s00 * s11 < 3.0) - { - // Allocate a chunk if needed. - if (chunk_n_segs == 0) { - if (seg_chunk_ref.offset + 40 > seg_limit) { - seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC); - seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size; - } - ItemHeader_write(item_header, ItemHeader(seg_chunk_ref)); - } else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) { - uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC); - seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size; - SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref))); - seg_chunk_ref.offset = new_chunk_ref; - chunk_n_segs = 0; - } - Segment seg = Segment(start, end); - Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg); - chunk_n_segs++; - } - - start = end; - } - if (chunk_n_segs == 0) { - ItemHeader_write(item_header, ItemHeader(SegChunkRef(0))); - } else { - SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0))); - seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs; - } - - stroke_ref.offset += Instance_size; - chunk.chunk_n--; - item_header.offset += ItemHeader_size; - } - } else { - // As an optimization, we could just write 0 for the size. - TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0))); - } -} diff --git a/piet-gpu/shader/kernel2s.spv b/piet-gpu/shader/kernel2s.spv deleted file mode 100644 index f7c27f0..0000000 Binary files a/piet-gpu/shader/kernel2s.spv and /dev/null differ diff --git a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp deleted file mode 100644 index bd6d809..0000000 --- a/piet-gpu/shader/kernel3.comp +++ /dev/null @@ -1,135 +0,0 @@ -// This is "kernel 3" in a 4-kernel pipeline. It walks the active items -// for the tilegroup and produces a per-tile command list for each tile. - -#version 450 -#extension GL_GOOGLE_include_directive : enable - -layout(local_size_x = 32, local_size_y = 1) in; - -layout(set = 0, binding = 0) readonly buffer SceneBuf { - uint[] scene; -}; - -// TODO: this should have a `readonly` qualifier, but then inclusion -// of ptcl.h would fail because of the writers. 
-layout(set = 0, binding = 1) buffer TilegroupBuf { - uint[] tilegroup; -}; - -// Used readonly -layout(set = 0, binding = 2) buffer SegmentBuf { - uint[] segment; -}; - -// Used readonly -layout(set = 0, binding = 3) buffer FillSegmentBuf { - uint[] fill_seg; -}; - -layout(set = 0, binding = 4) buffer PtclBuf { - uint[] ptcl; -}; - -layout(set = 0, binding = 5) buffer AllocBuf { - uint alloc; -}; - -#include "scene.h" -#include "tilegroup.h" -#include "segment.h" -#include "fill_seg.h" -#include "ptcl.h" - -#include "setup.h" - -void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) { - if (cmd_ref.offset > cmd_limit) { - uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC); - CmdJump jump = CmdJump(new_cmd); - Cmd_Jump_write(cmd_ref, jump); - cmd_ref = CmdRef(new_cmd); - cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size; - } -} - -void main() { - uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x; - uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS - + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); - vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); - TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE); - CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); - uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; - - TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size)); - FillTileHeader fill_th = FillTileHeader_read(FillTileHeaderRef(tile_ix * FillTileHeader_size)); - - while (true) { - uint tg_tag = TileGroup_tag(tg_ref); - if (tg_tag == TileGroup_End) { - break; - } - if (tg_tag == TileGroup_Jump) { - tg_ref = TileGroup_Jump_read(tg_ref).new_ref; - continue; - } - // Assume tg_tag is `Instance`, though there will be more cases. - Instance ins = TileGroup_Instance_read(tg_ref); - PietItemRef item_ref = PietItemRef(ins.item_ref); - uint item_tag = PietItem_tag(item_ref); - switch (item_tag) { - case PietItem_Circle: - PietCircle circle = PietItem_Circle_read(item_ref); - vec2 center = ins.offset + circle.center.xy; - float r = circle.radius; - if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX)) - && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX))) - { - CmdCircle cmd = CmdCircle(center, r, circle.rgba_color); - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Circle_write(cmd_ref, cmd); - cmd_ref.offset += Cmd_size; - } - break; - case PietItem_Poly: - ItemHeader stroke_item = ItemHeader_read(stroke_th.items); - stroke_th.items.offset += ItemHeader_size; - if (stroke_item.segments.offset != 0) { - PietStrokePolyLine poly = PietItem_Poly_read(item_ref); - CmdStroke cmd = CmdStroke( - stroke_item.segments.offset, - 0.5 * poly.width, - poly.rgba_color - ); - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Stroke_write(cmd_ref, cmd); - cmd_ref.offset += Cmd_size; - } - break; - case PietItem_Fill: - FillItemHeader fill_item = FillItemHeader_read(fill_th.items); - fill_th.items.offset += FillItemHeader_size; - // TODO: handle segments == 0 but backdrop != specially, it's a solid tile. 
- if (fill_item.segments.offset != 0) { - PietFill fill = PietItem_Fill_read(item_ref); - CmdFill cmd = CmdFill( - fill_item.segments.offset, - fill_item.backdrop, - fill.rgba_color - ); - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Fill_write(cmd_ref, cmd); - cmd_ref.offset += Cmd_size; - } else if (fill_item.backdrop != 0) { - // TODO: truncate existing cmd list if alpha is opaque - PietFill fill = PietItem_Fill_read(item_ref); - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); - cmd_ref.offset += Cmd_size; - } - break; - } - tg_ref.offset += TileGroup_size; - } - Cmd_End_write(cmd_ref); -} diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv deleted file mode 100644 index c182337..0000000 Binary files a/piet-gpu/shader/kernel3.spv and /dev/null differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index d6f33b7..2c068aa 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -6,29 +6,20 @@ #version 450 #extension GL_GOOGLE_include_directive : enable +#extension GL_KHR_shader_subgroup_basic : enable -layout(local_size_x = 16, local_size_y = 16) in; +#define CHUNK 8 +#define CHUNK_DY (16 / CHUNK) +layout(local_size_x = 16, local_size_y = 2) in; // Same concern that this should be readonly as in kernel 3. layout(set = 0, binding = 0) buffer PtclBuf { uint[] ptcl; }; -// Used readonly -layout(set = 0, binding = 1) buffer SegmentBuf { - uint[] segment; -}; - -// Used readonly -layout(set = 0, binding = 2) buffer FillSegBuf { - uint[] fill_seg; -}; - -layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image; +layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image; #include "ptcl.h" -#include "segment.h" -#include "fill_seg.h" #include "setup.h" @@ -36,10 +27,14 @@ void main() { uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x; CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); - uvec2 xy_uint = gl_GlobalInvocationID.xy; + uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy = vec2(xy_uint); vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT); - vec3 rgb = uv.xyy; + //vec3 rgb = uv.xyy; + vec3 rgb[CHUNK]; + for (uint i = 0; i < CHUNK; i++) { + rgb[i] = vec3(0.5); + } while (true) { uint tag = Cmd_tag(cmd_ref); @@ -49,65 +44,85 @@ void main() { switch (tag) { case Cmd_Circle: CmdCircle circle = Cmd_Circle_read(cmd_ref); - float r = length(xy + vec2(0.5, 0.5) - circle.center.xy); - float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0); vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx; - // TODO: sRGB - rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); + for (uint i = 0; i < CHUNK; i++) { + float dy = float(i * CHUNK_DY); + float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy); + float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0); + // TODO: sRGB + rgb[i] = mix(rgb[i], fg_rgba.rgb, alpha * fg_rgba.a); + } break; case Cmd_Stroke: CmdStroke stroke = Cmd_Stroke_read(cmd_ref); - float df = 1e9; - SegChunkRef seg_chunk_ref = SegChunkRef(stroke.seg_ref); + float df[CHUNK]; + for (uint k = 0; k < CHUNK; k++) df[k] = 1e9; + SegChunkRef seg_chunk_ref = stroke.seg_ref; do { SegChunk seg_chunk = SegChunk_read(seg_chunk_ref); + SegmentRef segs = seg_chunk.segs; for (int i = 0; i < seg_chunk.n; i++) { - Segment seg = Segment_read(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * i)); + Segment seg = Segment_read(Segment_index(segs, i)); vec2 line_vec = 
seg.end - seg.start; - vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; - float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); - df = min(df, length(line_vec * t - dpos)); + for (uint k = 0; k < CHUNK; k++) { + vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; + dpos.y += float(k * CHUNK_DY); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k] = min(df[k], length(line_vec * t - dpos)); + } } seg_chunk_ref = seg_chunk.next; } while (seg_chunk_ref.offset != 0); fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx; - alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0); - rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); + for (uint k = 0; k < CHUNK; k++) { + float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0); + rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a); + } break; case Cmd_Fill: CmdFill fill = Cmd_Fill_read(cmd_ref); // Probably better to store as float, but conversion is no doubt cheap. - float area = float(fill.backdrop); - FillSegChunkRef fill_seg_chunk_ref = FillSegChunkRef(fill.seg_ref); + float area[CHUNK]; + for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop); + SegChunkRef fill_seg_chunk_ref = fill.seg_ref; do { - FillSegChunk seg_chunk = FillSegChunk_read(fill_seg_chunk_ref); + SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref); + SegmentRef segs = seg_chunk.segs; for (int i = 0; i < seg_chunk.n; i++) { - FillSegment seg = FillSegment_read(FillSegmentRef(fill_seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * i)); - vec2 start = seg.start - xy; - vec2 end = seg.end - xy; - vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0); - if (window.x != window.y) { - vec2 t = (window - start.y) / (end.y - start.y); - vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y)); - float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6; - float xmax = max(xs.x, xs.y); - float b = min(xmax, 1.0); - float c = max(b, 0.0); - float d = max(xmin, 0.0); - float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); - area += a * (window.x - window.y); + Segment seg = Segment_read(Segment_index(segs, i)); + for (uint k = 0; k < CHUNK; k++) { + vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY)); + vec2 start = seg.start - my_xy; + vec2 end = seg.end - my_xy; + vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0); + if (window.x != window.y) { + vec2 t = (window - start.y) / (end.y - start.y); + vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y)); + float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0); + float c = max(b, 0.0); + float d = max(xmin, 0.0); + float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); + area[k] += a * (window.x - window.y); + } + area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); } } fill_seg_chunk_ref = seg_chunk.next; } while (fill_seg_chunk_ref.offset != 0); fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx; - alpha = min(abs(area), 1.0); - rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); + for (uint k = 0; k < CHUNK; k++) { + float alpha = min(abs(area[k]), 1.0); + rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a); + } break; case Cmd_Solid: CmdSolid solid = Cmd_Solid_read(cmd_ref); fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx; - rgb = mix(rgb, fg_rgba.rgb, fg_rgba.a); + for (uint k = 0; k < CHUNK; k++) { + rgb[k] = mix(rgb[k], fg_rgba.rgb, fg_rgba.a); + } break; case Cmd_Jump: cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref); @@ -116,5 +131,8 @@ void main() { cmd_ref.offset += 
Cmd_size; } - imageStore(image, ivec2(xy_uint), vec4(rgb, 1.0)); + // TODO: sRGB + for (uint i = 0; i < CHUNK; i++) { + imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(rgb[i], 1.0)); + } } diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index d9dacc0..5215e2f 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h index 133b47a..dd1f9a8 100644 --- a/piet-gpu/shader/ptcl.h +++ b/piet-gpu/shader/ptcl.h @@ -36,6 +36,14 @@ struct CmdRef { uint offset; }; +struct SegmentRef { + uint offset; +}; + +struct SegChunkRef { + uint offset; +}; + struct CmdCircle { vec2 center; float radius; @@ -60,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) { } struct CmdStroke { - uint seg_ref; + SegChunkRef seg_ref; float half_width; uint rgba_color; }; @@ -72,7 +80,7 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) { } struct CmdFill { - uint seg_ref; + SegChunkRef seg_ref; int backdrop; uint rgba_color; }; @@ -141,6 +149,30 @@ CmdRef Cmd_index(CmdRef ref, uint index) { return CmdRef(ref.offset + index * Cmd_size); } +struct Segment { + vec2 start; + vec2 end; + float y_edge; +}; + +#define Segment_size 20 + +SegmentRef Segment_index(SegmentRef ref, uint index) { + return SegmentRef(ref.offset + index * Segment_size); +} + +struct SegChunk { + uint n; + SegChunkRef next; + SegmentRef segs; +}; + +#define SegChunk_size 12 + +SegChunkRef SegChunk_index(SegChunkRef ref, uint index) { + return SegChunkRef(ref.offset + index * SegChunk_size); +} + CmdCircle CmdCircle_read(CmdCircleRef ref) { uint ix = ref.offset >> 2; uint raw0 = ptcl[ix + 0]; @@ -188,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { uint raw1 = ptcl[ix + 1]; uint raw2 = ptcl[ix + 2]; CmdStroke s; - s.seg_ref = raw0; + s.seg_ref = SegChunkRef(raw0); s.half_width = uintBitsToFloat(raw1); s.rgba_color = raw2; return s; @@ -196,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.seg_ref; + ptcl[ix + 0] = s.seg_ref.offset; ptcl[ix + 1] = floatBitsToUint(s.half_width); ptcl[ix + 2] = s.rgba_color; } @@ -207,7 +239,7 @@ CmdFill CmdFill_read(CmdFillRef ref) { uint raw1 = ptcl[ix + 1]; uint raw2 = ptcl[ix + 2]; CmdFill s; - s.seg_ref = raw0; + s.seg_ref = SegChunkRef(raw0); s.backdrop = int(raw1); s.rgba_color = raw2; return s; @@ -215,7 +247,7 @@ CmdFill CmdFill_read(CmdFillRef ref) { void CmdFill_write(CmdFillRef ref, CmdFill s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.seg_ref; + ptcl[ix + 0] = s.seg_ref.offset; ptcl[ix + 1] = uint(s.backdrop); ptcl[ix + 2] = s.rgba_color; } @@ -362,3 +394,45 @@ void Cmd_Bail_write(CmdRef ref) { ptcl[ref.offset >> 2] = Cmd_Bail; } +Segment Segment_read(SegmentRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 3]; + uint raw4 = ptcl[ix + 4]; + Segment s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.y_edge = uintBitsToFloat(raw4); + return s; +} + +void Segment_write(SegmentRef ref, Segment s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.start.x); + ptcl[ix + 1] = floatBitsToUint(s.start.y); + ptcl[ix + 2] = floatBitsToUint(s.end.x); + ptcl[ix + 3] = floatBitsToUint(s.end.y); + ptcl[ix + 4] = floatBitsToUint(s.y_edge); +} + +SegChunk 
SegChunk_read(SegChunkRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + SegChunk s; + s.n = raw0; + s.next = SegChunkRef(raw1); + s.segs = SegmentRef(raw2); + return s; +} + +void SegChunk_write(SegChunkRef ref, SegChunk s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = s.n; + ptcl[ix + 1] = s.next.offset; + ptcl[ix + 2] = s.segs.offset; +} + diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h index 5e36abc..5bb879b 100644 --- a/piet-gpu/shader/scene.h +++ b/piet-gpu/shader/scene.h @@ -32,6 +32,38 @@ struct PietItemRef { uint offset; }; +struct LineSegRef { + uint offset; +}; + +struct QuadSegRef { + uint offset; +}; + +struct CubicSegRef { + uint offset; +}; + +struct FillRef { + uint offset; +}; + +struct StrokeRef { + uint offset; +}; + +struct SetLineWidthRef { + uint offset; +}; + +struct TransformRef { + uint offset; +}; + +struct ElementRef { + uint offset; +}; + struct Bbox { ivec4 bbox; }; @@ -128,6 +160,98 @@ PietItemRef PietItem_index(PietItemRef ref, uint index) { return PietItemRef(ref.offset + index * PietItem_size); } +struct LineSeg { + vec2 p0; + vec2 p1; +}; + +#define LineSeg_size 16 + +LineSegRef LineSeg_index(LineSegRef ref, uint index) { + return LineSegRef(ref.offset + index * LineSeg_size); +} + +struct QuadSeg { + vec2 p0; + vec2 p1; + vec2 p2; +}; + +#define QuadSeg_size 24 + +QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) { + return QuadSegRef(ref.offset + index * QuadSeg_size); +} + +struct CubicSeg { + vec2 p0; + vec2 p1; + vec2 p2; + vec2 p3; +}; + +#define CubicSeg_size 32 + +CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) { + return CubicSegRef(ref.offset + index * CubicSeg_size); +} + +struct Fill { + uint rgba_color; +}; + +#define Fill_size 4 + +FillRef Fill_index(FillRef ref, uint index) { + return FillRef(ref.offset + index * Fill_size); +} + +struct Stroke { + uint rgba_color; +}; + +#define Stroke_size 4 + +StrokeRef Stroke_index(StrokeRef ref, uint index) { + return StrokeRef(ref.offset + index * Stroke_size); +} + +struct SetLineWidth { + float width; +}; + +#define SetLineWidth_size 4 + +SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) { + return SetLineWidthRef(ref.offset + index * SetLineWidth_size); +} + +struct Transform { + vec4 mat; + vec2 translate; +}; + +#define Transform_size 24 + +TransformRef Transform_index(TransformRef ref, uint index) { + return TransformRef(ref.offset + index * Transform_size); +} + +#define Element_Nop 0 +#define Element_StrokeLine 1 +#define Element_FillLine 2 +#define Element_Quad 3 +#define Element_Cubic 4 +#define Element_Stroke 5 +#define Element_Fill 6 +#define Element_SetLineWidth 7 +#define Element_Transform 8 +#define Element_size 36 + +ElementRef Element_index(ElementRef ref, uint index) { + return ElementRef(ref.offset + index * Element_size); +} + Bbox Bbox_read(BboxRef ref) { uint ix = ref.offset >> 2; uint raw0 = scene[ix + 0]; @@ -236,3 +360,122 @@ PietStrokePolyLine PietItem_Poly_read(PietItemRef ref) { return PietStrokePolyLine_read(PietStrokePolyLineRef(ref.offset + 4)); } +LineSeg LineSeg_read(LineSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + LineSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +QuadSeg QuadSeg_read(QuadSegRef ref) { + uint ix = ref.offset >> 2; + uint 
raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + QuadSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + return s; +} + +CubicSeg CubicSeg_read(CubicSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + uint raw6 = scene[ix + 6]; + uint raw7 = scene[ix + 7]; + CubicSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + return s; +} + +Fill Fill_read(FillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + Fill s; + s.rgba_color = raw0; + return s; +} + +Stroke Stroke_read(StrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + Stroke s; + s.rgba_color = raw0; + return s; +} + +SetLineWidth SetLineWidth_read(SetLineWidthRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + SetLineWidth s; + s.width = uintBitsToFloat(raw0); + return s; +} + +Transform Transform_read(TransformRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + Transform s; + s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + return s; +} + +uint Element_tag(ElementRef ref) { + return scene[ref.offset >> 2]; +} + +LineSeg Element_StrokeLine_read(ElementRef ref) { + return LineSeg_read(LineSegRef(ref.offset + 4)); +} + +LineSeg Element_FillLine_read(ElementRef ref) { + return LineSeg_read(LineSegRef(ref.offset + 4)); +} + +QuadSeg Element_Quad_read(ElementRef ref) { + return QuadSeg_read(QuadSegRef(ref.offset + 4)); +} + +CubicSeg Element_Cubic_read(ElementRef ref) { + return CubicSeg_read(CubicSegRef(ref.offset + 4)); +} + +Stroke Element_Stroke_read(ElementRef ref) { + return Stroke_read(StrokeRef(ref.offset + 4)); +} + +Fill Element_Fill_read(ElementRef ref) { + return Fill_read(FillRef(ref.offset + 4)); +} + +SetLineWidth Element_SetLineWidth_read(ElementRef ref) { + return SetLineWidth_read(SetLineWidthRef(ref.offset + 4)); +} + +Transform Element_Transform_read(ElementRef ref) { + return Transform_read(TransformRef(ref.offset + 4)); +} + diff --git a/piet-gpu/shader/segment.h b/piet-gpu/shader/segment.h deleted file mode 100644 index 2843b64..0000000 --- a/piet-gpu/shader/segment.h +++ /dev/null @@ -1,126 +0,0 @@ -// Code auto-generated by piet-gpu-derive - -struct TileHeaderRef { - uint offset; -}; - -struct ItemHeaderRef { - uint offset; -}; - -struct SegmentRef { - uint offset; -}; - -struct SegChunkRef { - uint offset; -}; - -struct TileHeader { - uint n; - ItemHeaderRef items; -}; - -#define TileHeader_size 8 - -TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) { - return TileHeaderRef(ref.offset + index * TileHeader_size); -} - -struct ItemHeader { - SegChunkRef segments; -}; - -#define ItemHeader_size 4 - -ItemHeaderRef ItemHeader_index(ItemHeaderRef 
ref, uint index) { - return ItemHeaderRef(ref.offset + index * ItemHeader_size); -} - -struct Segment { - vec2 start; - vec2 end; -}; - -#define Segment_size 16 - -SegmentRef Segment_index(SegmentRef ref, uint index) { - return SegmentRef(ref.offset + index * Segment_size); -} - -struct SegChunk { - uint n; - SegChunkRef next; -}; - -#define SegChunk_size 8 - -SegChunkRef SegChunk_index(SegChunkRef ref, uint index) { - return SegChunkRef(ref.offset + index * SegChunk_size); -} - -TileHeader TileHeader_read(TileHeaderRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = segment[ix + 0]; - uint raw1 = segment[ix + 1]; - TileHeader s; - s.n = raw0; - s.items = ItemHeaderRef(raw1); - return s; -} - -void TileHeader_write(TileHeaderRef ref, TileHeader s) { - uint ix = ref.offset >> 2; - segment[ix + 0] = s.n; - segment[ix + 1] = s.items.offset; -} - -ItemHeader ItemHeader_read(ItemHeaderRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = segment[ix + 0]; - ItemHeader s; - s.segments = SegChunkRef(raw0); - return s; -} - -void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) { - uint ix = ref.offset >> 2; - segment[ix + 0] = s.segments.offset; -} - -Segment Segment_read(SegmentRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = segment[ix + 0]; - uint raw1 = segment[ix + 1]; - uint raw2 = segment[ix + 2]; - uint raw3 = segment[ix + 3]; - Segment s; - s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); - s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); - return s; -} - -void Segment_write(SegmentRef ref, Segment s) { - uint ix = ref.offset >> 2; - segment[ix + 0] = floatBitsToUint(s.start.x); - segment[ix + 1] = floatBitsToUint(s.start.y); - segment[ix + 2] = floatBitsToUint(s.end.x); - segment[ix + 3] = floatBitsToUint(s.end.y); -} - -SegChunk SegChunk_read(SegChunkRef ref) { - uint ix = ref.offset >> 2; - uint raw0 = segment[ix + 0]; - uint raw1 = segment[ix + 1]; - SegChunk s; - s.n = raw0; - s.next = SegChunkRef(raw1); - return s; -} - -void SegChunk_write(SegChunkRef ref, SegChunk s) { - uint ix = ref.offset >> 2; - segment[ix + 0] = s.n; - segment[ix + 1] = s.next.offset; -} - diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 3d9cd53..b913086 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -39,4 +39,26 @@ // Maximum number of segments in a SegChunk #define SEG_CHUNK_N 32 -#define SEG_CHUNK_ALLOC 512 \ No newline at end of file +#define SEG_CHUNK_ALLOC 512 + +// Stuff for new algorithm follows; some of the above should get +// deleted. + +// These should probably be renamed and/or reworked. In the binning +// kernel, they represent the number of bins. Also, the workgroup size +// of that kernel is equal to the number of bins, but should probably +// be more flexible (it's 512 in the K&L paper). +#define N_TILE_X 16 +#define N_TILE_Y 16 +#define N_TILE (N_TILE_X * N_TILE_Y) +#define LG_N_TILE 8 +#define N_SLICE (N_TILE / 32) +// Number of workgroups for binning kernel +#define N_WG 16 + +// This is the ratio of the number of elements in a binning workgroup +// over the number of elements in a partition workgroup. 
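+// For example, with the partition size of 128 (WG_SIZE * N_ROWS in
+// elements.comp) and a ratio of 2, each binning workgroup covers 256
+// elements.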
+#define ELEMENT_BINNING_RATIO 2 + +#define BIN_INITIAL_ALLOC 64 +#define BIN_ALLOC 256 diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h new file mode 100644 index 0000000..2547b93 --- /dev/null +++ b/piet-gpu/shader/state.h @@ -0,0 +1,59 @@ +// Code auto-generated by piet-gpu-derive + +struct StateRef { + uint offset; +}; + +struct State { + vec4 mat; + vec2 translate; + vec4 bbox; + float linewidth; + uint flags; +}; + +#define State_size 48 + +StateRef State_index(StateRef ref, uint index) { + return StateRef(ref.offset + index * State_size); +} + +State State_read(StateRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = state[ix + 0]; + uint raw1 = state[ix + 1]; + uint raw2 = state[ix + 2]; + uint raw3 = state[ix + 3]; + uint raw4 = state[ix + 4]; + uint raw5 = state[ix + 5]; + uint raw6 = state[ix + 6]; + uint raw7 = state[ix + 7]; + uint raw8 = state[ix + 8]; + uint raw9 = state[ix + 9]; + uint raw10 = state[ix + 10]; + uint raw11 = state[ix + 11]; + State s; + s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9)); + s.linewidth = uintBitsToFloat(raw10); + s.flags = raw11; + return s; +} + +void State_write(StateRef ref, State s) { + uint ix = ref.offset >> 2; + state[ix + 0] = floatBitsToUint(s.mat.x); + state[ix + 1] = floatBitsToUint(s.mat.y); + state[ix + 2] = floatBitsToUint(s.mat.z); + state[ix + 3] = floatBitsToUint(s.mat.w); + state[ix + 4] = floatBitsToUint(s.translate.x); + state[ix + 5] = floatBitsToUint(s.translate.y); + state[ix + 6] = floatBitsToUint(s.bbox.x); + state[ix + 7] = floatBitsToUint(s.bbox.y); + state[ix + 8] = floatBitsToUint(s.bbox.z); + state[ix + 9] = floatBitsToUint(s.bbox.w); + state[ix + 10] = floatBitsToUint(s.linewidth); + state[ix + 11] = s.flags; +} + diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index a47737a..3ec7e1d 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -1,5 +1,5 @@ -mod render_ctx; mod pico_svg; +mod render_ctx; pub use render_ctx::PietGpuRenderContext; @@ -8,6 +8,8 @@ use rand::{Rng, RngCore}; use piet::kurbo::{BezPath, Circle, Line, Point, Vec2}; use piet::{Color, RenderContext}; +use piet_gpu_types::encoder::Encode; + use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout, MemFlags}; use pico_svg::PicoSvg; @@ -28,7 +30,20 @@ const PTCL_INITIAL_ALLOC: usize = 1024; const K2_PER_TILE_SIZE: usize = 8; -const N_CIRCLES: usize = 1; +const N_CIRCLES: usize = 0; + +const N_WG: u32 = 16; + +pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) { + let xml_str = std::fs::read_to_string(filename).unwrap(); + let start = std::time::Instant::now(); + let svg = PicoSvg::load(&xml_str, scale).unwrap(); + println!("parsing time: {:?}", start.elapsed()); + + let start = std::time::Instant::now(); + svg.render(rc); + println!("flattening and encoding time: {:?}", start.elapsed()); +} pub fn render_scene(rc: &mut impl RenderContext) { let mut rng = rand::thread_rng(); @@ -42,12 +57,14 @@ pub fn render_scene(rc: &mut impl RenderContext) { let circle = Circle::new(center, radius); rc.fill(circle, &color); } + /* let mut path = BezPath::new(); path.move_to((100.0, 1150.0)); path.line_to((200.0, 1200.0)); path.line_to((150.0, 1250.0)); path.close_path(); rc.fill(path, &Color::rgb8(128, 0, 128)); + */ rc.stroke( Line::new((100.0, 100.0), (200.0, 150.0)), &Color::WHITE, 
@@ -59,7 +76,7 @@ pub fn render_scene(rc: &mut impl RenderContext) { #[allow(unused)] fn render_cardioid(rc: &mut impl RenderContext) { - let n = 91; + let n = 601; let dth = std::f64::consts::PI * 2.0 / (n as f64); let center = Point::new(1024.0, 768.0); let r = 750.0; @@ -67,7 +84,7 @@ fn render_cardioid(rc: &mut impl RenderContext) { for i in 1..n { let p0 = center + Vec2::from_angle(i as f64 * dth) * r; let p1 = center + Vec2::from_angle(((i * 2) % n) as f64 * dth) * r; - rc.fill(&Circle::new(p0, 8.0), &Color::WHITE); + //rc.fill(&Circle::new(p0, 8.0), &Color::WHITE); path.move_to(p0); path.line_to(p1); //rc.stroke(Line::new(p0, p1), &Color::BLACK, 2.0); @@ -96,10 +113,10 @@ fn dump_scene(buf: &[u8]) { } #[allow(unused)] -fn dump_k1_data(k1_buf: &[u32]) { +pub fn dump_k1_data(k1_buf: &[u32]) { for i in 0..k1_buf.len() { if k1_buf[i] != 0 { - println!("{:4x}: {:8x}", i, k1_buf[i]); + println!("{:4x}: {:8x}", i * 4, k1_buf[i]); } } } @@ -110,27 +127,30 @@ pub struct Renderer { scene_buf: D::Buffer, scene_dev: D::Buffer, - k1_alloc_buf_host: D::Buffer, - k1_alloc_buf_dev: D::Buffer, - k2s_alloc_buf_host: D::Buffer, - k2s_alloc_buf_dev: D::Buffer, - k2f_alloc_buf_host: D::Buffer, - k2f_alloc_buf_dev: D::Buffer, - k3_alloc_buf_host: D::Buffer, - k3_alloc_buf_dev: D::Buffer, - tilegroup_buf: D::Buffer, - ptcl_buf: D::Buffer, + pub state_buf: D::Buffer, + pub anno_buf: D::Buffer, + pub bin_buf: D::Buffer, + pub ptcl_buf: D::Buffer, + + el_pipeline: D::Pipeline, + el_ds: D::DescriptorSet, + + bin_pipeline: D::Pipeline, + bin_ds: D::DescriptorSet, + + bin_alloc_buf_host: D::Buffer, + bin_alloc_buf_dev: D::Buffer, + + coarse_pipeline: D::Pipeline, + coarse_ds: D::DescriptorSet, + + coarse_alloc_buf_host: D::Buffer, + coarse_alloc_buf_dev: D::Buffer, - k1_pipeline: D::Pipeline, - k1_ds: D::DescriptorSet, - k2s_pipeline: D::Pipeline, - k2s_ds: D::DescriptorSet, - k2f_pipeline: D::Pipeline, - k2f_ds: D::DescriptorSet, - k3_pipeline: D::Pipeline, - k3_ds: D::DescriptorSet, k4_pipeline: D::Pipeline, k4_ds: D::DescriptorSet, + + n_elements: usize, } impl Renderer { @@ -138,6 +158,9 @@ impl Renderer { let host = MemFlags::host_coherent(); let dev = MemFlags::device_local(); + let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); + println!("scene: {} elements", n_elements); + let scene_buf = device .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host) .unwrap(); @@ -146,174 +169,121 @@ impl Renderer { .unwrap(); device.write_buffer(&scene_buf, &scene)?; - let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?; + let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?; + let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?; + let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?; - let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?; - let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; - let k1_alloc_buf_host = device.create_buffer(4, host)?; - let k1_alloc_buf_dev = device.create_buffer(4, dev)?; - let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE; - device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?; - let k1_code = include_bytes!("../shader/kernel1.spv"); - let k1_pipeline = device - .create_simple_compute_pipeline(k1_code, 3, 0)?; - let k1_ds = device - .create_descriptor_set( - &k1_pipeline, - &[&scene_dev, &tilegroup_buf, 
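The dump_k1_data change above multiplies the word index by 4 so the printed addresses are byte offsets, the same unit Refs use, which makes the dump directly comparable to Ref values. A standalone sketch of that debug helper (`dump_words` is a hypothetical name):

```rust
// Print nonzero words of a mapped buffer, keyed by byte offset rather
// than word index, so the left column lines up with Ref offsets.
pub fn dump_words(buf: &[u32]) {
    for (i, &word) in buf.iter().enumerate() {
        if word != 0 {
            println!("{:4x}: {:8x}", i * 4, word);
        }
    }
}
```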
&k1_alloc_buf_dev], - &[], - )?; + let el_code = include_bytes!("../shader/elements.spv"); + let el_pipeline = device.create_simple_compute_pipeline(el_code, 3, 0)?; + let el_ds = device.create_descriptor_set( + &el_pipeline, + &[&scene_dev, &state_buf, &anno_buf], + &[], + )?; - let k2s_alloc_buf_host = device.create_buffer(4, host)?; - let k2s_alloc_buf_dev = device.create_buffer(4, dev)?; - let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE; - device - .write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32]) - ?; - let k2s_code = include_bytes!("../shader/kernel2s.spv"); - let k2s_pipeline = device - .create_simple_compute_pipeline(k2s_code, 4, 0) - ?; - let k2s_ds = device - .create_descriptor_set( - &k2s_pipeline, - &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev], - &[], - ) - ?; + let bin_alloc_buf_host = device.create_buffer(12, host)?; + let bin_alloc_buf_dev = device.create_buffer(12, dev)?; - let k2f_alloc_buf_host = device.create_buffer(4, host)?; - let k2f_alloc_buf_dev = device.create_buffer(4, dev)?; - let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE; - device - .write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32]) - ?; - let k2f_code = include_bytes!("../shader/kernel2f.spv"); - let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?; - let k2f_ds = device - .create_descriptor_set( - &k2f_pipeline, - &[ - &scene_dev, - &tilegroup_buf, - &fill_seg_buf, - &k2f_alloc_buf_dev, - ], - &[], - ) - ?; + // TODO: constants + let bin_alloc_start = ((n_elements + 255) & !255) * 8; + device.write_buffer( + &bin_alloc_buf_host, + &[n_elements as u32, 0, bin_alloc_start as u32], + )?; + let bin_code = include_bytes!("../shader/binning.spv"); + let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?; + let bin_ds = device.create_descriptor_set( + &bin_pipeline, + &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf], + &[], + )?; - let k3_alloc_buf_host = device.create_buffer(4, host)?; - let k3_alloc_buf_dev = device.create_buffer(4, dev)?; - let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; - device - .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32]) - ?; - let k3_code = include_bytes!("../shader/kernel3.spv"); - let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?; - let k3_ds = device - .create_descriptor_set( - &k3_pipeline, - &[ - &scene_dev, - &tilegroup_buf, - &segment_buf, - &fill_seg_buf, - &ptcl_buf, - &k3_alloc_buf_dev, - ], - &[], - ) - ?; + let coarse_alloc_buf_host = device.create_buffer(8, host)?; + let coarse_alloc_buf_dev = device.create_buffer(8, dev)?; + + let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; + device.write_buffer( + &coarse_alloc_buf_host, + &[n_elements as u32, coarse_alloc_start as u32], + )?; + let coarse_code = include_bytes!("../shader/coarse.spv"); + let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?; + let coarse_ds = device.create_descriptor_set( + &coarse_pipeline, + &[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf], + &[], + )?; let k4_code = include_bytes!("../shader/kernel4.spv"); - let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?; - let k4_ds = device - .create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev]) - ?; + let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?; + let k4_ds = device.create_descriptor_set(&k4_pipeline, 
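The small host-written "alloc" buffers seed each kernel with its element count and the start of its free space. A sketch of the 12-byte binning variant, following the write in Renderer::new; the meaning of the zero word is not spelled out in the diff and is assumed here to be a counter the kernel bumps:

```rust
// Word 0: element count. Word 1: initialized to zero (assumed to be a
// kernel-side counter). Word 2: where dynamic allocation starts, after
// a fixed region of 8 bytes per element slot, with the element count
// rounded up to the 256-element binning workgroup size.
fn bin_alloc_words(n_elements: u32) -> [u32; 3] {
    let bin_alloc_start = ((n_elements + 255) & !255) * 8;
    [n_elements, 0, bin_alloc_start]
}
```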
&[&ptcl_buf], &[&image_dev])?; Ok(Renderer { scene_buf, scene_dev, image_dev, - k1_alloc_buf_host, - k1_alloc_buf_dev, - k2s_alloc_buf_host, - k2s_alloc_buf_dev, - k2f_alloc_buf_host, - k2f_alloc_buf_dev, - k3_alloc_buf_host, - k3_alloc_buf_dev, - tilegroup_buf, - ptcl_buf, - k1_pipeline, - k1_ds, - k2s_pipeline, - k2s_ds, - k2f_pipeline, - k2f_ds, - k3_pipeline, - k3_ds, + el_pipeline, + el_ds, + bin_pipeline, + bin_ds, + coarse_pipeline, + coarse_ds, k4_pipeline, k4_ds, + state_buf, + anno_buf, + bin_buf, + ptcl_buf, + bin_alloc_buf_host, + bin_alloc_buf_dev, + coarse_alloc_buf_host, + coarse_alloc_buf_dev, + n_elements, }) } pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf, query_pool: &D::QueryPool) { cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev); - // Note: we could use one alloc buf and reuse it. But we'll stick with - // multiple ones for clarity. - cmd_buf.copy_buffer(&self.k1_alloc_buf_host, &self.k1_alloc_buf_dev); - cmd_buf.copy_buffer(&self.k2s_alloc_buf_host, &self.k2s_alloc_buf_dev); - cmd_buf.copy_buffer(&self.k2f_alloc_buf_host, &self.k2f_alloc_buf_dev); - cmd_buf.copy_buffer(&self.k3_alloc_buf_host, &self.k3_alloc_buf_dev); - // Note: these clears aren't necessary, and are here to make inspection - // of the buffers cleaner. Can likely be removed. - cmd_buf.clear_buffer(&self.tilegroup_buf); - cmd_buf.clear_buffer(&self.ptcl_buf); + cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev); + cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev); + cmd_buf.clear_buffer(&self.state_buf); cmd_buf.memory_barrier(); - cmd_buf.image_barrier(&self.image_dev, ImageLayout::Undefined, ImageLayout::General); + cmd_buf.image_barrier( + &self.image_dev, + ImageLayout::Undefined, + ImageLayout::General, + ); cmd_buf.reset_query_pool(&query_pool); cmd_buf.write_timestamp(&query_pool, 0); cmd_buf.dispatch( - &self.k1_pipeline, - &self.k1_ds, - ((WIDTH / 512) as u32, (HEIGHT / 512) as u32, 1), + &self.el_pipeline, + &self.el_ds, + (((self.n_elements + 127) / 128) as u32, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); cmd_buf.dispatch( - &self.k2s_pipeline, - &self.k2s_ds, - ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1), + &self.bin_pipeline, + &self.bin_ds, + (((self.n_elements + 255) / 256) as u32, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 2); - // Note: this barrier is not necessary (k2f does not depend on - // k2s output), but I'm keeping it here to increase transparency - // of performance. 
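record() brackets each dispatch with write_timestamp calls, now indices 0 through 4 after the removal of the k3 stage, so adjacent differences give per-stage GPU times. A sketch of that reduction, assuming the timestamps have already been read back from the query pool and converted to seconds (the readback API is outside this diff):

```rust
// Reduce the five timestamps written in record() to per-stage
// durations: elements, binning, coarse, fine.
fn stage_durations(ts: &[f64]) -> Vec<f64> {
    ts.windows(2).map(|w| w[1] - w[0]).collect()
}

fn main() {
    let ts = [0.0, 0.8e-3, 1.1e-3, 2.0e-3, 3.5e-3]; // made-up sample values
    println!("{:?}", stage_durations(&ts)); // [elements, binning, coarse, fine]
}
```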
cmd_buf.memory_barrier(); cmd_buf.dispatch( - &self.k2f_pipeline, - &self.k2f_ds, - ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 2), + &self.coarse_pipeline, + &self.coarse_ds, + (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1), ); cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); - cmd_buf.dispatch( - &self.k3_pipeline, - &self.k3_ds, - ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 3), - ); - cmd_buf.write_timestamp(&query_pool, 4); - cmd_buf.memory_barrier(); cmd_buf.dispatch( &self.k4_pipeline, &self.k4_ds, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); - cmd_buf.write_timestamp(&query_pool, 5); + cmd_buf.write_timestamp(&query_pool, 4); cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs index a4c92d0..140c42d 100644 --- a/piet-gpu/src/pico_svg.rs +++ b/piet-gpu/src/pico_svg.rs @@ -2,7 +2,7 @@ use std::str::FromStr; -use roxmltree::Document; +use roxmltree::{Document, Node}; use piet::kurbo::{Affine, BezPath}; @@ -28,27 +28,19 @@ pub struct FillItem { path: BezPath, } +struct Parser<'a> { + scale: f64, + items: &'a mut Vec, +} + impl PicoSvg { pub fn load(xml_string: &str, scale: f64) -> Result> { let doc = Document::parse(xml_string)?; let root = doc.root_element(); - let g = root.first_element_child().ok_or("no root element")?; let mut items = Vec::new(); - for el in g.children() { - if el.is_element() { - let d = el.attribute("d").ok_or("missing 'd' attribute")?; - let bp = BezPath::from_svg(d)?; - let path = Affine::scale(scale) * bp; - if let Some(fill_color) = el.attribute("fill") { - let color = parse_color(fill_color); - items.push(Item::Fill(FillItem { color, path: path.clone() })); - } - if let Some(stroke_color) = el.attribute("stroke") { - let width = f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?; - let color = parse_color(stroke_color); - items.push(Item::Stroke(StrokeItem { width, color, path })); - } - } + let mut parser = Parser::new(&mut items, scale); + for node in root.children() { + parser.rec_parse(node)?; } Ok(PicoSvg { items }) } @@ -58,6 +50,7 @@ impl PicoSvg { match item { Item::Fill(fill_item) => { rc.fill(&fill_item.path, &fill_item.color); + //rc.stroke(&fill_item.path, &fill_item.color, 1.0); } Item::Stroke(stroke_item) => { rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width); @@ -67,6 +60,59 @@ impl PicoSvg { } } +impl<'a> Parser<'a> { + fn new(items: &'a mut Vec, scale: f64) -> Parser<'a> { + Parser { scale, items } + } + + fn rec_parse(&mut self, node: Node) -> Result<(), Box> { + let transform = if self.scale >= 0.0 { + Affine::scale(self.scale) + } else { + Affine::new([-self.scale, 0.0, 0.0, self.scale, 0.0, 1536.0]) + }; + if node.is_element() { + match node.tag_name().name() { + "g" => { + for child in node.children() { + self.rec_parse(child)?; + } + } + "path" => { + let d = node.attribute("d").ok_or("missing 'd' attribute")?; + let bp = BezPath::from_svg(d)?; + let path = transform * bp; + // TODO: default fill color is black, but this is overridden in tiger to this logic. 
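The transform at the top of rec_parse doubles as a vertical flip. Extracted standalone: a negative scale argument selects a matrix whose x coefficient is |scale|, whose y coefficient is negative (mirroring vertically), and whose y translation of 1536.0 (presumably the target height in pixels) brings the flipped content back into view:

```rust
use piet::kurbo::Affine;

// Mirror of the scale/flip selection in rec_parse.
fn svg_transform(scale: f64) -> Affine {
    if scale >= 0.0 {
        Affine::scale(scale)
    } else {
        // [-scale, 0, 0, scale, 0, 1536]: scale x by |scale|, flip y,
        // translate y down by the assumed 1536-pixel target height.
        Affine::new([-scale, 0.0, 0.0, scale, 0.0, 1536.0])
    }
}
```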
+ if let Some(fill_color) = node.attribute("fill") { + if fill_color != "none" { + let color = parse_color(fill_color); + let color = modify_opacity(color, "fill-opacity", node); + self.items.push(Item::Fill(FillItem { + color, + path: path.clone(), + })); + } + } + if let Some(stroke_color) = node.attribute("stroke") { + if stroke_color != "none" { + let width = self.scale.abs() + * f64::from_str( + node.attribute("stroke-width").ok_or("missing width")?, + )?; + let color = parse_color(stroke_color); + let color = modify_opacity(color, "stroke-opacity", node); + self.items + .push(Item::Stroke(StrokeItem { width, color, path })); + } + } + } + _ => (), + } + } + Ok(()) + } +} + fn parse_color(color: &str) -> Color { if color.as_bytes()[0] == b'#' { let mut hex = u32::from_str_radix(&color[1..], 16).unwrap(); @@ -74,7 +120,27 @@ fn parse_color(color: &str) -> Color { hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11; } Color::from_rgba32_u32((hex << 8) + 0xff) + } else if color.starts_with("rgb(") { + let mut iter = color[4..color.len() - 1].split(','); + let r = u8::from_str(iter.next().unwrap()).unwrap(); + let g = u8::from_str(iter.next().unwrap()).unwrap(); + let b = u8::from_str(iter.next().unwrap()).unwrap(); + Color::rgb8(r, g, b) } else { Color::from_rgba32_u32(0xff00ff80) } } + +fn modify_opacity(color: Color, attr_name: &str, node: Node) -> Color { + if let Some(opacity) = node.attribute(attr_name) { + let alpha = if opacity.ends_with("%") { + let pctg = opacity[..opacity.len() - 1].parse().unwrap_or(100.0); + pctg * 0.01 + } else { + opacity.parse().unwrap_or(1.0) + }; + color.with_alpha(alpha) + } else { + color + } +} diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 6367301..da234de 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -2,7 +2,11 @@ use std::borrow::Cow; use piet_gpu_types::encoder::{Encode, Encoder, Ref}; use piet_gpu_types::scene; -use piet_gpu_types::scene::{Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup}; +use piet_gpu_types::scene::{ + Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup, +}; + +use piet_gpu_types::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke}; use piet::kurbo::{Affine, PathEl, Point, Rect, Shape}; @@ -27,10 +31,10 @@ pub struct PietGpuText; pub struct PietGpuRenderContext { encoder: Encoder, - bboxes: Vec, - items: Vec, + elements: Vec, // Will probably need direct accesss to hal Device to create images etc. 
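parse_color's shorthand branch expands 3-digit hex colors by duplicating each nibble. A worked standalone check of that arithmetic:

```rust
// Each nibble of a 3-digit color is duplicated: 0xabc -> 0xaabbcc,
// because 0xa * 0x110000 + 0xb * 0x1100 + 0xc * 0x11 = 0xaabbcc.
fn expand_short_hex(hex: u32) -> u32 {
    (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11
}

fn main() {
    assert_eq!(expand_short_hex(0xabc), 0xaabbcc);
    assert_eq!(expand_short_hex(0xfff), 0xffffff);
}
```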
inner_text: PietGpuText, + stroke_width: f32, } #[derive(Clone)] @@ -43,47 +47,22 @@ const TOLERANCE: f64 = 0.25; impl PietGpuRenderContext { pub fn new() -> PietGpuRenderContext { - let mut encoder = Encoder::new(); - let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32); - let bboxes = Vec::new(); - let items = Vec::new(); + let encoder = Encoder::new(); + let elements = Vec::new(); let inner_text = PietGpuText; + let stroke_width = 0.0; PietGpuRenderContext { encoder, - bboxes, - items, + elements, inner_text, + stroke_width, } } pub fn get_scene_buf(&mut self) -> &[u8] { - let n_items = self.bboxes.len() as u32; - let bboxes = self.bboxes.encode(&mut self.encoder).transmute(); - let items = self.items.encode(&mut self.encoder).transmute(); - let offset = scene::Point { xy: [0.0, 0.0] }; - let simple_group = SimpleGroup { - n_items, - bboxes, - items, - offset, - }; - let root_item = PietItem::Group(simple_group); - root_item.encode_to(&mut self.encoder.buf_mut()[0..PietItem::fixed_size()]); + self.elements.encode(&mut self.encoder); self.encoder.buf() } - - fn push_item(&mut self, item: PietItem, bbox: Rect) { - let scene_bbox = Bbox { - bbox: [ - bbox.x0.floor() as i16, - bbox.y0.floor() as i16, - bbox.x1.ceil() as i16, - bbox.y1.ceil() as i16, - ], - }; - self.items.push(item); - self.bboxes.push(scene_bbox); - } } impl RenderContext for PietGpuRenderContext { @@ -107,20 +86,19 @@ impl RenderContext for PietGpuRenderContext { fn clear(&mut self, _color: Color) {} fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush, width: f64) { - let bbox = shape.bounding_box(); - let brush = brush.make_brush(self, || bbox).into_owned(); + let width = width as f32; + if self.stroke_width != width { + self.elements + .push(Element::SetLineWidth(SetLineWidth { width })); + self.stroke_width = width; + } + let brush = brush.make_brush(self, || shape.bounding_box()).into_owned(); let path = shape.to_bez_path(TOLERANCE); - let (n_points, points) = flatten_shape(&mut self.encoder, path); + self.encode_path(path, false); match brush { PietGpuBrush::Solid(rgba_color) => { - let poly_line = PietStrokePolyLine { - rgba_color, - width: width as f32, - n_points, - points, - }; - let bbox = bbox.inset(-0.5 * width); - self.push_item(PietItem::Poly(poly_line), bbox); + let stroke = Stroke { rgba_color }; + self.elements.push(Element::Stroke(stroke)); } _ => (), } @@ -136,35 +114,13 @@ impl RenderContext for PietGpuRenderContext { } fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush) { - let bbox = shape.bounding_box(); let brush = brush.make_brush(self, || shape.bounding_box()).into_owned(); - - if let Some(circle) = shape.as_circle() { - match brush { - PietGpuBrush::Solid(rgba_color) => { - let piet_circle = PietCircle { - rgba_color, - center: to_scene_point(circle.center), - radius: circle.radius as f32, - }; - let bbox = circle.bounding_box(); - self.push_item(PietItem::Circle(piet_circle), bbox); - } - _ => {} - } - return; - } let path = shape.to_bez_path(TOLERANCE); - let (n_points, points) = flatten_shape(&mut self.encoder, path); + self.encode_path(path, true); match brush { PietGpuBrush::Solid(rgba_color) => { - let fill = PietFill { - flags: 0, - rgba_color, - n_points, - points, - }; - self.push_item(PietItem::Fill(fill), bbox); + let fill = Fill { rgba_color }; + self.elements.push(Element::Fill(fill)); } _ => (), } @@ -241,45 +197,110 @@ impl RenderContext for PietGpuRenderContext { } } -fn flatten_shape( - encoder: &mut Encoder, - path: impl Iterator, -) -> 
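The reworked stroke() emits a SetLineWidth element only when the width actually changes, so a run of strokes at one width shares a single state element. A minimal standalone model of that deduplication, with illustrative stand-ins for the generated scene types:

```rust
#[derive(Debug, PartialEq)]
enum Element {
    SetLineWidth(f32),
    Stroke(u32), // rgba color
}

struct Ctx {
    elements: Vec<Element>,
    stroke_width: f32,
}

impl Ctx {
    fn stroke(&mut self, width: f32, rgba: u32) {
        // Only push a width change when it differs from the last one.
        if self.stroke_width != width {
            self.elements.push(Element::SetLineWidth(width));
            self.stroke_width = width;
        }
        // ...the path's segments would be encoded here...
        self.elements.push(Element::Stroke(rgba));
    }
}
```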
(u32, Ref) { - let mut points = Vec::new(); - let mut start_pt = None; - let mut last_pt = None; - piet::kurbo::flatten(path, TOLERANCE, |el| { - match el { - PathEl::MoveTo(p) => { - let scene_pt = to_scene_point(p); - start_pt = Some(clone_scene_pt(&scene_pt)); - if !points.is_empty() { - points.push(scene::Point { - xy: [std::f32::NAN, std::f32::NAN], - }); +impl PietGpuRenderContext { + fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) { + if is_fill { + self.elements.push(Element::FillLine(seg)); + } else { + self.elements.push(Element::StrokeLine(seg)); + } + } + + fn encode_path(&mut self, path: impl Iterator, is_fill: bool) { + let flatten = true; + if flatten { + let mut start_pt = None; + let mut last_pt = None; + piet::kurbo::flatten(path, TOLERANCE, |el| { + match el { + PathEl::MoveTo(p) => { + let scene_pt = to_f32_2(p); + start_pt = Some(scene_pt); + last_pt = Some(scene_pt); + } + PathEl::LineTo(p) => { + let scene_pt = to_f32_2(p); + let seg = LineSeg { + p0: last_pt.unwrap(), + p1: scene_pt, + }; + self.encode_line_seg(seg, is_fill); + last_pt = Some(scene_pt); + } + PathEl::ClosePath => { + if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { + if last != start { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.encode_line_seg(seg, is_fill); + } + } + } + _ => (), } - last_pt = Some(clone_scene_pt(&scene_pt)); - points.push(scene_pt); - } - PathEl::LineTo(p) => { - let scene_pt = to_scene_point(p); - last_pt = Some(clone_scene_pt(&scene_pt)); - points.push(scene_pt); - } - PathEl::ClosePath => { - if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { - if start.xy != last.xy { - points.push(start); + //println!("{:?}", el); + }); + } else { + let mut start_pt = None; + let mut last_pt = None; + for el in path { + match el { + PathEl::MoveTo(p) => { + let scene_pt = to_f32_2(p); + start_pt = Some(scene_pt); + last_pt = Some(scene_pt); + } + PathEl::LineTo(p) => { + let scene_pt = to_f32_2(p); + let seg = LineSeg { + p0: last_pt.unwrap(), + p1: scene_pt, + }; + self.encode_line_seg(seg, is_fill); + last_pt = Some(scene_pt); + } + PathEl::QuadTo(p1, p2) => { + let scene_p1 = to_f32_2(p1); + let scene_p2 = to_f32_2(p2); + let seg = QuadSeg { + p0: last_pt.unwrap(), + p1: scene_p1, + p2: scene_p2, + }; + self.elements.push(Element::Quad(seg)); + last_pt = Some(scene_p2); + } + PathEl::CurveTo(p1, p2, p3) => { + let scene_p1 = to_f32_2(p1); + let scene_p2 = to_f32_2(p2); + let scene_p3 = to_f32_2(p3); + let seg = CubicSeg { + p0: last_pt.unwrap(), + p1: scene_p1, + p2: scene_p2, + p3: scene_p3, + }; + self.elements.push(Element::Cubic(seg)); + last_pt = Some(scene_p3); + } + PathEl::ClosePath => { + if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { + if last != start { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.encode_line_seg(seg, is_fill); + } + } } } + //println!("{:?}", el); } - _ => (), } - //println!("{:?}", el); - }); - let n_points = points.len() as u32; - let points_ref = points.encode(encoder).transmute(); - (n_points, points_ref) + } } impl Text for PietGpuText { @@ -360,13 +381,6 @@ impl IntoBrush for PietGpuBrush { } } -fn to_scene_point(point: Point) -> scene::Point { - scene::Point { - xy: [point.x as f32, point.y as f32], - } -} - -// TODO: allow #[derive(Clone)] in piet-gpu-derive. -fn clone_scene_pt(p: &scene::Point) -> scene::Point { - scene::Point { xy: p.xy } +fn to_f32_2(point: Point) -> [f32; 2] { + [point.x as f32, point.y as f32] }
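Both branches of encode_path share the same ClosePath rule: a closing line segment is emitted only when the subpath's last point differs from its start, so paths that already end at their start point don't get a degenerate zero-length segment. A tiny standalone version of that check:

```rust
// Returns the (p0, p1) endpoints of the closing segment, or None when
// the subpath is already closed.
fn closing_segment(start: [f32; 2], last: [f32; 2]) -> Option<([f32; 2], [f32; 2])> {
    if last != start { Some((last, start)) } else { None }
}

fn main() {
    assert_eq!(closing_segment([0.0, 0.0], [0.0, 0.0]), None); // already closed
    assert_eq!(
        closing_segment([0.0, 0.0], [1.0, 0.0]),
        Some(([1.0, 0.0], [0.0, 0.0]))
    );
}
```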