diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs index d85df70..288f71c 100644 --- a/piet-gpu-types/src/lib.rs +++ b/piet-gpu-types/src/lib.rs @@ -3,5 +3,6 @@ pub mod fill_seg; pub mod ptcl; pub mod scene; pub mod segment; +pub mod state; pub mod test; pub mod tilegroup; diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs index c0b9d7e..033bec4 100644 --- a/piet-gpu-types/src/main.rs +++ b/piet-gpu-types/src/main.rs @@ -5,6 +5,7 @@ fn main() { .expect("provide a module name"); match mod_name.as_str() { "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), + "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()), "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()), "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()), diff --git a/piet-gpu-types/src/scene.rs b/piet-gpu-types/src/scene.rs index 5f95c40..7451c9c 100644 --- a/piet-gpu-types/src/scene.rs +++ b/piet-gpu-types/src/scene.rs @@ -4,6 +4,8 @@ pub use self::scene::{ Bbox, PietCircle, PietFill, PietItem, PietStrokeLine, PietStrokePolyLine, Point, SimpleGroup, }; +pub use self::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform}; + piet_gpu! { #[rust_encode] mod scene { @@ -51,5 +53,46 @@ piet_gpu! 
{ Fill(PietFill), Poly(PietStrokePolyLine), } + + // New approach follows (above to be deleted) + struct LineSeg { + p0: [f32; 2], + p1: [f32; 2], + } + struct QuadSeg { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + } + struct CubicSeg { + p0: [f32; 2], + p1: [f32; 2], + p2: [f32; 2], + p3: [f32; 2], + } + struct Fill { + rgba_color: u32, + } + struct Stroke { + rgba_color: u32, + } + struct SetLineWidth { + width: f32, + } + struct Transform { + mat: [f32; 4], + translate: [f32; 2], + } + enum Element { + Nop, + // The segments need a flag to indicate fill/stroke + Line(LineSeg), + Quad(QuadSeg), + Cubic(CubicSeg), + Stroke(Stroke), + Fill(Fill), + SetLineWidth(SetLineWidth), + Transform(Transform), + } } } diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs new file mode 100644 index 0000000..35076f0 --- /dev/null +++ b/piet-gpu-types/src/state.rs @@ -0,0 +1,14 @@ +use piet_gpu_derive::piet_gpu; + +piet_gpu! { + #[gpu_write] + mod state { + struct State { + mat: [f32; 4], + translate: [f32; 2], + bbox: [f32; 4], + linewidth: f32, + flags: u32, + } + } +} diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 839c262..82f3491 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -5,7 +5,7 @@ use std::path::Path; use piet_gpu_hal::vulkan::VkInstance; use piet_gpu_hal::{CmdBuf, Device, Error, MemFlags}; -use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT}; +use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH}; #[allow(unused)] fn dump_scene(buf: &[u8]) { @@ -16,6 +16,24 @@ fn dump_scene(buf: &[u8]) { } } +#[allow(unused)] +fn dump_state(buf: &[u8]) { + for i in 0..(buf.len() / 48) { + let j = i * 48; + let floats = (0..11).map(|k| { + let mut buf_f32 = [0u8; 4]; + buf_f32.copy_from_slice(&buf[j + k * 4..j + k * 4 + 4]); + f32::from_le_bytes(buf_f32) + }).collect::>(); + println!("{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}", + i, + floats[0], floats[1], floats[2], floats[3], 
floats[4], floats[5], + floats[6], floats[7], floats[8], floats[9], + floats[10], buf[j + 44]); + } + +} + fn main() -> Result<(), Error> { let (instance, _) = VkInstance::new(None)?; unsafe { @@ -23,7 +41,7 @@ fn main() -> Result<(), Error> { let fence = device.create_fence(false)?; let mut cmd_buf = device.create_cmd_buf()?; - let query_pool = device.create_query_pool(6)?; + let query_pool = device.create_query_pool(2)?; let mut ctx = PietGpuRenderContext::new(); render_scene(&mut ctx); @@ -31,7 +49,8 @@ fn main() -> Result<(), Error> { //dump_scene(&scene); let renderer = Renderer::new(&device, scene)?; - let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?; + let image_buf = + device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?; cmd_buf.begin(); renderer.record(&mut cmd_buf, &query_pool); @@ -40,28 +59,12 @@ fn main() -> Result<(), Error> { device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?; device.wait_and_reset(&[fence])?; let timestamps = device.reap_query_pool(&query_pool).unwrap(); - println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); - println!( - "Kernel 2s time: {:.3}ms", - (timestamps[1] - timestamps[0]) * 1e3 - ); - println!( - "Kernel 2f time: {:.3}ms", - (timestamps[2] - timestamps[1]) * 1e3 - ); - println!( - "Kernel 3 time: {:.3}ms", - (timestamps[3] - timestamps[2]) * 1e3 - ); - println!( - "Render time: {:.3}ms", - (timestamps[4] - timestamps[3]) * 1e3 - ); + println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3); /* - let mut k1_data: Vec = Default::default(); - device.read_buffer(&segment_buf, &mut k1_data).unwrap(); - dump_k1_data(&k1_data); + let mut data: Vec = Default::default(); + device.read_buffer(&renderer.state_buf, &mut data).unwrap(); + dump_state(&data); */ let mut img_data: Vec = Default::default(); diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs index e5f174a..1c263bb 100644 --- a/piet-gpu/bin/winit.rs +++ b/piet-gpu/bin/winit.rs @@ -1,7 
+1,7 @@ use piet_gpu_hal::vulkan::VkInstance; use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout}; -use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT}; +use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH}; use winit::{ event::{Event, WindowEvent}, @@ -69,7 +69,8 @@ fn main() -> Result<(), Error> { device.wait_and_reset(&[frame_fences[frame_idx]]).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap(); - window.set_title(&format!("k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms", + window.set_title(&format!( + "k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms", timestamps[0] * 1e3, (timestamps[1] - timestamps[0]) * 1e3, (timestamps[2] - timestamps[1]) * 1e3, @@ -93,11 +94,7 @@ fn main() -> Result<(), Error> { ImageLayout::BlitDst, ); cmd_buf.blit_image(&renderer.image_dev, &swap_image); - cmd_buf.image_barrier( - &swap_image, - ImageLayout::BlitDst, - ImageLayout::Present, - ); + cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present); cmd_buf.finish(); device diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 0aaecae..b429a71 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -18,3 +18,6 @@ build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h + + +build elements.spv: glsl elements.comp | scene.h state.h diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp new file mode 100644 index 0000000..5cede7c --- /dev/null +++ b/piet-gpu/shader/elements.comp @@ -0,0 +1,173 @@ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#define N_ROWS 4 +#define WG_SIZE 32 +#define LG_WG_SIZE 5 +#define TILE_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 
1) in; + +layout(set = 0, binding = 0) readonly buffer SceneBuf { + uint[] scene; +}; + +// This will be used for inter-workgroup aggregates +layout(set = 0, binding = 1) buffer StateBuf { + uint[] state; +}; + +#include "scene.h" +#include "state.h" + +#define FLAG_SET_LINEWIDTH 1 +#define FLAG_RESET_BBOX 2 + +// This is almost like a monoid (the interaction between transformation and +// bounding boxes is approximate) +State combine_state(State a, State b) { + State c; + c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; + c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; + c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; + c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; + if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { + c.bbox = a.bbox; + } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) { + c.bbox.xy = min(a.bbox.xy, c.bbox.xy); + c.bbox.zw = max(a.bbox.zw, c.bbox.zw); + } + // It would be more concise to cast to matrix types; ah well. + c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y; + c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y; + c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w; + c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w; + c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x; + c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y; + c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; + c.flags = a.flags | b.flags; + return c; +} + +State map_element(ElementRef ref) { + // TODO: it would *probably* be more efficient to make the memory read patterns less + // divergent, though it would be more wasted memory. 
+ uint tag = Element_tag(ref); + State c; + c.bbox = vec4(0.0, 0.0, 0.0, 0.0); + c.mat = vec4(1.0, 0.0, 0.0, 1.0); + c.translate = vec2(0.0, 0.0); + c.linewidth = 0.0; + c.flags = 0; + switch (tag) { + case Element_Line: + LineSeg line = Element_Line_read(ref); + c.bbox.xy = min(line.p0, line.p1); + c.bbox.zw = max(line.p0, line.p1); + break; + case Element_Quad: + QuadSeg quad = Element_Quad_read(ref); + c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2); + c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2); + break; + case Element_Cubic: + CubicSeg cubic = Element_Cubic_read(ref); + c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3)); + c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); + break; + case Element_Fill: + case Element_Stroke: + c.flags = FLAG_RESET_BBOX; + break; + case Element_SetLineWidth: + SetLineWidth lw = Element_SetLineWidth_read(ref); + c.linewidth = lw.width; + c.flags = FLAG_SET_LINEWIDTH; + break; + case Element_Transform: + Transform t = Element_Transform_read(ref); + c.mat = t.mat; + c.translate = t.translate; + break; + } + return c; +} + +// We should be able to use an array of structs but the NV shader compiler +// doesn't seem to like it :/ +//shared State sh_state[WG_SIZE]; +shared vec4 sh_mat[WG_SIZE]; +shared vec2 sh_translate[WG_SIZE]; +shared vec4 sh_bbox[WG_SIZE]; +shared float sh_width[WG_SIZE]; +shared uint sh_flags[WG_SIZE]; + +void main() { + State th_state[N_ROWS]; + // this becomes an atomic counter + uint tile_ix = gl_WorkGroupID.x; + + uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS; + ElementRef ref = ElementRef(ix * Element_size); + + th_state[0] = map_element(ref); + for (uint i = 1; i < N_ROWS; i++) { + // discussion question: would it be faster to load using more coherent patterns + // into thread memory? This is kinda strided. 
+ th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i))); + } + State agg = th_state[N_ROWS - 1]; + sh_mat[gl_LocalInvocationID.x] = agg.mat; + sh_translate[gl_LocalInvocationID.x] = agg.translate; + sh_bbox[gl_LocalInvocationID.x] = agg.bbox; + sh_width[gl_LocalInvocationID.x] = agg.linewidth; + sh_flags[gl_LocalInvocationID.x] = agg.flags; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1 << i)) { + State other; + uint ix = gl_LocalInvocationID.x - (1 << i); + other.mat = sh_mat[ix]; + other.translate = sh_translate[ix]; + other.bbox = sh_bbox[ix]; + other.linewidth = sh_width[ix]; + other.flags = sh_flags[ix]; + agg = combine_state(other, agg); + } + barrier(); + sh_mat[gl_LocalInvocationID.x] = agg.mat; + sh_translate[gl_LocalInvocationID.x] = agg.translate; + sh_bbox[gl_LocalInvocationID.x] = agg.bbox; + sh_width[gl_LocalInvocationID.x] = agg.linewidth; + sh_flags[gl_LocalInvocationID.x] = agg.flags; + } + + // TODO: if last invocation in wg, publish agg. + + barrier(); + State exclusive; + exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0); + exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0); + exclusive.translate = vec2(0.0, 0.0); + exclusive.linewidth = 0.0; + exclusive.flags = 0; + // TODO: do decoupled look-back + + State row = exclusive; + if (gl_LocalInvocationID.x > 0) { + uint ix = gl_LocalInvocationID.x - 1; + State other; + other.mat = sh_mat[ix]; + other.translate = sh_translate[ix]; + other.bbox = sh_bbox[ix]; + other.linewidth = sh_width[ix]; + other.flags = sh_flags[ix]; + row = combine_state(row, other); + } + for (uint i = 0; i < N_ROWS; i++) { + State this_state = combine_state(row, th_state[i]); + // We write the state now for development purposes, but the + // actual goal is to write transformed and annotated elements. 
+ State_write(StateRef((ix + i) * State_size), this_state); + } +} diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv new file mode 100644 index 0000000..e97226c Binary files /dev/null and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h index 5e36abc..84ef80d 100644 --- a/piet-gpu/shader/scene.h +++ b/piet-gpu/shader/scene.h @@ -32,6 +32,38 @@ struct PietItemRef { uint offset; }; +struct LineSegRef { + uint offset; +}; + +struct QuadSegRef { + uint offset; +}; + +struct CubicSegRef { + uint offset; +}; + +struct FillRef { + uint offset; +}; + +struct StrokeRef { + uint offset; +}; + +struct SetLineWidthRef { + uint offset; +}; + +struct TransformRef { + uint offset; +}; + +struct ElementRef { + uint offset; +}; + struct Bbox { ivec4 bbox; }; @@ -128,6 +160,97 @@ PietItemRef PietItem_index(PietItemRef ref, uint index) { return PietItemRef(ref.offset + index * PietItem_size); } +struct LineSeg { + vec2 p0; + vec2 p1; +}; + +#define LineSeg_size 16 + +LineSegRef LineSeg_index(LineSegRef ref, uint index) { + return LineSegRef(ref.offset + index * LineSeg_size); +} + +struct QuadSeg { + vec2 p0; + vec2 p1; + vec2 p2; +}; + +#define QuadSeg_size 24 + +QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) { + return QuadSegRef(ref.offset + index * QuadSeg_size); +} + +struct CubicSeg { + vec2 p0; + vec2 p1; + vec2 p2; + vec2 p3; +}; + +#define CubicSeg_size 32 + +CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) { + return CubicSegRef(ref.offset + index * CubicSeg_size); +} + +struct Fill { + uint rgba_color; +}; + +#define Fill_size 4 + +FillRef Fill_index(FillRef ref, uint index) { + return FillRef(ref.offset + index * Fill_size); +} + +struct Stroke { + uint rgba_color; +}; + +#define Stroke_size 4 + +StrokeRef Stroke_index(StrokeRef ref, uint index) { + return StrokeRef(ref.offset + index * Stroke_size); +} + +struct SetLineWidth { + float width; +}; + +#define SetLineWidth_size 4 + 
+SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) { + return SetLineWidthRef(ref.offset + index * SetLineWidth_size); +} + +struct Transform { + vec4 mat; + vec2 translate; +}; + +#define Transform_size 24 + +TransformRef Transform_index(TransformRef ref, uint index) { + return TransformRef(ref.offset + index * Transform_size); +} + +#define Element_Nop 0 +#define Element_Line 1 +#define Element_Quad 2 +#define Element_Cubic 3 +#define Element_Stroke 4 +#define Element_Fill 5 +#define Element_SetLineWidth 6 +#define Element_Transform 7 +#define Element_size 36 + +ElementRef Element_index(ElementRef ref, uint index) { + return ElementRef(ref.offset + index * Element_size); +} + Bbox Bbox_read(BboxRef ref) { uint ix = ref.offset >> 2; uint raw0 = scene[ix + 0]; @@ -236,3 +359,118 @@ PietStrokePolyLine PietItem_Poly_read(PietItemRef ref) { return PietStrokePolyLine_read(PietStrokePolyLineRef(ref.offset + 4)); } +LineSeg LineSeg_read(LineSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + LineSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +QuadSeg QuadSeg_read(QuadSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + QuadSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + return s; +} + +CubicSeg CubicSeg_read(CubicSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + uint raw6 = 
scene[ix + 6]; + uint raw7 = scene[ix + 7]; + CubicSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + return s; +} + +Fill Fill_read(FillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + Fill s; + s.rgba_color = raw0; + return s; +} + +Stroke Stroke_read(StrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + Stroke s; + s.rgba_color = raw0; + return s; +} + +SetLineWidth SetLineWidth_read(SetLineWidthRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + SetLineWidth s; + s.width = uintBitsToFloat(raw0); + return s; +} + +Transform Transform_read(TransformRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + Transform s; + s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + return s; +} + +uint Element_tag(ElementRef ref) { + return scene[ref.offset >> 2]; +} + +LineSeg Element_Line_read(ElementRef ref) { + return LineSeg_read(LineSegRef(ref.offset + 4)); +} + +QuadSeg Element_Quad_read(ElementRef ref) { + return QuadSeg_read(QuadSegRef(ref.offset + 4)); +} + +CubicSeg Element_Cubic_read(ElementRef ref) { + return CubicSeg_read(CubicSegRef(ref.offset + 4)); +} + +Stroke Element_Stroke_read(ElementRef ref) { + return Stroke_read(StrokeRef(ref.offset + 4)); +} + +Fill Element_Fill_read(ElementRef ref) { + return Fill_read(FillRef(ref.offset + 4)); +} + +SetLineWidth Element_SetLineWidth_read(ElementRef ref) { + return SetLineWidth_read(SetLineWidthRef(ref.offset + 4)); +} + +Transform Element_Transform_read(ElementRef ref) { + 
return Transform_read(TransformRef(ref.offset + 4)); +} + diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h new file mode 100644 index 0000000..2547b93 --- /dev/null +++ b/piet-gpu/shader/state.h @@ -0,0 +1,59 @@ +// Code auto-generated by piet-gpu-derive + +struct StateRef { + uint offset; +}; + +struct State { + vec4 mat; + vec2 translate; + vec4 bbox; + float linewidth; + uint flags; +}; + +#define State_size 48 + +StateRef State_index(StateRef ref, uint index) { + return StateRef(ref.offset + index * State_size); +} + +State State_read(StateRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = state[ix + 0]; + uint raw1 = state[ix + 1]; + uint raw2 = state[ix + 2]; + uint raw3 = state[ix + 3]; + uint raw4 = state[ix + 4]; + uint raw5 = state[ix + 5]; + uint raw6 = state[ix + 6]; + uint raw7 = state[ix + 7]; + uint raw8 = state[ix + 8]; + uint raw9 = state[ix + 9]; + uint raw10 = state[ix + 10]; + uint raw11 = state[ix + 11]; + State s; + s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9)); + s.linewidth = uintBitsToFloat(raw10); + s.flags = raw11; + return s; +} + +void State_write(StateRef ref, State s) { + uint ix = ref.offset >> 2; + state[ix + 0] = floatBitsToUint(s.mat.x); + state[ix + 1] = floatBitsToUint(s.mat.y); + state[ix + 2] = floatBitsToUint(s.mat.z); + state[ix + 3] = floatBitsToUint(s.mat.w); + state[ix + 4] = floatBitsToUint(s.translate.x); + state[ix + 5] = floatBitsToUint(s.translate.y); + state[ix + 6] = floatBitsToUint(s.bbox.x); + state[ix + 7] = floatBitsToUint(s.bbox.y); + state[ix + 8] = floatBitsToUint(s.bbox.z); + state[ix + 9] = floatBitsToUint(s.bbox.w); + state[ix + 10] = floatBitsToUint(s.linewidth); + state[ix + 11] = s.flags; +} + diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 
a47737a..82b20c8 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -1,5 +1,5 @@ -mod render_ctx; mod pico_svg; +mod render_ctx; pub use render_ctx::PietGpuRenderContext; @@ -8,6 +8,8 @@ use rand::{Rng, RngCore}; use piet::kurbo::{BezPath, Circle, Line, Point, Vec2}; use piet::{Color, RenderContext}; +use piet_gpu_types::encoder::Encode; + use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout, MemFlags}; use pico_svg::PicoSvg; @@ -110,6 +112,12 @@ pub struct Renderer { scene_buf: D::Buffer, scene_dev: D::Buffer, + pub state_buf: D::Buffer, + + el_pipeline: D::Pipeline, + el_ds: D::DescriptorSet, + + /* k1_alloc_buf_host: D::Buffer, k1_alloc_buf_dev: D::Buffer, k2s_alloc_buf_host: D::Buffer, @@ -131,6 +139,8 @@ pub struct Renderer { k3_ds: D::DescriptorSet, k4_pipeline: D::Pipeline, k4_ds: D::DescriptorSet, + */ + n_elements: usize, } impl Renderer { @@ -146,175 +156,123 @@ impl Renderer { .unwrap(); device.write_buffer(&scene_buf, &scene)?; + let state_buf = device.create_buffer(4 * 1024 * 1024, dev)?; + let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; + + let el_code = include_bytes!("../shader/elements.spv"); + let el_pipeline = device.create_simple_compute_pipeline(el_code, 2, 0)?; + let el_ds = device.create_descriptor_set( + &el_pipeline, + &[&scene_dev, &state_buf], + &[], + )?; + + let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); + println!("scene: {} elements", n_elements); + + /* let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?; let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?; let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?; - let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; let k1_alloc_buf_host = device.create_buffer(4, host)?; let k1_alloc_buf_dev = device.create_buffer(4, dev)?; let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * 
TILEGROUP_STRIDE; device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?; let k1_code = include_bytes!("../shader/kernel1.spv"); - let k1_pipeline = device - .create_simple_compute_pipeline(k1_code, 3, 0)?; - let k1_ds = device - .create_descriptor_set( - &k1_pipeline, - &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev], - &[], - )?; + let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?; + let k1_ds = device.create_descriptor_set( + &k1_pipeline, + &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev], + &[], + )?; let k2s_alloc_buf_host = device.create_buffer(4, host)?; let k2s_alloc_buf_dev = device.create_buffer(4, dev)?; let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE; - device - .write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32]) - ?; + device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?; let k2s_code = include_bytes!("../shader/kernel2s.spv"); - let k2s_pipeline = device - .create_simple_compute_pipeline(k2s_code, 4, 0) - ?; - let k2s_ds = device - .create_descriptor_set( - &k2s_pipeline, - &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev], - &[], - ) - ?; + let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?; + let k2s_ds = device.create_descriptor_set( + &k2s_pipeline, + &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev], + &[], + )?; let k2f_alloc_buf_host = device.create_buffer(4, host)?; let k2f_alloc_buf_dev = device.create_buffer(4, dev)?; let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE; - device - .write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32]) - ?; + device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?; let k2f_code = include_bytes!("../shader/kernel2f.spv"); let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?; - let k2f_ds = device - .create_descriptor_set( - &k2f_pipeline, - &[ - &scene_dev, - &tilegroup_buf, - &fill_seg_buf, - 
&k2f_alloc_buf_dev, - ], - &[], - ) - ?; + let k2f_ds = device.create_descriptor_set( + &k2f_pipeline, + &[ + &scene_dev, + &tilegroup_buf, + &fill_seg_buf, + &k2f_alloc_buf_dev, + ], + &[], + )?; let k3_alloc_buf_host = device.create_buffer(4, host)?; let k3_alloc_buf_dev = device.create_buffer(4, dev)?; let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; - device - .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32]) - ?; + device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?; let k3_code = include_bytes!("../shader/kernel3.spv"); let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?; - let k3_ds = device - .create_descriptor_set( - &k3_pipeline, - &[ - &scene_dev, - &tilegroup_buf, - &segment_buf, - &fill_seg_buf, - &ptcl_buf, - &k3_alloc_buf_dev, - ], - &[], - ) - ?; + let k3_ds = device.create_descriptor_set( + &k3_pipeline, + &[ + &scene_dev, + &tilegroup_buf, + &segment_buf, + &fill_seg_buf, + &ptcl_buf, + &k3_alloc_buf_dev, + ], + &[], + )?; let k4_code = include_bytes!("../shader/kernel4.spv"); let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?; - let k4_ds = device - .create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev]) - ?; + let k4_ds = device.create_descriptor_set( + &k4_pipeline, + &[&ptcl_buf, &segment_buf, &fill_seg_buf], + &[&image_dev], + )?; + */ Ok(Renderer { scene_buf, scene_dev, image_dev, - k1_alloc_buf_host, - k1_alloc_buf_dev, - k2s_alloc_buf_host, - k2s_alloc_buf_dev, - k2f_alloc_buf_host, - k2f_alloc_buf_dev, - k3_alloc_buf_host, - k3_alloc_buf_dev, - tilegroup_buf, - ptcl_buf, - k1_pipeline, - k1_ds, - k2s_pipeline, - k2s_ds, - k2f_pipeline, - k2f_ds, - k3_pipeline, - k3_ds, - k4_pipeline, - k4_ds, + el_pipeline, + el_ds, + state_buf, + n_elements, }) } pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf, query_pool: &D::QueryPool) { cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev); - // Note: we could use 
one alloc buf and reuse it. But we'll stick with - // multiple ones for clarity. - cmd_buf.copy_buffer(&self.k1_alloc_buf_host, &self.k1_alloc_buf_dev); - cmd_buf.copy_buffer(&self.k2s_alloc_buf_host, &self.k2s_alloc_buf_dev); - cmd_buf.copy_buffer(&self.k2f_alloc_buf_host, &self.k2f_alloc_buf_dev); - cmd_buf.copy_buffer(&self.k3_alloc_buf_host, &self.k3_alloc_buf_dev); - // Note: these clears aren't necessary, and are here to make inspection - // of the buffers cleaner. Can likely be removed. - cmd_buf.clear_buffer(&self.tilegroup_buf); - cmd_buf.clear_buffer(&self.ptcl_buf); cmd_buf.memory_barrier(); - cmd_buf.image_barrier(&self.image_dev, ImageLayout::Undefined, ImageLayout::General); + cmd_buf.image_barrier( + &self.image_dev, + ImageLayout::Undefined, + ImageLayout::General, + ); cmd_buf.reset_query_pool(&query_pool); cmd_buf.write_timestamp(&query_pool, 0); cmd_buf.dispatch( - &self.k1_pipeline, - &self.k1_ds, - ((WIDTH / 512) as u32, (HEIGHT / 512) as u32, 1), + &self.el_pipeline, + &self.el_ds, + ((self.n_elements / 128) as u32, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); - cmd_buf.dispatch( - &self.k2s_pipeline, - &self.k2s_ds, - ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1), - ); - cmd_buf.write_timestamp(&query_pool, 2); - // Note: this barrier is not necessary (k2f does not depend on - // k2s output), but I'm keeping it here to increase transparency - // of performance. 
- cmd_buf.memory_barrier(); - cmd_buf.dispatch( - &self.k2f_pipeline, - &self.k2f_ds, - ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 2), - ); - cmd_buf.write_timestamp(&query_pool, 3); - cmd_buf.memory_barrier(); - cmd_buf.dispatch( - &self.k3_pipeline, - &self.k3_ds, - ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 3), - ); - cmd_buf.write_timestamp(&query_pool, 4); - cmd_buf.memory_barrier(); - cmd_buf.dispatch( - &self.k4_pipeline, - &self.k4_ds, - ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), - ); - cmd_buf.write_timestamp(&query_pool, 5); - cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } } diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs index a4c92d0..4ddf94b 100644 --- a/piet-gpu/src/pico_svg.rs +++ b/piet-gpu/src/pico_svg.rs @@ -41,10 +41,14 @@ impl PicoSvg { let path = Affine::scale(scale) * bp; if let Some(fill_color) = el.attribute("fill") { let color = parse_color(fill_color); - items.push(Item::Fill(FillItem { color, path: path.clone() })); + items.push(Item::Fill(FillItem { + color, + path: path.clone(), + })); } if let Some(stroke_color) = el.attribute("stroke") { - let width = f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?; + let width = + f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?; let color = parse_color(stroke_color); items.push(Item::Stroke(StrokeItem { width, color, path })); } diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 6367301..ad84a60 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -2,7 +2,11 @@ use std::borrow::Cow; use piet_gpu_types::encoder::{Encode, Encoder, Ref}; use piet_gpu_types::scene; -use piet_gpu_types::scene::{Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup}; +use piet_gpu_types::scene::{ + Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup, +}; + +use piet_gpu_types::scene::{CubicSeg, Element, 
Fill, LineSeg, QuadSeg, SetLineWidth, Stroke}; use piet::kurbo::{Affine, PathEl, Point, Rect, Shape}; @@ -27,10 +31,10 @@ pub struct PietGpuText; pub struct PietGpuRenderContext { encoder: Encoder, - bboxes: Vec, - items: Vec, + elements: Vec, // Will probably need direct accesss to hal Device to create images etc. inner_text: PietGpuText, + stroke_width: f32, } #[derive(Clone)] @@ -43,47 +47,22 @@ const TOLERANCE: f64 = 0.25; impl PietGpuRenderContext { pub fn new() -> PietGpuRenderContext { - let mut encoder = Encoder::new(); - let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32); - let bboxes = Vec::new(); - let items = Vec::new(); + let encoder = Encoder::new(); + let elements = Vec::new(); let inner_text = PietGpuText; + let stroke_width = 0.0; PietGpuRenderContext { encoder, - bboxes, - items, + elements, inner_text, + stroke_width, } } pub fn get_scene_buf(&mut self) -> &[u8] { - let n_items = self.bboxes.len() as u32; - let bboxes = self.bboxes.encode(&mut self.encoder).transmute(); - let items = self.items.encode(&mut self.encoder).transmute(); - let offset = scene::Point { xy: [0.0, 0.0] }; - let simple_group = SimpleGroup { - n_items, - bboxes, - items, - offset, - }; - let root_item = PietItem::Group(simple_group); - root_item.encode_to(&mut self.encoder.buf_mut()[0..PietItem::fixed_size()]); + self.elements.encode(&mut self.encoder); self.encoder.buf() } - - fn push_item(&mut self, item: PietItem, bbox: Rect) { - let scene_bbox = Bbox { - bbox: [ - bbox.x0.floor() as i16, - bbox.y0.floor() as i16, - bbox.x1.ceil() as i16, - bbox.y1.ceil() as i16, - ], - }; - self.items.push(item); - self.bboxes.push(scene_bbox); - } } impl RenderContext for PietGpuRenderContext { @@ -107,20 +86,19 @@ impl RenderContext for PietGpuRenderContext { fn clear(&mut self, _color: Color) {} fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush, width: f64) { - let bbox = shape.bounding_box(); - let brush = brush.make_brush(self, || 
bbox).into_owned(); + let width = width as f32; + if self.stroke_width != width { + self.elements + .push(Element::SetLineWidth(SetLineWidth { width })); + self.stroke_width = width; + } + let brush = brush.make_brush(self, || shape.bounding_box()).into_owned(); let path = shape.to_bez_path(TOLERANCE); - let (n_points, points) = flatten_shape(&mut self.encoder, path); + self.encode_path(path); match brush { PietGpuBrush::Solid(rgba_color) => { - let poly_line = PietStrokePolyLine { - rgba_color, - width: width as f32, - n_points, - points, - }; - let bbox = bbox.inset(-0.5 * width); - self.push_item(PietItem::Poly(poly_line), bbox); + let stroke = Stroke { rgba_color }; + self.elements.push(Element::Stroke(stroke)); } _ => (), } @@ -136,35 +114,13 @@ impl RenderContext for PietGpuRenderContext { } fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush) { - let bbox = shape.bounding_box(); let brush = brush.make_brush(self, || shape.bounding_box()).into_owned(); - - if let Some(circle) = shape.as_circle() { - match brush { - PietGpuBrush::Solid(rgba_color) => { - let piet_circle = PietCircle { - rgba_color, - center: to_scene_point(circle.center), - radius: circle.radius as f32, - }; - let bbox = circle.bounding_box(); - self.push_item(PietItem::Circle(piet_circle), bbox); - } - _ => {} - } - return; - } let path = shape.to_bez_path(TOLERANCE); - let (n_points, points) = flatten_shape(&mut self.encoder, path); + self.encode_path(path); match brush { PietGpuBrush::Solid(rgba_color) => { - let fill = PietFill { - flags: 0, - rgba_color, - n_points, - points, - }; - self.push_item(PietItem::Fill(fill), bbox); + let fill = Fill { rgba_color }; + self.elements.push(Element::Fill(fill)); } _ => (), } @@ -241,45 +197,96 @@ impl RenderContext for PietGpuRenderContext { } } -fn flatten_shape( - encoder: &mut Encoder, - path: impl Iterator, -) -> (u32, Ref) { - let mut points = Vec::new(); - let mut start_pt = None; - let mut last_pt = None; - piet::kurbo::flatten(path, 
TOLERANCE, |el| { - match el { - PathEl::MoveTo(p) => { - let scene_pt = to_scene_point(p); - start_pt = Some(clone_scene_pt(&scene_pt)); - if !points.is_empty() { - points.push(scene::Point { - xy: [std::f32::NAN, std::f32::NAN], - }); +impl PietGpuRenderContext { + fn encode_path(&mut self, path: impl Iterator) { + let flatten = false; + if flatten { + let mut start_pt = None; + let mut last_pt = None; + piet::kurbo::flatten(path, TOLERANCE, |el| { + match el { + PathEl::MoveTo(p) => { + let scene_pt = to_f32_2(p); + last_pt = Some(scene_pt); + } + PathEl::LineTo(p) => { + let scene_pt = to_f32_2(p); + let seg = LineSeg { + p0: last_pt.unwrap(), + p1: scene_pt, + }; + self.elements.push(Element::Line(seg)); + last_pt = Some(scene_pt); + } + PathEl::ClosePath => { + if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.elements.push(Element::Line(seg)); + } + } + _ => (), } - last_pt = Some(clone_scene_pt(&scene_pt)); - points.push(scene_pt); - } - PathEl::LineTo(p) => { - let scene_pt = to_scene_point(p); - last_pt = Some(clone_scene_pt(&scene_pt)); - points.push(scene_pt); - } - PathEl::ClosePath => { - if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { - if start.xy != last.xy { - points.push(start); + //println!("{:?}", el); + }); + } else { + let mut start_pt = None; + let mut last_pt = None; + for el in path { + match el { + PathEl::MoveTo(p) => { + let scene_pt = to_f32_2(p); + last_pt = Some(scene_pt); + } + PathEl::LineTo(p) => { + let scene_pt = to_f32_2(p); + let seg = LineSeg { + p0: last_pt.unwrap(), + p1: scene_pt, + }; + self.elements.push(Element::Line(seg)); + last_pt = Some(scene_pt); + } + PathEl::QuadTo(p1, p2) => { + let scene_p1 = to_f32_2(p1); + let scene_p2 = to_f32_2(p2); + let seg = QuadSeg { + p0: last_pt.unwrap(), + p1: scene_p1, + p2: scene_p2, + }; + self.elements.push(Element::Quad(seg)); + last_pt = Some(scene_p2); + } + 
PathEl::CurveTo(p1, p2, p3) => { + let scene_p1 = to_f32_2(p1); + let scene_p2 = to_f32_2(p2); + let scene_p3 = to_f32_2(p3); + let seg = CubicSeg { + p0: last_pt.unwrap(), + p1: scene_p1, + p2: scene_p2, + p3: scene_p3, + }; + self.elements.push(Element::Cubic(seg)); + last_pt = Some(scene_p3); + } + PathEl::ClosePath => { + if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) { + let seg = LineSeg { + p0: last, + p1: start, + }; + self.elements.push(Element::Line(seg)); + } } } + //println!("{:?}", el); } - _ => (), } - //println!("{:?}", el); - }); - let n_points = points.len() as u32; - let points_ref = points.encode(encoder).transmute(); - (n_points, points_ref) + } } impl Text for PietGpuText { @@ -360,13 +367,6 @@ impl IntoBrush for PietGpuBrush { } } -fn to_scene_point(point: Point) -> scene::Point { - scene::Point { - xy: [point.x as f32, point.y as f32], - } -} - -// TODO: allow #[derive(Clone)] in piet-gpu-derive. -fn clone_scene_pt(p: &scene::Point) -> scene::Point { - scene::Point { xy: p.xy } +fn to_f32_2(point: Point) -> [f32; 2] { + [point.x as f32, point.y as f32] }