diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index 698ae43..1e04b8d 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -148,7 +148,7 @@ pub trait CmdBuf { /// This is readily supported in Vulkan, but for portability it is remarkably /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute /// kernel, or organize the code not to need it. - unsafe fn clear_buffer(&self, buffer: &D::Buffer); + unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>); unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer); diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index b40576b..53d575b 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -902,9 +902,10 @@ impl crate::CmdBuf for CmdBuf { ); } - unsafe fn clear_buffer(&self, buffer: &Buffer) { + unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { let device = &self.device.device; - device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, vk::WHOLE_SIZE, 0); + let size = size.unwrap_or(vk::WHOLE_SIZE); + device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0); } unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index a7d5e92..6c98c4b 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -22,13 +22,39 @@ layout(set = 0, binding = 1) buffer TileBuf { uint[] tile; }; -layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image; +layout(set = 0, binding = 2) buffer ClipScratchBuf { + uint[] clip_scratch; +}; + +layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image; #include "ptcl.h" #include "tile.h" #define BLEND_STACK_SIZE 4 +// Layout of clip_scratch buffer: +// [0] is the alloc bump offset (in units of 32 bit words, initially 0) +// Starting at 1 is a sequence of frames. +// Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.
+ +#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX) +#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1) + +shared uint sh_clip_alloc; + +// Allocate a scratch buffer for clipping. Unlike offsets in the rest of the code, +// it counts 32-bit words. +uint alloc_clip_buf(uint link) { + if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) { + uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1; + sh_clip_alloc = alloc; + clip_scratch[alloc + CLIP_LINK_OFFSET] = link; + } + barrier(); + return sh_clip_alloc; +} + // Calculate coverage based on backdrop + coverage of each line segment float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { // Probably better to store as float, but conversion is no doubt cheap. @@ -72,7 +98,9 @@ void main() { vec3 rgb[CHUNK]; float mask[CHUNK]; uint blend_stack[BLEND_STACK_SIZE][CHUNK]; + uint blend_spill = 0; uint blend_sp = 0; + uint clip_tos = 0; for (uint i = 0; i < CHUNK; i++) { rgb[i] = vec3(0.5); mask[i] = 1.0; @@ -142,26 +170,46 @@ void main() { } break; case Cmd_BeginClip: - CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref); - area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref); - for (uint k = 0; k < CHUNK; k++) { - blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0))); - } - blend_sp++; - break; case Cmd_BeginSolidClip: - CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref); - float solid_alpha = begin_solid_clip.alpha; - for (uint k = 0; k < CHUNK; k++) { - blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], solid_alpha)); + uint blend_slot = blend_sp % BLEND_STACK_SIZE; + if (blend_sp == blend_spill + BLEND_STACK_SIZE) { + // spill to scratch buffer + clip_tos = alloc_clip_buf(clip_tos); + uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + for (uint k = 0; k < CHUNK; k++) { + clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k]; + } + blend_spill++; + } + if 
(tag == Cmd_BeginClip) { + CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref); + area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref); + for (uint k = 0; k < CHUNK; k++) { + blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0))); + } + } else { + CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref); + float solid_alpha = begin_solid_clip.alpha; + for (uint k = 0; k < CHUNK; k++) { + blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha)); + } } blend_sp++; break; case Cmd_EndClip: CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref); + blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE; + if (blend_sp == blend_spill) { + uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + for (uint k = 0; k < CHUNK; k++) { + blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY]; + } + clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET]; + blend_spill--; + } blend_sp--; for (uint k = 0; k < CHUNK; k++) { - vec4 rgba = unpackUnorm4x8(blend_stack[blend_sp][k]); + vec4 rgba = unpackUnorm4x8(blend_stack[blend_slot][k]); rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a); } break; diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index a02387a..538e783 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 6eff190..0a6152d 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -73,7 +73,8 @@ pub fn render_scene(rc: &mut impl RenderContext) { 5.0, ); //render_cardioid(rc); - render_tiger(rc); + render_clip_test(rc); + //render_tiger(rc); } #[allow(unused)] @@ -94,6 +95,33 @@ fn render_cardioid(rc: &mut impl RenderContext) { rc.stroke(&path, &Color::BLACK, 2.0); } +#[allow(unused)] +fn render_clip_test(rc: &mut impl RenderContext) { + const N: usize = 16; + const X0: f64 = 50.0; + const Y0: f64 = 50.0; + const X1: f64 = 100.0; 
+ const Y1: f64 = 100.0; + let step = 1.0 / ((N + 1) as f64); + for i in 0..N { + let t = ((i + 1) as f64) * step; + rc.save(); + let mut path = BezPath::new(); + path.move_to((X0, Y0)); + path.line_to((X1, Y0)); + path.line_to((X1, Y0 + t * (Y1 - Y0))); + path.line_to((X1 + t * (X0 - X1), Y1)); + path.line_to((X0, Y1)); + path.close_path(); + rc.clip(path); + } + let rect = piet::kurbo::Rect::new(X0, Y0, X1, Y1); + rc.fill(rect, &Color::BLACK); + for _ in 0..N { + rc.restore(); + } +} + fn render_tiger(rc: &mut impl RenderContext) { let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap(); let start = std::time::Instant::now(); @@ -163,6 +191,8 @@ pub struct Renderer { coarse_alloc_buf_host: hub::Buffer, coarse_alloc_buf_dev: hub::Buffer, + clip_scratch_buf: hub::Buffer, + k4_pipeline: hub::Pipeline, k4_ds: hub::DescriptorSet, @@ -278,6 +308,8 @@ impl Renderer { &[], )?; + let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?; + let mut coarse_alloc_buf_host = session.create_buffer(8, host)?; let coarse_alloc_buf_dev = session.create_buffer(8, dev)?; @@ -298,10 +330,14 @@ impl Renderer { )?; let k4_code = include_bytes!("../shader/kernel4.spv"); - let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 2, 1)?; + let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 3, 1)?; let k4_ds = session.create_descriptor_set( &k4_pipeline, - &[ptcl_buf.vk_buffer(), tile_buf.vk_buffer()], + &[ + ptcl_buf.vk_buffer(), + tile_buf.vk_buffer(), + clip_scratch_buf.vk_buffer(), + ], &[image_dev.vk_image()], )?; @@ -335,6 +371,7 @@ impl Renderer { bin_alloc_buf_dev, coarse_alloc_buf_host, coarse_alloc_buf_dev, + clip_scratch_buf, n_elements, n_paths, n_pathseg, @@ -355,7 +392,8 @@ impl Renderer { self.coarse_alloc_buf_host.vk_buffer(), self.coarse_alloc_buf_dev.vk_buffer(), ); - cmd_buf.clear_buffer(self.state_buf.vk_buffer()); + cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None); + 
cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4)); cmd_buf.memory_barrier(); cmd_buf.image_barrier( self.image_dev.vk_image(),