Scratch buffer for clip stack

We keep a small window of the clip stack in registers in the fine
rasterization kernel, and when that window is exceeded, spill to global
memory, so the clip stack can be unbounded.
This commit is contained in:
Raph Levien 2020-11-21 11:39:23 -08:00
parent 180047da51
commit a60c2dd3c8
5 changed files with 107 additions and 20 deletions

View file

@ -148,7 +148,7 @@ pub trait CmdBuf<D: Device> {
/// This is readily supported in Vulkan, but for portability it is remarkably /// This is readily supported in Vulkan, but for portability it is remarkably
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
/// kernel, or organize the code not to need it. /// kernel, or organize the code not to need it.
unsafe fn clear_buffer(&self, buffer: &D::Buffer); unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer); unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);

View file

@ -902,9 +902,10 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn clear_buffer(&self, buffer: &Buffer) { unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device; let device = &self.device.device;
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, vk::WHOLE_SIZE, 0); let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
} }
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {

View file

@ -22,13 +22,39 @@ layout(set = 0, binding = 1) buffer TileBuf {
uint[] tile; uint[] tile;
}; };
layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image; layout(set = 0, binding = 2) buffer ClipScratchBuf {
uint[] clip_scratch;
};
layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
#include "ptcl.h" #include "ptcl.h"
#include "tile.h" #include "tile.h"
#define BLEND_STACK_SIZE 4 #define BLEND_STACK_SIZE 4
// Layout of clip_scratch buffer:
// [0] is the alloc bump offset (in units of 32-bit words, initially 0).
// Starting at word 1 is a sequence of frames.
// Each frame is TILE_WIDTH_PX * TILE_HEIGHT_PX 32-bit words, followed by one
// word holding a link reference to the frame that was on top before it.
#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
shared uint sh_clip_alloc;
// Allocate a scratch frame for clipping and return the index of its first
// word in clip_scratch. Unlike offsets in the rest of the code, the returned
// value counts 32-bit words.
//
// `link` is written to the new frame's link word (at CLIP_LINK_OFFSET) so the
// caller can later restore the previous top-of-stack from the frame itself.
//
// Only the invocation with local x == 0 and y == 0 performs the allocation;
// the result is handed to the rest of the workgroup through sh_clip_alloc.
// Because this function contains barrier(), every invocation of the workgroup
// must call it together.
//
// NOTE(review): the bump allocation is not checked against the size of
// clip_scratch here -- confirm the host-side buffer is large enough.
uint alloc_clip_buf(uint link) {
    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
        // The +1 skips word 0, which holds the bump offset itself.
        uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1;
        sh_clip_alloc = alloc;
        clip_scratch[alloc + CLIP_LINK_OFFSET] = link;
    }
    // Make the value published by the allocating invocation visible to all.
    barrier();
    uint frame = sh_clip_alloc;
    // Ensure every invocation has read sh_clip_alloc before the allocating
    // invocation can overwrite it in a subsequent call; without this a slow
    // invocation could observe the next frame's offset instead of this one.
    barrier();
    return frame;
}
// Calculate coverage based on backdrop + coverage of each line segment // Calculate coverage based on backdrop + coverage of each line segment
float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
// Probably better to store as float, but conversion is no doubt cheap. // Probably better to store as float, but conversion is no doubt cheap.
@ -72,7 +98,9 @@ void main() {
vec3 rgb[CHUNK]; vec3 rgb[CHUNK];
float mask[CHUNK]; float mask[CHUNK];
uint blend_stack[BLEND_STACK_SIZE][CHUNK]; uint blend_stack[BLEND_STACK_SIZE][CHUNK];
uint blend_spill = 0;
uint blend_sp = 0; uint blend_sp = 0;
uint clip_tos = 0;
for (uint i = 0; i < CHUNK; i++) { for (uint i = 0; i < CHUNK; i++) {
rgb[i] = vec3(0.5); rgb[i] = vec3(0.5);
mask[i] = 1.0; mask[i] = 1.0;
@ -142,26 +170,46 @@ void main() {
} }
break; break;
case Cmd_BeginClip: case Cmd_BeginClip:
CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
}
blend_sp++;
break;
case Cmd_BeginSolidClip: case Cmd_BeginSolidClip:
CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref); uint blend_slot = blend_sp % BLEND_STACK_SIZE;
float solid_alpha = begin_solid_clip.alpha; if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
for (uint k = 0; k < CHUNK; k++) { // spill to scratch buffer
blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], solid_alpha)); clip_tos = alloc_clip_buf(clip_tos);
uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
}
blend_spill++;
}
if (tag == Cmd_BeginClip) {
CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
}
} else {
CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
float solid_alpha = begin_solid_clip.alpha;
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
}
} }
blend_sp++; blend_sp++;
break; break;
case Cmd_EndClip: case Cmd_EndClip:
CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref); CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
if (blend_sp == blend_spill) {
uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
}
clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET];
blend_spill--;
}
blend_sp--; blend_sp--;
for (uint k = 0; k < CHUNK; k++) { for (uint k = 0; k < CHUNK; k++) {
vec4 rgba = unpackUnorm4x8(blend_stack[blend_sp][k]); vec4 rgba = unpackUnorm4x8(blend_stack[blend_slot][k]);
rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a); rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
} }
break; break;

Binary file not shown.

View file

@ -73,7 +73,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
5.0, 5.0,
); );
//render_cardioid(rc); //render_cardioid(rc);
render_tiger(rc); render_clip_test(rc);
//render_tiger(rc);
} }
#[allow(unused)] #[allow(unused)]
@ -94,6 +95,33 @@ fn render_cardioid(rc: &mut impl RenderContext) {
rc.stroke(&path, &Color::BLACK, 2.0); rc.stroke(&path, &Color::BLACK, 2.0);
} }
/// Test scene that nests `N` clips and then fills through all of them.
///
/// Each clip is the square `(X0, Y0)`-`(X1, Y1)` with its bottom-right corner
/// cut off by a diagonal; the diagonal's endpoints slide along the right and
/// bottom edges as the nesting level increases. After the clips are pushed,
/// the full square is filled in black and every saved state is restored.
#[allow(unused)]
fn render_clip_test(rc: &mut impl RenderContext) {
    const N: usize = 16;
    const X0: f64 = 50.0;
    const Y0: f64 = 50.0;
    const X1: f64 = 100.0;
    const Y1: f64 = 100.0;

    let step = 1.0 / ((N + 1) as f64);
    for level in 1..=N {
        // Fraction in (0, 1) that positions this level's diagonal cut.
        let t = (level as f64) * step;
        let cut_on_right = (X1, Y0 + t * (Y1 - Y0));
        let cut_on_bottom = (X1 + t * (X0 - X1), Y1);

        let mut outline = BezPath::new();
        outline.move_to((X0, Y0));
        outline.line_to((X1, Y0));
        outline.line_to(cut_on_right);
        outline.line_to(cut_on_bottom);
        outline.line_to((X0, Y1));
        outline.close_path();

        // One saved state per clip, unwound by the restore loop below.
        rc.save();
        rc.clip(outline);
    }

    rc.fill(piet::kurbo::Rect::new(X0, Y0, X1, Y1), &Color::BLACK);

    for _ in 0..N {
        rc.restore();
    }
}
fn render_tiger(rc: &mut impl RenderContext) { fn render_tiger(rc: &mut impl RenderContext) {
let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap(); let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap();
let start = std::time::Instant::now(); let start = std::time::Instant::now();
@ -163,6 +191,8 @@ pub struct Renderer {
coarse_alloc_buf_host: hub::Buffer, coarse_alloc_buf_host: hub::Buffer,
coarse_alloc_buf_dev: hub::Buffer, coarse_alloc_buf_dev: hub::Buffer,
clip_scratch_buf: hub::Buffer,
k4_pipeline: hub::Pipeline, k4_pipeline: hub::Pipeline,
k4_ds: hub::DescriptorSet, k4_ds: hub::DescriptorSet,
@ -278,6 +308,8 @@ impl Renderer {
&[], &[],
)?; )?;
let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?;
let mut coarse_alloc_buf_host = session.create_buffer(8, host)?; let mut coarse_alloc_buf_host = session.create_buffer(8, host)?;
let coarse_alloc_buf_dev = session.create_buffer(8, dev)?; let coarse_alloc_buf_dev = session.create_buffer(8, dev)?;
@ -298,10 +330,14 @@ impl Renderer {
)?; )?;
let k4_code = include_bytes!("../shader/kernel4.spv"); let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 2, 1)?; let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 3, 1)?;
let k4_ds = session.create_descriptor_set( let k4_ds = session.create_descriptor_set(
&k4_pipeline, &k4_pipeline,
&[ptcl_buf.vk_buffer(), tile_buf.vk_buffer()], &[
ptcl_buf.vk_buffer(),
tile_buf.vk_buffer(),
clip_scratch_buf.vk_buffer(),
],
&[image_dev.vk_image()], &[image_dev.vk_image()],
)?; )?;
@ -335,6 +371,7 @@ impl Renderer {
bin_alloc_buf_dev, bin_alloc_buf_dev,
coarse_alloc_buf_host, coarse_alloc_buf_host,
coarse_alloc_buf_dev, coarse_alloc_buf_dev,
clip_scratch_buf,
n_elements, n_elements,
n_paths, n_paths,
n_pathseg, n_pathseg,
@ -355,7 +392,8 @@ impl Renderer {
self.coarse_alloc_buf_host.vk_buffer(), self.coarse_alloc_buf_host.vk_buffer(),
self.coarse_alloc_buf_dev.vk_buffer(), self.coarse_alloc_buf_dev.vk_buffer(),
); );
cmd_buf.clear_buffer(self.state_buf.vk_buffer()); cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None);
cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4));
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.image_barrier( cmd_buf.image_barrier(
self.image_dev.vk_image(), self.image_dev.vk_image(),