Scratch buffer for clip stack

We keep a small window of the clip stack in registers in the fine
rasterization kernel, and when that window is exceeded, spill to global
memory, so the clip stack can be unbounded.
This commit is contained in:
Raph Levien 2020-11-21 11:39:23 -08:00
parent 180047da51
commit a60c2dd3c8
5 changed files with 107 additions and 20 deletions

View file

@ -148,7 +148,7 @@ pub trait CmdBuf<D: Device> {
/// This is readily supported in Vulkan, but for portability it is remarkably
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
/// kernel, or organize the code not to need it.
unsafe fn clear_buffer(&self, buffer: &D::Buffer);
unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);

View file

@ -902,9 +902,10 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn clear_buffer(&self, buffer: &Buffer) {
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device;
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, vk::WHOLE_SIZE, 0);
let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {

View file

@ -22,13 +22,39 @@ layout(set = 0, binding = 1) buffer TileBuf {
uint[] tile;
};
layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
layout(set = 0, binding = 2) buffer ClipScratchBuf {
uint[] clip_scratch;
};
layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
#include "ptcl.h"
#include "tile.h"
#define BLEND_STACK_SIZE 4
// Layout of clip_scratch buffer:
// [0] is the alloc bump offset (in units of 32 bit words, initially 0)
// Starting at 1 is a sequence of frames.
// Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.
#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
shared uint sh_clip_alloc;
// Allocate a scratch buffer for clipping. Unlike offsets in the rest of the code,
// it counts 32-bit words.
uint alloc_clip_buf(uint link) {
if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1;
sh_clip_alloc = alloc;
clip_scratch[alloc + CLIP_LINK_OFFSET] = link;
}
barrier();
return sh_clip_alloc;
}
// Calculate coverage based on backdrop + coverage of each line segment
float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
// Probably better to store as float, but conversion is no doubt cheap.
@ -72,7 +98,9 @@ void main() {
vec3 rgb[CHUNK];
float mask[CHUNK];
uint blend_stack[BLEND_STACK_SIZE][CHUNK];
uint blend_spill = 0;
uint blend_sp = 0;
uint clip_tos = 0;
for (uint i = 0; i < CHUNK; i++) {
rgb[i] = vec3(0.5);
mask[i] = 1.0;
@ -142,26 +170,46 @@ void main() {
}
break;
case Cmd_BeginClip:
case Cmd_BeginSolidClip:
uint blend_slot = blend_sp % BLEND_STACK_SIZE;
if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
// spill to scratch buffer
clip_tos = alloc_clip_buf(clip_tos);
uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
}
blend_spill++;
}
if (tag == Cmd_BeginClip) {
CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
}
blend_sp++;
break;
case Cmd_BeginSolidClip:
} else {
CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
float solid_alpha = begin_solid_clip.alpha;
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
}
}
blend_sp++;
break;
case Cmd_EndClip:
CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
if (blend_sp == blend_spill) {
uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
for (uint k = 0; k < CHUNK; k++) {
blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
}
clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET];
blend_spill--;
}
blend_sp--;
for (uint k = 0; k < CHUNK; k++) {
vec4 rgba = unpackUnorm4x8(blend_stack[blend_sp][k]);
vec4 rgba = unpackUnorm4x8(blend_stack[blend_slot][k]);
rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
}
break;

Binary file not shown.

View file

@ -73,7 +73,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
5.0,
);
//render_cardioid(rc);
render_tiger(rc);
render_clip_test(rc);
//render_tiger(rc);
}
#[allow(unused)]
@ -94,6 +95,33 @@ fn render_cardioid(rc: &mut impl RenderContext) {
rc.stroke(&path, &Color::BLACK, 2.0);
}
#[allow(unused)]
fn render_clip_test(rc: &mut impl RenderContext) {
const N: usize = 16;
const X0: f64 = 50.0;
const Y0: f64 = 50.0;
const X1: f64 = 100.0;
const Y1: f64 = 100.0;
let step = 1.0 / ((N + 1) as f64);
for i in 0..N {
let t = ((i + 1) as f64) * step;
rc.save();
let mut path = BezPath::new();
path.move_to((X0, Y0));
path.line_to((X1, Y0));
path.line_to((X1, Y0 + t * (Y1 - Y0)));
path.line_to((X1 + t * (X0 - X1), Y1));
path.line_to((X0, Y1));
path.close_path();
rc.clip(path);
}
let rect = piet::kurbo::Rect::new(X0, Y0, X1, Y1);
rc.fill(rect, &Color::BLACK);
for _ in 0..N {
rc.restore();
}
}
fn render_tiger(rc: &mut impl RenderContext) {
let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap();
let start = std::time::Instant::now();
@ -163,6 +191,8 @@ pub struct Renderer {
coarse_alloc_buf_host: hub::Buffer,
coarse_alloc_buf_dev: hub::Buffer,
clip_scratch_buf: hub::Buffer,
k4_pipeline: hub::Pipeline,
k4_ds: hub::DescriptorSet,
@ -278,6 +308,8 @@ impl Renderer {
&[],
)?;
let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?;
let mut coarse_alloc_buf_host = session.create_buffer(8, host)?;
let coarse_alloc_buf_dev = session.create_buffer(8, dev)?;
@ -298,10 +330,14 @@ impl Renderer {
)?;
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 2, 1)?;
let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 3, 1)?;
let k4_ds = session.create_descriptor_set(
&k4_pipeline,
&[ptcl_buf.vk_buffer(), tile_buf.vk_buffer()],
&[
ptcl_buf.vk_buffer(),
tile_buf.vk_buffer(),
clip_scratch_buf.vk_buffer(),
],
&[image_dev.vk_image()],
)?;
@ -335,6 +371,7 @@ impl Renderer {
bin_alloc_buf_dev,
coarse_alloc_buf_host,
coarse_alloc_buf_dev,
clip_scratch_buf,
n_elements,
n_paths,
n_pathseg,
@ -355,7 +392,8 @@ impl Renderer {
self.coarse_alloc_buf_host.vk_buffer(),
self.coarse_alloc_buf_dev.vk_buffer(),
);
cmd_buf.clear_buffer(self.state_buf.vk_buffer());
cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None);
cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4));
cmd_buf.memory_barrier();
cmd_buf.image_barrier(
self.image_dev.vk_image(),