Scratch buffer for clip stack

We keep a small window of the clip stack in registers in the fine rasterization kernel, and when that window is exceeded, spill to global memory, so the clip stack can be unbounded.
2025-01-10 20:51:29 +11:00 · 2020-11-21 11:39:23 -08:00 · 2020-11-21 11:39:23 -08:00 · a60c2dd3c8
parent 180047da51
commit a60c2dd3c8
5 changed files with 107 additions and 20 deletions
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@ -148,7 +148,7 @@ pub trait CmdBuf<D: Device> {
    /// This is readily supported in Vulkan, but for portability it is remarkably
    /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
    /// kernel, or organize the code not to need it.
-    unsafe fn clear_buffer(&self, buffer: &D::Buffer);
+    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@ -902,9 +902,10 @@ impl crate::CmdBuf<VkDevice> for CmdBuf {
        );
    }
-    unsafe fn clear_buffer(&self, buffer: &Buffer) {
+    unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
        let device = &self.device.device;
-        device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, vk::WHOLE_SIZE, 0);
+        let size = size.unwrap_or(vk::WHOLE_SIZE);
        device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
    }
    unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@ -22,13 +22,39 @@ layout(set = 0, binding = 1) buffer TileBuf {
    uint[] tile;
 };
-layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
+layout(set = 0, binding = 2) buffer ClipScratchBuf {
    uint[] clip_scratch;
 };
 layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
 #include "ptcl.h"
 #include "tile.h"
 #define BLEND_STACK_SIZE 4
 // Layout of clip_scratch buffer:
 // [0] is the alloc bump offset (in units of 32 bit words, initially 0)
 // Starting at 1 is a sequence of frames.
 // Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.
 #define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
 #define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
 shared uint sh_clip_alloc;
 // Allocate a scratch buffer for clipping. Unlike offsets in the rest of the code,
 // it counts 32-bit words.
 //
 // `link` is written into the new frame's link slot (the word that follows the
 // frame's TILE_WIDTH_PX * TILE_HEIGHT_PX data words).
 // Returns the index into clip_scratch of the first word of the new frame, as
 // published through the shared variable sh_clip_alloc.
 //
 // NOTE(review): because of the barrier() below, this presumably has to be
 // called by all invocations of the workgroup together -- confirm at call sites.
 // NOTE(review): no check against the capacity of clip_scratch is visible
 // here -- confirm the buffer is sized for the worst case.
 uint alloc_clip_buf(uint link) {
    // Only invocation (0, 0) of the workgroup performs the allocation.
    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
        // Bump the counter held in word 0 by one frame. The +1 skips that
        // counter word, since frames start at index 1.
        uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1;
        // Publish the frame index to the other invocations.
        sh_clip_alloc = alloc;
        // Store the caller-supplied link reference at the end of the frame.
        clip_scratch[alloc + CLIP_LINK_OFFSET] = link;
    }
    // Synchronize so the write to sh_clip_alloc above precedes the read below.
    barrier();
    return sh_clip_alloc;
 }
 // Calculate coverage based on backdrop + coverage of each line segment
 float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
    // Probably better to store as float, but conversion is no doubt cheap.
@ -72,7 +98,9 @@ void main() {
    vec3 rgb[CHUNK];
    float mask[CHUNK];
    uint blend_stack[BLEND_STACK_SIZE][CHUNK];
    uint blend_spill = 0;
    uint blend_sp = 0;
    uint clip_tos = 0;
    for (uint i = 0; i < CHUNK; i++) {
        rgb[i] = vec3(0.5);
        mask[i] = 1.0;
@ -142,26 +170,46 @@ void main() {
            }
            break;
        case Cmd_BeginClip:
            CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
            area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
            for (uint k = 0; k < CHUNK; k++) {
                blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
            }
            blend_sp++;
            break;
        case Cmd_BeginSolidClip:
-            CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
+            uint blend_slot = blend_sp % BLEND_STACK_SIZE;
-            float solid_alpha = begin_solid_clip.alpha;
+            if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
-            for (uint k = 0; k < CHUNK; k++) {
+                // spill to scratch buffer
-                blend_stack[blend_sp][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
+                clip_tos = alloc_clip_buf(clip_tos);
                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                for (uint k = 0; k < CHUNK; k++) {
                    clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
                }
                blend_spill++;
            }
            if (tag == Cmd_BeginClip) {
                CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_ref);
                area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
                for (uint k = 0; k < CHUNK; k++) {
                    blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
                }
            } else {
                CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_ref);
                float solid_alpha = begin_solid_clip.alpha;
                for (uint k = 0; k < CHUNK; k++) {
                    blend_stack[blend_slot][k] = packUnorm4x8(vec4(rgb[k], solid_alpha));
                }                
            }
            blend_sp++;
            break;
        case Cmd_EndClip:
            CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
            blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
            if (blend_sp == blend_spill) {
                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                for (uint k = 0; k < CHUNK; k++) {
                    blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
                }
                clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET];
                blend_spill--;
            }
            blend_sp--;
            for (uint k = 0; k < CHUNK; k++) {
-                vec4 rgba = unpackUnorm4x8(blend_stack[blend_sp][k]);
+                vec4 rgba = unpackUnorm4x8(blend_stack[blend_slot][k]);
                rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
            }
            break;
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -73,7 +73,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
        5.0,
    );
    //render_cardioid(rc);
-    render_tiger(rc);
+    render_clip_test(rc);
    //render_tiger(rc);
 }
 #[allow(unused)]
@ -94,6 +95,33 @@ fn render_cardioid(rc: &mut impl RenderContext) {
    rc.stroke(&path, &Color::BLACK, 2.0);
 }
 #[allow(unused)]
 fn render_clip_test(rc: &mut impl RenderContext) {
    // Number of clips that are pushed on top of each other before drawing.
    const N: usize = 16;
    // Corners of the square region used by the test.
    const X0: f64 = 50.0;
    const Y0: f64 = 50.0;
    const X1: f64 = 100.0;
    const Y1: f64 = 100.0;
    let step = 1.0 / ((N + 1) as f64);
    for level in 1..=N {
        // `t` grows with each level and moves the two cut points of the
        // clip outline along the right and bottom edges of the square.
        let t = (level as f64) * step;
        let right_edge_cut = (X1, Y0 + t * (Y1 - Y0));
        let bottom_edge_cut = (X1 + t * (X0 - X1), Y1);

        rc.save();
        let mut outline = BezPath::new();
        outline.move_to((X0, Y0));
        outline.line_to((X1, Y0));
        outline.line_to(right_edge_cut);
        outline.line_to(bottom_edge_cut);
        outline.line_to((X0, Y1));
        outline.close_path();
        rc.clip(outline);
    }

    // Fill the whole square in black with all N clips active.
    rc.fill(piet::kurbo::Rect::new(X0, Y0, X1, Y1), &Color::BLACK);

    // Undo every `save` made above.
    let mut pending = N;
    while pending > 0 {
        rc.restore();
        pending -= 1;
    }
 }
 fn render_tiger(rc: &mut impl RenderContext) {
    let xml_str = std::str::from_utf8(include_bytes!("../Ghostscript_Tiger.svg")).unwrap();
    let start = std::time::Instant::now();
@ -163,6 +191,8 @@ pub struct Renderer {
    coarse_alloc_buf_host: hub::Buffer,
    coarse_alloc_buf_dev: hub::Buffer,
    clip_scratch_buf: hub::Buffer,
    k4_pipeline: hub::Pipeline,
    k4_ds: hub::DescriptorSet,
@ -278,6 +308,8 @@ impl Renderer {
            &[],
        )?;
        let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?;
        let mut coarse_alloc_buf_host = session.create_buffer(8, host)?;
        let coarse_alloc_buf_dev = session.create_buffer(8, dev)?;
@ -298,10 +330,14 @@ impl Renderer {
        )?;
        let k4_code = include_bytes!("../shader/kernel4.spv");
-        let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 2, 1)?;
+        let k4_pipeline = session.create_simple_compute_pipeline(k4_code, 3, 1)?;
        let k4_ds = session.create_descriptor_set(
            &k4_pipeline,
-            &[ptcl_buf.vk_buffer(), tile_buf.vk_buffer()],
+            &[
                ptcl_buf.vk_buffer(),
                tile_buf.vk_buffer(),
                clip_scratch_buf.vk_buffer(),
            ],
            &[image_dev.vk_image()],
        )?;
@ -335,6 +371,7 @@ impl Renderer {
            bin_alloc_buf_dev,
            coarse_alloc_buf_host,
            coarse_alloc_buf_dev,
            clip_scratch_buf,
            n_elements,
            n_paths,
            n_pathseg,
@ -355,7 +392,8 @@ impl Renderer {
            self.coarse_alloc_buf_host.vk_buffer(),
            self.coarse_alloc_buf_dev.vk_buffer(),
        );
-        cmd_buf.clear_buffer(self.state_buf.vk_buffer());
+        cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None);
        cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4));
        cmd_buf.memory_barrier();
        cmd_buf.image_barrier(
            self.image_dev.vk_image(),