mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Binning stage
Adds a binning stage. This is a first draft, and a number of loose ends exist.
This commit is contained in:
parent
736f883f66
commit
343e4c3075
19
piet-gpu-types/src/bins.rs
Normal file
19
piet-gpu-types/src/bins.rs
Normal file
|
@ -0,0 +1,19 @@
|
|||
use piet_gpu_derive::piet_gpu;
|
||||
|
||||
// The output of the binning stage, organized as a linked list of chunks.
// Each chunk is a BinChunk header followed by its BinInstance records.
|
||||
|
||||
piet_gpu! {
|
||||
#[gpu_write]
|
||||
mod bins {
|
||||
// One list entry: the index of an element.
struct BinInstance {
|
||||
element_ix: u32,
|
||||
}
|
||||
|
||||
// Header of one chunk in the list.
struct BinChunk {
|
||||
// Number of instances stored in this chunk.
// First chunk can have n = 0, subsequent ones not.
|
||||
n: u32,
|
||||
// Reference to the next chunk; the binning shader writes a reference
// of 0 into the final chunk.
next: Ref<BinChunk>,
|
||||
// Instances follow: n BinInstance records come right after this header.
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,4 +1,7 @@
|
|||
// Structures used only internally probably don't need to be pub.
|
||||
|
||||
pub mod annotated;
|
||||
pub mod bins;
|
||||
pub mod encoder;
|
||||
pub mod fill_seg;
|
||||
pub mod ptcl;
|
||||
|
|
|
@ -7,6 +7,7 @@ fn main() {
|
|||
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
|
||||
"state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
|
||||
"annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
|
||||
"bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
|
||||
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
|
||||
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
|
||||
"fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
|
||||
|
|
|
@ -41,7 +41,7 @@ fn main() -> Result<(), Error> {
|
|||
|
||||
let fence = device.create_fence(false)?;
|
||||
let mut cmd_buf = device.create_cmd_buf()?;
|
||||
let query_pool = device.create_query_pool(2)?;
|
||||
let query_pool = device.create_query_pool(3)?;
|
||||
|
||||
let mut ctx = PietGpuRenderContext::new();
|
||||
render_scene(&mut ctx);
|
||||
|
@ -58,13 +58,14 @@ fn main() -> Result<(), Error> {
|
|||
cmd_buf.finish();
|
||||
device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
|
||||
device.wait_and_reset(&[fence])?;
|
||||
let timestamps = device.reap_query_pool(&query_pool).unwrap();
|
||||
println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3);
|
||||
let ts = device.reap_query_pool(&query_pool).unwrap();
|
||||
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
|
||||
println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
|
||||
|
||||
/*
|
||||
let mut data: Vec<u8> = Default::default();
|
||||
device.read_buffer(&renderer.state_buf, &mut data).unwrap();
|
||||
dump_state(&data);
|
||||
let mut data: Vec<u32> = Default::default();
|
||||
device.read_buffer(&renderer.bin_buf, &mut data).unwrap();
|
||||
piet_gpu::dump_k1_data(&data);
|
||||
*/
|
||||
|
||||
let mut img_data: Vec<u8> = Default::default();
|
||||
|
|
169
piet-gpu/shader/binning.comp
Normal file
169
piet-gpu/shader/binning.comp
Normal file
|
@ -0,0 +1,169 @@
|
|||
// The binning stage of the pipeline.
|
||||
|
||||
#version 450
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
#define N_ROWS 4
|
||||
#define WG_SIZE 32
|
||||
#define LG_WG_SIZE 5
|
||||
#define TILE_SIZE (WG_SIZE * N_ROWS)
|
||||
|
||||
// TODO: move these to setup file
|
||||
#define N_TILE_X 16
|
||||
#define N_TILE_Y 16
|
||||
#define N_TILE (N_TILE_X * N_TILE_Y)
|
||||
#define N_SLICE (N_TILE / 32)
|
||||
// Keep in sync with the host-side N_WG constant, which sets the dispatch size
// and the start offset of the dynamically allocated bin region.
#define N_WG 16 // Number of workgroups, should be 1 per SM
|
||||
|
||||
#define BIN_INITIAL_ALLOC 64
|
||||
#define BIN_ALLOC 256
|
||||
|
||||
layout(local_size_x = N_TILE, local_size_y = 1) in;
|
||||
|
||||
layout(set = 0, binding = 0) buffer AnnotatedBuf {
|
||||
uint[] annotated;
|
||||
};
|
||||
|
||||
layout(set = 0, binding = 1) buffer AllocBuf {
|
||||
uint n_elements;
|
||||
// Will be incremented atomically to claim tiles
|
||||
uint tile_ix;
|
||||
uint alloc;
|
||||
};
|
||||
|
||||
layout(set = 0, binding = 2) buffer BinsBuf {
|
||||
uint[] bins;
|
||||
};
|
||||
|
||||
#include "annotated.h"
|
||||
#include "bins.h"
|
||||
#include "setup.h"
|
||||
|
||||
// scale factors useful for converting coordinates to bins
|
||||
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
|
||||
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
|
||||
|
||||
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
|
||||
shared uint bitmaps[N_SLICE][N_TILE];
|
||||
shared uint sh_my_tile;
|
||||
|
||||
// Entry point of the binning stage.
//
// Each workgroup repeatedly claims a partition of N_TILE consecutive annotated
// elements by atomically incrementing tile_ix. Within a partition every
// invocation plays two roles:
//   1. As "element i" it computes the bin range covered by its element's
//      bounding box and sets its bit in the shared coverage bitmaps.
//   2. As "bin i" it scans the bitmaps and appends the index of every covering
//      element to its own output list of BinChunks in the bins buffer.
// The loop ends once the claimed partition starts at or beyond n_elements.
void main() {
    // Every (invocation, workgroup) pair starts with its own preallocated
    // chunk of BIN_INITIAL_ALLOC bytes.
    BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
    // Highest offset at which an instance still fits in the current chunk.
    uint chunk_limit = chunk_ref.offset + BIN_INITIAL_ALLOC - BinInstance_size;
    // Number of instances written to the current chunk so far.
    uint chunk_n = 0;
    BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
    while (true) {
        // Invocation 0 claims the next partition on behalf of the workgroup
        // and publishes it through shared memory.
        if (gl_LocalInvocationID.x == 0) {
            sh_my_tile = atomicAdd(tile_ix, 1);
        }
        barrier();
        uint my_tile = sh_my_tile;
        if (my_tile * N_TILE >= n_elements) {
            break;
        }

        // Clear this invocation's column of the coverage bitmaps.
        for (uint i = 0; i < N_SLICE; i++) {
            bitmaps[i][gl_LocalInvocationID.x] = 0;
        }
        barrier();

        // Read inputs and determine coverage of bins
        uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x;
        AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
        int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
        // The last partition can extend past n_elements. Slots beyond the end
        // are not valid elements, so they are never read and keep an empty
        // bounding box (which covers no bin).
        if (element_ix < n_elements) {
            uint tag = Annotated_tag(ref);
            switch (tag) {
            case Annotated_Line:
                AnnoLineSeg line = Annotated_Line_read(ref);
                x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
                y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
                x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
                y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
                break;
            case Annotated_Fill:
            case Annotated_Stroke:
                // Note: we take advantage of the fact that fills and strokes
                // have compatible layout.
                AnnoFill fill = Annotated_Fill_read(ref);
                x0 = int(floor(fill.bbox.x * SX));
                y0 = int(floor(fill.bbox.y * SY));
                x1 = int(ceil(fill.bbox.z * SX));
                y1 = int(ceil(fill.bbox.w * SY));
                break;
            }
        }
        // At this point, we run an iterator over the coverage area,
        // trying to keep divergence low.
        // Right now, it's just a bbox, but we'll get finer with
        // segments.
        x0 = clamp(x0, 0, N_TILE_X);
        x1 = clamp(x1, x0, N_TILE_X);
        y0 = clamp(y0, 0, N_TILE_Y);
        y1 = clamp(y1, y0, N_TILE_Y);
        // An empty x range must also empty the y range: the loop below
        // increments x before comparing it with x1.
        if (x0 == x1) y1 = y0;
        int x = x0, y = y0;
        // Bit position of this element inside the partition's bitmaps.
        uint my_slice = gl_LocalInvocationID.x / 32;
        uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
        while (y < y1) {
            atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask);
            x++;
            if (x == x1) {
                x = x0;
                y++;
            }
        }

        barrier();
        // Allocate output segments.
        uint element_count = 0;
        for (uint i = 0; i < N_SLICE; i++) {
            element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        }
        // element_count is number of elements covering bin for this invocation.
        if (element_count > 0 && chunk_n > 0) {
            // Close the current chunk and start a new one, either in the
            // remaining space of the current allocation or in a fresh one.
            uint new_chunk = instance_ref.offset;
            if (new_chunk + min(32, element_count * 4) > chunk_limit) {
                new_chunk = atomicAdd(alloc, BIN_ALLOC);
                chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
            }
            BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
            chunk_ref = BinChunkRef(new_chunk);
            instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
            chunk_n = 0;
        }
        // TODO: allocate output here

        // Iterate over bits set.
        uint slice_ix = 0;
        uint bitmap = bitmaps[0][gl_LocalInvocationID.x];
        while (true) {
            if (bitmap == 0) {
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x];
                if (bitmap == 0) {
                    continue;
                }
            }
            element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap);
            // At this point, element_ix refers to an element that covers this bin.

            // TODO: batch allocate based on element_count; this is divergent
            if (instance_ref.offset > chunk_limit) {
                uint new_chunk = atomicAdd(alloc, BIN_ALLOC);
                BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
                chunk_ref = BinChunkRef(new_chunk);
                instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
                chunk_n = 0;
                chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
            }
            BinInstance_write(instance_ref, BinInstance(element_ix));
            chunk_n++;
            instance_ref.offset += BinInstance_size;
            // clear LSB
            bitmap &= bitmap - 1;
        }
    }
    // Terminate the list: the final chunk gets a next reference of 0.
    BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0)));
}
|
BIN
piet-gpu/shader/binning.spv
Normal file
BIN
piet-gpu/shader/binning.spv
Normal file
Binary file not shown.
60
piet-gpu/shader/bins.h
Normal file
60
piet-gpu/shader/bins.h
Normal file
|
@ -0,0 +1,60 @@
|
|||
// Code auto-generated by piet-gpu-derive
|
||||
|
||||
struct BinInstanceRef {
|
||||
uint offset;
|
||||
};
|
||||
|
||||
struct BinChunkRef {
|
||||
uint offset;
|
||||
};
|
||||
|
||||
struct BinInstance {
|
||||
uint element_ix;
|
||||
};
|
||||
|
||||
#define BinInstance_size 4
|
||||
|
||||
BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
|
||||
return BinInstanceRef(ref.offset + index * BinInstance_size);
|
||||
}
|
||||
|
||||
struct BinChunk {
|
||||
uint n;
|
||||
BinChunkRef next;
|
||||
};
|
||||
|
||||
#define BinChunk_size 8
|
||||
|
||||
BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
|
||||
return BinChunkRef(ref.offset + index * BinChunk_size);
|
||||
}
|
||||
|
||||
BinInstance BinInstance_read(BinInstanceRef ref) {
|
||||
uint ix = ref.offset >> 2;
|
||||
uint raw0 = bins[ix + 0];
|
||||
BinInstance s;
|
||||
s.element_ix = raw0;
|
||||
return s;
|
||||
}
|
||||
|
||||
void BinInstance_write(BinInstanceRef ref, BinInstance s) {
|
||||
uint ix = ref.offset >> 2;
|
||||
bins[ix + 0] = s.element_ix;
|
||||
}
|
||||
|
||||
BinChunk BinChunk_read(BinChunkRef ref) {
|
||||
uint ix = ref.offset >> 2;
|
||||
uint raw0 = bins[ix + 0];
|
||||
uint raw1 = bins[ix + 1];
|
||||
BinChunk s;
|
||||
s.n = raw0;
|
||||
s.next = BinChunkRef(raw1);
|
||||
return s;
|
||||
}
|
||||
|
||||
void BinChunk_write(BinChunkRef ref, BinChunk s) {
|
||||
uint ix = ref.offset >> 2;
|
||||
bins[ix + 0] = s.n;
|
||||
bins[ix + 1] = s.next.offset;
|
||||
}
|
||||
|
|
@ -20,4 +20,6 @@ build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h
|
|||
build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
|
||||
|
||||
|
||||
build elements.spv: glsl elements.comp | scene.h state.h
|
||||
build elements.spv: glsl elements.comp | scene.h state.h annotated.h
|
||||
|
||||
build binning.spv: glsl binning.comp | annotated.h bins.h setup.h
|
||||
|
|
|
@ -1,3 +1,9 @@
|
|||
// The element processing stage, first in the pipeline.
|
||||
//
|
||||
// This stage is primarily about applying transforms and computing bounding
|
||||
// boxes. It is organized as a scan over the input elements, producing
|
||||
// annotated output elements.
|
||||
|
||||
#version 450
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
|
@ -208,6 +214,13 @@ void main() {
|
|||
anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z);
|
||||
Annotated_Stroke_write(out_ref, anno_stroke);
|
||||
break;
|
||||
case Element_Fill:
|
||||
Fill fill = Element_Fill_read(this_ref);
|
||||
AnnoFill anno_fill;
|
||||
anno_fill.rgba_color = fill.rgba_color;
|
||||
anno_fill.bbox = st.bbox;
|
||||
Annotated_Fill_write(out_ref, anno_fill);
|
||||
break;
|
||||
default:
|
||||
Annotated_Nop_write(out_ref);
|
||||
break;
|
||||
|
|
Binary file not shown.
|
@ -32,6 +32,8 @@ const K2_PER_TILE_SIZE: usize = 8;
|
|||
|
||||
const N_CIRCLES: usize = 1;
|
||||
|
||||
const N_WG: u32 = 16;
|
||||
|
||||
pub fn render_scene(rc: &mut impl RenderContext) {
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..N_CIRCLES {
|
||||
|
@ -98,10 +100,10 @@ fn dump_scene(buf: &[u8]) {
|
|||
}
|
||||
|
||||
/// Prints every non-zero word of `k1_buf` to stdout as `offset: value`, both
/// in hexadecimal.
///
/// The offset is the byte offset of the word (`index * 4`, since each word is
/// a `u32`).
// Debugging helper; the only call visible in this change sits inside a
// commented-out block, hence the lint allowance.
#[allow(unused)]
pub fn dump_k1_data(k1_buf: &[u32]) {
    for (i, &word) in k1_buf.iter().enumerate() {
        if word != 0 {
            println!("{:4x}: {:8x}", i * 4, word);
        }
    }
}
|
||||
|
@ -114,10 +116,17 @@ pub struct Renderer<D: Device> {
|
|||
|
||||
pub state_buf: D::Buffer,
|
||||
pub anno_buf: D::Buffer,
|
||||
pub bin_buf: D::Buffer,
|
||||
|
||||
el_pipeline: D::Pipeline,
|
||||
el_ds: D::DescriptorSet,
|
||||
|
||||
bin_pipeline: D::Pipeline,
|
||||
bin_ds: D::DescriptorSet,
|
||||
|
||||
bin_alloc_buf_host: D::Buffer,
|
||||
bin_alloc_buf_dev: D::Buffer,
|
||||
|
||||
/*
|
||||
k1_alloc_buf_host: D::Buffer,
|
||||
k1_alloc_buf_dev: D::Buffer,
|
||||
|
@ -149,6 +158,9 @@ impl<D: Device> Renderer<D> {
|
|||
let host = MemFlags::host_coherent();
|
||||
let dev = MemFlags::device_local();
|
||||
|
||||
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
|
||||
println!("scene: {} elements", n_elements);
|
||||
|
||||
let scene_buf = device
|
||||
.create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
|
||||
.unwrap();
|
||||
|
@ -159,6 +171,7 @@ impl<D: Device> Renderer<D> {
|
|||
|
||||
let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
|
||||
|
||||
let el_code = include_bytes!("../shader/elements.spv");
|
||||
|
@ -169,8 +182,25 @@ impl<D: Device> Renderer<D> {
|
|||
&[],
|
||||
)?;
|
||||
|
||||
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
|
||||
println!("scene: {} elements", n_elements);
|
||||
let bin_alloc_buf_host = device.create_buffer(12, host)?;
|
||||
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
|
||||
|
||||
// TODO: constants
|
||||
let bin_alloc_start = 256 * 64 * N_WG;
|
||||
device
|
||||
.write_buffer(&bin_alloc_buf_host, &[
|
||||
n_elements as u32,
|
||||
0,
|
||||
bin_alloc_start,
|
||||
])
|
||||
?;
|
||||
let bin_code = include_bytes!("../shader/binning.spv");
|
||||
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
|
||||
let bin_ds = device.create_descriptor_set(
|
||||
&bin_pipeline,
|
||||
&[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
|
||||
&[],
|
||||
)?;
|
||||
|
||||
/*
|
||||
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
|
||||
|
@ -253,14 +283,20 @@ impl<D: Device> Renderer<D> {
|
|||
image_dev,
|
||||
el_pipeline,
|
||||
el_ds,
|
||||
bin_pipeline,
|
||||
bin_ds,
|
||||
state_buf,
|
||||
anno_buf,
|
||||
bin_buf,
|
||||
bin_alloc_buf_host,
|
||||
bin_alloc_buf_dev,
|
||||
n_elements,
|
||||
})
|
||||
}
|
||||
|
||||
pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
|
||||
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
|
||||
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.image_barrier(
|
||||
&self.image_dev,
|
||||
|
@ -276,6 +312,13 @@ impl<D: Device> Renderer<D> {
|
|||
);
|
||||
cmd_buf.write_timestamp(&query_pool, 1);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.dispatch(
|
||||
&self.bin_pipeline,
|
||||
&self.bin_ds,
|
||||
(N_WG, 1, 1),
|
||||
);
|
||||
cmd_buf.write_timestamp(&query_pool, 2);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue