Binning stage

Adds a binning stage. This is a first draft, and a number of loose ends
exist.
This commit is contained in:
Raph Levien 2020-05-12 13:38:26 -07:00
parent 736f883f66
commit 343e4c3075
11 changed files with 322 additions and 11 deletions

View file

@ -0,0 +1,19 @@
use piet_gpu_derive::piet_gpu;
// The output of the binning stage, organized as a linked list of chunks.
//
// The `piet_gpu!` macro generates matching GPU-side accessors (see the
// auto-generated shader/bins.h); `#[gpu_write]` requests write support
// in the generated code as well as reads.
piet_gpu! {
#[gpu_write]
mod bins {
// A single element, identified by its index into the annotated
// element stream, assigned to a bin.
struct BinInstance {
element_ix: u32,
}
// Header of one chunk in a bin's linked list.
struct BinChunk {
// Number of BinInstances stored in this chunk.
// First chunk can have n = 0, subsequent ones not.
n: u32,
// Reference to the next chunk; a 0 reference terminates the list.
next: Ref<BinChunk>,
// Instances follow
}
}
}

View file

@ -1,4 +1,7 @@
// Structures used only internally probably don't need to be pub.
pub mod annotated;
pub mod bins;
pub mod encoder;
pub mod fill_seg;
pub mod ptcl;

View file

@ -7,6 +7,7 @@ fn main() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
"annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
"bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
"fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),

View file

@ -41,7 +41,7 @@ fn main() -> Result<(), Error> {
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
let query_pool = device.create_query_pool(2)?;
let query_pool = device.create_query_pool(3)?;
let mut ctx = PietGpuRenderContext::new();
render_scene(&mut ctx);
@ -58,13 +58,14 @@ fn main() -> Result<(), Error> {
cmd_buf.finish();
device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
device.wait_and_reset(&[fence])?;
let timestamps = device.reap_query_pool(&query_pool).unwrap();
println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3);
let ts = device.reap_query_pool(&query_pool).unwrap();
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
/*
let mut data: Vec<u8> = Default::default();
device.read_buffer(&renderer.state_buf, &mut data).unwrap();
dump_state(&data);
let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.bin_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data);
*/
let mut img_data: Vec<u8> = Default::default();

View file

@ -0,0 +1,169 @@
// The binning stage of the pipeline.
//
// Each workgroup repeatedly claims a batch of N_TILE input elements,
// rasterizes each element's bounding box into per-bin coverage bitmaps
// in shared memory, then appends covered elements to per-bin linked
// lists of chunks in the bins buffer.
#version 450
#extension GL_GOOGLE_include_directive : enable
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define TILE_SIZE (WG_SIZE * N_ROWS)
// TODO: move these to setup file
#define N_TILE_X 16
#define N_TILE_Y 16
// Total number of bins; also the workgroup size (one invocation per bin).
#define N_TILE (N_TILE_X * N_TILE_Y)
// Number of 32-bit words needed for one bit per element in a batch.
#define N_SLICE (N_TILE / 32)
#define N_WG 16 // Number of workgroups, should be 1 per SM
// Chunk allocation sizes, in bytes, for the fixed initial chunk and for
// bump-allocated overflow chunks.
#define BIN_INITIAL_ALLOC 64
#define BIN_ALLOC 256
layout(local_size_x = N_TILE, local_size_y = 1) in;
// Input: annotated elements produced by the element stage.
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
layout(set = 0, binding = 1) buffer AllocBuf {
uint n_elements;
// Will be incremented atomically to claim tiles
uint tile_ix;
// Bump-allocator cursor (byte offset into bins) for overflow chunks.
uint alloc;
};
// Output: per-bin linked lists of BinChunk/BinInstance (layout in bins.h).
layout(set = 0, binding = 2) buffer BinsBuf {
uint[] bins;
};
#include "annotated.h"
#include "bins.h"
#include "setup.h"
// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Coverage bitmaps, indexed [slice][bin]: bit (e & 31) of
// bitmaps[e / 32][b] is set when element slot e of the current batch
// covers bin b.
shared uint bitmaps[N_SLICE][N_TILE];
// Broadcast slot for the batch index claimed by invocation 0.
shared uint sh_my_tile;
void main() {
// Each invocation owns one bin. Its output list starts at a fixed
// BIN_INITIAL_ALLOC-byte slot keyed by (local id, workgroup id);
// overflow chunks are bump-allocated from `alloc`.
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
// Highest byte offset at which one more BinInstance still fits.
uint chunk_limit = chunk_ref.offset + BIN_INITIAL_ALLOC - BinInstance_size;
// Number of instances written to the current (not yet closed) chunk.
uint chunk_n = 0;
// Where the next BinInstance will be written (just past the header).
BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size);
// Persistent-threads loop: the workgroup claims batches of N_TILE
// elements via the tile_ix counter until the input is exhausted.
while (true) {
if (gl_LocalInvocationID.x == 0) {
sh_my_tile = atomicAdd(tile_ix, 1);
}
// Publishes sh_my_tile; also separates the previous iteration's
// bitmap reads from the clears below.
barrier();
uint my_tile = sh_my_tile;
if (my_tile * N_TILE >= n_elements) {
break;
}
for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0;
}
barrier();
// Read inputs and determine coverage of bins
// NOTE(review): in the last batch element_ix can run past
// n_elements; this reads annotated past the valid elements and
// relies on the stale/undefined tag falling through the switch —
// confirm the buffer is sized and initialized to make that safe.
uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
uint tag = Annotated_tag(ref);
// Bounding box of this element in bin coordinates, [x0, x1) x [y0, y1).
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
switch (tag) {
case Annotated_Line:
AnnoLineSeg line = Annotated_Line_read(ref);
// Expand the segment bbox by the stroke half-width vector.
x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
break;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
x0 = int(floor(fill.bbox.x * SX));
y0 = int(floor(fill.bbox.y * SY));
x1 = int(ceil(fill.bbox.z * SX));
y1 = int(ceil(fill.bbox.w * SY));
break;
}
// At this point, we run an iterator over the coverage area,
// trying to keep divergence low.
// Right now, it's just a bbox, but we'll get finer with
// segments.
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
// An empty x range would never terminate the walk below (x only
// resets after reaching x1), so force the y range empty too.
if (x0 == x1) y1 = y0;
int x = x0, y = y0;
// This invocation's bit position within the per-bin bitmaps.
uint my_slice = gl_LocalInvocationID.x / 32;
uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
// Raster-scan the covered bins, setting this element's bit in each.
while (y < y1) {
atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask);
x++;
if (x == x1) {
x = x0;
y++;
}
}
barrier();
// Allocate output segments.
uint element_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
}
// element_count is number of elements covering bin for this invocation.
// Close out the current chunk and start a new one before writing
// this batch's instances: the new chunk header goes at the next
// free offset in the current allocation when enough space looks
// available, otherwise fresh space is bump-allocated.
if (element_count > 0 && chunk_n > 0) {
uint new_chunk = instance_ref.offset;
// NOTE(review): min(32, element_count * 4) appears to be a
// heuristic lower bound on the space needed; the per-instance
// check below still grows the chunk if this underestimates.
if (new_chunk + min(32, element_count * 4) > chunk_limit) {
new_chunk = atomicAdd(alloc, BIN_ALLOC);
chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
}
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
chunk_ref = BinChunkRef(new_chunk);
instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
chunk_n = 0;
}
// TODO: allocate output here
// Iterate over bits set.
uint slice_ix = 0;
uint bitmap = bitmaps[0][gl_LocalInvocationID.x];
while (true) {
// Advance to the next slice with bits still set, or exit.
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x];
if (bitmap == 0) {
continue;
}
}
element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap);
// At this point, element_ix refers to an element that covers this bin.
// TODO: batch allocate based on element_count; this is divergent
// Grow the chunk list when the current chunk is full.
if (instance_ref.offset > chunk_limit) {
uint new_chunk = atomicAdd(alloc, BIN_ALLOC);
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk)));
chunk_ref = BinChunkRef(new_chunk);
instance_ref = BinInstanceRef(new_chunk + BinChunk_size);
chunk_n = 0;
chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size;
}
BinInstance_write(instance_ref, BinInstance(element_ix));
chunk_n++;
instance_ref.offset += BinInstance_size;
// clear LSB
bitmap &= bitmap - 1;
}
}
// Terminate this bin's list: a next reference of 0 marks the end.
BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0)));
}

BIN
piet-gpu/shader/binning.spv Normal file

Binary file not shown.

60
piet-gpu/shader/bins.h Normal file
View file

@ -0,0 +1,60 @@
// Code auto-generated by piet-gpu-derive
//
// NOTE(review): generated from the `bins` module in piet-gpu-types;
// regenerate via the piet-gpu-types generator rather than editing by hand.
//
// All refs are byte offsets into the `bins` buffer; reads and writes
// convert to uint word indices with `>> 2`.
struct BinInstanceRef {
uint offset;
};
struct BinChunkRef {
uint offset;
};
// One element index recorded in a bin.
struct BinInstance {
uint element_ix;
};
// Size of BinInstance, in bytes.
#define BinInstance_size 4
// Ref to the index'th BinInstance of an array starting at ref.
BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
return BinInstanceRef(ref.offset + index * BinInstance_size);
}
// Header of one chunk in a bin's linked list; instances follow it.
struct BinChunk {
// Number of instances stored in this chunk.
uint n;
// Next chunk in the list.
BinChunkRef next;
};
// Size of BinChunk, in bytes.
#define BinChunk_size 8
// Ref to the index'th BinChunk of an array starting at ref.
BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
return BinChunkRef(ref.offset + index * BinChunk_size);
}
// Load a BinInstance from the bins buffer.
BinInstance BinInstance_read(BinInstanceRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = bins[ix + 0];
BinInstance s;
s.element_ix = raw0;
return s;
}
// Store a BinInstance to the bins buffer.
void BinInstance_write(BinInstanceRef ref, BinInstance s) {
uint ix = ref.offset >> 2;
bins[ix + 0] = s.element_ix;
}
// Load a BinChunk header from the bins buffer.
BinChunk BinChunk_read(BinChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = bins[ix + 0];
uint raw1 = bins[ix + 1];
BinChunk s;
s.n = raw0;
s.next = BinChunkRef(raw1);
return s;
}
// Store a BinChunk header to the bins buffer.
void BinChunk_write(BinChunkRef ref, BinChunk s) {
uint ix = ref.offset >> 2;
bins[ix + 0] = s.n;
bins[ix + 1] = s.next.offset;
}

View file

@ -20,4 +20,6 @@ build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h
build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
build elements.spv: glsl elements.comp | scene.h state.h
build elements.spv: glsl elements.comp | scene.h state.h annotated.h
# binning.comp also #includes bins.h, so it must be an implicit dependency
# here or edits to bins.h will not trigger a shader rebuild.
build binning.spv: glsl binning.comp | annotated.h bins.h setup.h

View file

@ -1,3 +1,9 @@
// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.
#version 450
#extension GL_GOOGLE_include_directive : enable
@ -208,6 +214,13 @@ void main() {
anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z);
Annotated_Stroke_write(out_ref, anno_stroke);
break;
case Element_Fill:
Fill fill = Element_Fill_read(this_ref);
AnnoFill anno_fill;
anno_fill.rgba_color = fill.rgba_color;
anno_fill.bbox = st.bbox;
Annotated_Fill_write(out_ref, anno_fill);
break;
default:
Annotated_Nop_write(out_ref);
break;

Binary file not shown.

View file

@ -32,6 +32,8 @@ const K2_PER_TILE_SIZE: usize = 8;
const N_CIRCLES: usize = 1;
const N_WG: u32 = 16;
pub fn render_scene(rc: &mut impl RenderContext) {
let mut rng = rand::thread_rng();
for _ in 0..N_CIRCLES {
@ -98,10 +100,10 @@ fn dump_scene(buf: &[u8]) {
}
#[allow(unused)]
fn dump_k1_data(k1_buf: &[u32]) {
pub fn dump_k1_data(k1_buf: &[u32]) {
for i in 0..k1_buf.len() {
if k1_buf[i] != 0 {
println!("{:4x}: {:8x}", i, k1_buf[i]);
println!("{:4x}: {:8x}", i * 4, k1_buf[i]);
}
}
}
@ -114,10 +116,17 @@ pub struct Renderer<D: Device> {
pub state_buf: D::Buffer,
pub anno_buf: D::Buffer,
pub bin_buf: D::Buffer,
el_pipeline: D::Pipeline,
el_ds: D::DescriptorSet,
bin_pipeline: D::Pipeline,
bin_ds: D::DescriptorSet,
bin_alloc_buf_host: D::Buffer,
bin_alloc_buf_dev: D::Buffer,
/*
k1_alloc_buf_host: D::Buffer,
k1_alloc_buf_dev: D::Buffer,
@ -149,6 +158,9 @@ impl<D: Device> Renderer<D> {
let host = MemFlags::host_coherent();
let dev = MemFlags::device_local();
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
println!("scene: {} elements", n_elements);
let scene_buf = device
.create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
.unwrap();
@ -159,6 +171,7 @@ impl<D: Device> Renderer<D> {
let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
let el_code = include_bytes!("../shader/elements.spv");
@ -169,8 +182,25 @@ impl<D: Device> Renderer<D> {
&[],
)?;
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
println!("scene: {} elements", n_elements);
let bin_alloc_buf_host = device.create_buffer(12, host)?;
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
// TODO: constants
let bin_alloc_start = 256 * 64 * N_WG;
device
.write_buffer(&bin_alloc_buf_host, &[
n_elements as u32,
0,
bin_alloc_start,
])
?;
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
let bin_ds = device.create_descriptor_set(
&bin_pipeline,
&[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
&[],
)?;
/*
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
@ -253,14 +283,20 @@ impl<D: Device> Renderer<D> {
image_dev,
el_pipeline,
el_ds,
bin_pipeline,
bin_ds,
state_buf,
anno_buf,
bin_buf,
bin_alloc_buf_host,
bin_alloc_buf_dev,
n_elements,
})
}
pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(
&self.image_dev,
@ -276,6 +312,13 @@ impl<D: Device> Renderer<D> {
);
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.bin_pipeline,
&self.bin_ds,
(N_WG, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
}