diff --git a/piet-gpu-types/src/bins.rs b/piet-gpu-types/src/bins.rs new file mode 100644 index 0000000..88f16f1 --- /dev/null +++ b/piet-gpu-types/src/bins.rs @@ -0,0 +1,19 @@ +use piet_gpu_derive::piet_gpu; + +// The output of the binning stage, organized as a linked list of chunks. + +piet_gpu! { + #[gpu_write] + mod bins { + struct BinInstance { + element_ix: u32, + } + + struct BinChunk { + // First chunk can have n = 0, subsequent ones not. + n: u32, + next: Ref<BinChunk>, + // Instances follow + } + } +} diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs index 1759c4d..29ed806 100644 --- a/piet-gpu-types/src/lib.rs +++ b/piet-gpu-types/src/lib.rs @@ -1,4 +1,7 @@ +// Structures used only internally probably don't need to be pub. + pub mod annotated; +pub mod bins; pub mod encoder; pub mod fill_seg; pub mod ptcl; diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs index 68e6487..41ae021 100644 --- a/piet-gpu-types/src/main.rs +++ b/piet-gpu-types/src/main.rs @@ -7,6 +7,7 @@ fn main() { "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), "state" => print!("{}", piet_gpu_types::state::gen_gpu_state()), "annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()), + "bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()), "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), "segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()), "fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()), diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 82f3491..4a4fed3 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -41,7 +41,7 @@ fn main() -> Result<(), Error> { let fence = device.create_fence(false)?; let mut cmd_buf = device.create_cmd_buf()?; - let query_pool = device.create_query_pool(2)?; + let query_pool = device.create_query_pool(3)?; let mut ctx = PietGpuRenderContext::new(); render_scene(&mut ctx); @@ -58,13 +58,14 @@ fn main() -> 
Result<(), Error> { cmd_buf.finish(); device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?; device.wait_and_reset(&[fence])?; - let timestamps = device.reap_query_pool(&query_pool).unwrap(); - println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3); + let ts = device.reap_query_pool(&query_pool).unwrap(); + println!("Element kernel time: {:.3}ms", ts[0] * 1e3); + println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); /* - let mut data: Vec<u32> = Default::default(); - device.read_buffer(&renderer.state_buf, &mut data).unwrap(); - dump_state(&data); + let mut data: Vec<u32> = Default::default(); + device.read_buffer(&renderer.bin_buf, &mut data).unwrap(); + piet_gpu::dump_k1_data(&data); */ let mut img_data: Vec<u8> = Default::default(); diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp new file mode 100644 index 0000000..bf7bbae --- /dev/null +++ b/piet-gpu/shader/binning.comp @@ -0,0 +1,169 @@ +// The binning stage of the pipeline. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#define N_ROWS 4 +#define WG_SIZE 32 +#define LG_WG_SIZE 5 +#define TILE_SIZE (WG_SIZE * N_ROWS) + +// TODO: move these to setup file +#define N_TILE_X 16 +#define N_TILE_Y 16 +#define N_TILE (N_TILE_X * N_TILE_Y) +#define N_SLICE (N_TILE / 32) +#define N_WG 16 // Number of workgroups, should be 1 per SM + +#define BIN_INITIAL_ALLOC 64 +#define BIN_ALLOC 256 + +layout(local_size_x = N_TILE, local_size_y = 1) in; + +layout(set = 0, binding = 0) buffer AnnotatedBuf { + uint[] annotated; +}; + +layout(set = 0, binding = 1) buffer AllocBuf { + uint n_elements; + // Will be incremented atomically to claim tiles + uint tile_ix; + uint alloc; +}; + +layout(set = 0, binding = 2) buffer BinsBuf { + uint[] bins; +}; + +#include "annotated.h" +#include "bins.h" +#include "setup.h" + +// scale factors useful for converting coordinates to bins +#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) +#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX)) + +// 
Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. +shared uint bitmaps[N_SLICE][N_TILE]; +shared uint sh_my_tile; + +void main() { + BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); + uint chunk_limit = chunk_ref.offset + BIN_INITIAL_ALLOC - BinInstance_size; + uint chunk_n = 0; + BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); + while (true) { + if (gl_LocalInvocationID.x == 0) { + sh_my_tile = atomicAdd(tile_ix, 1); + } + barrier(); + uint my_tile = sh_my_tile; + if (my_tile * N_TILE >= n_elements) { + break; + } + + for (uint i = 0; i < N_SLICE; i++) { + bitmaps[i][gl_LocalInvocationID.x] = 0; + } + barrier(); + + // Read inputs and determine coverage of bins + uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + uint tag = Annotated_tag(ref); + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + switch (tag) { + case Annotated_Line: + AnnoLineSeg line = Annotated_Line_read(ref); + x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX)); + y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY)); + x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX)); + y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY)); + break; + case Annotated_Fill: + case Annotated_Stroke: + // Note: we take advantage of the fact that fills and strokes + // have compatible layout. + AnnoFill fill = Annotated_Fill_read(ref); + x0 = int(floor(fill.bbox.x * SX)); + y0 = int(floor(fill.bbox.y * SY)); + x1 = int(ceil(fill.bbox.z * SX)); + y1 = int(ceil(fill.bbox.w * SY)); + break; + } + // At this point, we run an iterator over the coverage area, + // trying to keep divergence low. + // Right now, it's just a bbox, but we'll get finer with + // segments. 
+ x0 = clamp(x0, 0, N_TILE_X); + x1 = clamp(x1, x0, N_TILE_X); + y0 = clamp(y0, 0, N_TILE_Y); + y1 = clamp(y1, y0, N_TILE_Y); + if (x0 == x1) y1 = y0; + int x = x0, y = y0; + uint my_slice = gl_LocalInvocationID.x / 32; + uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + while (y < y1) { + atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask); + x++; + if (x == x1) { + x = x0; + y++; + } + } + + barrier(); + // Allocate output segments. + uint element_count = 0; + for (uint i = 0; i < N_SLICE; i++) { + element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); + } + // element_count is number of elements covering bin for this invocation. + if (element_count > 0 && chunk_n > 0) { + uint new_chunk = instance_ref.offset; + if (new_chunk + min(32, element_count * 4) > chunk_limit) { + new_chunk = atomicAdd(alloc, BIN_ALLOC); + chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size; + } + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk))); + chunk_ref = BinChunkRef(new_chunk); + instance_ref = BinInstanceRef(new_chunk + BinChunk_size); + chunk_n = 0; + } + // TODO: allocate output here + + // Iterate over bits set. + uint slice_ix = 0; + uint bitmap = bitmaps[0][gl_LocalInvocationID.x]; + while (true) { + if (bitmap == 0) { + slice_ix++; + if (slice_ix == N_SLICE) { + break; + } + bitmap = bitmaps[slice_ix][gl_LocalInvocationID.x]; + if (bitmap == 0) { + continue; + } + } + element_ix = my_tile * N_TILE + slice_ix * 32 + findLSB(bitmap); + // At this point, element_ix refers to an element that covers this bin. 
+ + // TODO: batch allocated based on element_count; this is divergent + if (instance_ref.offset > chunk_limit) { + uint new_chunk = atomicAdd(alloc, BIN_ALLOC); + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(new_chunk))); + chunk_ref = BinChunkRef(new_chunk); + instance_ref = BinInstanceRef(new_chunk + BinChunk_size); + chunk_n = 0; + chunk_limit = new_chunk + BIN_ALLOC - BinInstance_size; + } + BinInstance_write(instance_ref, BinInstance(element_ix)); + chunk_n++; + instance_ref.offset += BinInstance_size; + // clear LSB + bitmap &= bitmap - 1; + } + } + BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0))); +} diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv new file mode 100644 index 0000000..4cc5d36 Binary files /dev/null and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h new file mode 100644 index 0000000..3ce06e0 --- /dev/null +++ b/piet-gpu/shader/bins.h @@ -0,0 +1,60 @@ +// Code auto-generated by piet-gpu-derive + +struct BinInstanceRef { + uint offset; +}; + +struct BinChunkRef { + uint offset; +}; + +struct BinInstance { + uint element_ix; +}; + +#define BinInstance_size 4 + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { + return BinInstanceRef(ref.offset + index * BinInstance_size); +} + +struct BinChunk { + uint n; + BinChunkRef next; +}; + +#define BinChunk_size 8 + +BinChunkRef BinChunk_index(BinChunkRef ref, uint index) { + return BinChunkRef(ref.offset + index * BinChunk_size); +} + +BinInstance BinInstance_read(BinInstanceRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = bins[ix + 0]; + BinInstance s; + s.element_ix = raw0; + return s; +} + +void BinInstance_write(BinInstanceRef ref, BinInstance s) { + uint ix = ref.offset >> 2; + bins[ix + 0] = s.element_ix; +} + +BinChunk BinChunk_read(BinChunkRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = bins[ix + 0]; + uint raw1 = bins[ix + 1]; + BinChunk s; + s.n = raw0; + s.next = 
BinChunkRef(raw1); + return s; +} + +void BinChunk_write(BinChunkRef ref, BinChunk s) { + uint ix = ref.offset >> 2; + bins[ix + 0] = s.n; + bins[ix + 1] = s.next.offset; +} + diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index b429a71..4628fd2 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -20,4 +20,6 @@ build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h -build elements.spv: glsl elements.comp | scene.h state.h +build elements.spv: glsl elements.comp | scene.h state.h annotated.h + +build binning.spv: glsl binning.comp | annotated.h bins.h setup.h diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index 1061fab..c31dd2e 100644 --- a/piet-gpu/shader/elements.comp +++ b/piet-gpu/shader/elements.comp @@ -1,3 +1,9 @@ +// The element processing stage, first in the pipeline. +// +// This stage is primarily about applying transforms and computing bounding +// boxes. It is organized as a scan over the input elements, producing +// annotated output elements.
+ #version 450 #extension GL_GOOGLE_include_directive : enable @@ -208,6 +214,13 @@ void main() { anno_stroke.linewidth = st.linewidth * sqrt(st.mat.x * st.mat.w - st.mat.y * st.mat.z); Annotated_Stroke_write(out_ref, anno_stroke); break; + case Element_Fill: + Fill fill = Element_Fill_read(this_ref); + AnnoFill anno_fill; + anno_fill.rgba_color = fill.rgba_color; + anno_fill.bbox = st.bbox; + Annotated_Fill_write(out_ref, anno_fill); + break; default: Annotated_Nop_write(out_ref); break; diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index 41d9bc1..afb63b5 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 0753054..437a31a 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -32,6 +32,8 @@ const K2_PER_TILE_SIZE: usize = 8; const N_CIRCLES: usize = 1; +const N_WG: u32 = 16; + pub fn render_scene(rc: &mut impl RenderContext) { let mut rng = rand::thread_rng(); for _ in 0..N_CIRCLES { @@ -98,10 +100,10 @@ fn dump_scene(buf: &[u8]) { } #[allow(unused)] -fn dump_k1_data(k1_buf: &[u32]) { +pub fn dump_k1_data(k1_buf: &[u32]) { for i in 0..k1_buf.len() { if k1_buf[i] != 0 { - println!("{:4x}: {:8x}", i, k1_buf[i]); + println!("{:4x}: {:8x}", i * 4, k1_buf[i]); } } } @@ -114,10 +116,17 @@ pub struct Renderer { pub state_buf: D::Buffer, pub anno_buf: D::Buffer, + pub bin_buf: D::Buffer, el_pipeline: D::Pipeline, el_ds: D::DescriptorSet, + bin_pipeline: D::Pipeline, + bin_ds: D::DescriptorSet, + + bin_alloc_buf_host: D::Buffer, + bin_alloc_buf_dev: D::Buffer, + /* k1_alloc_buf_host: D::Buffer, k1_alloc_buf_dev: D::Buffer, @@ -149,6 +158,9 @@ impl Renderer { let host = MemFlags::host_coherent(); let dev = MemFlags::device_local(); + let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); + println!("scene: {} elements", n_elements); + let scene_buf = device .create_buffer(std::mem::size_of_val(&scene[..]) as u64, 
host) .unwrap(); @@ -159,6 +171,7 @@ impl Renderer { let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?; + let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; let el_code = include_bytes!("../shader/elements.spv"); @@ -169,8 +182,25 @@ impl Renderer { &[], )?; - let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); - println!("scene: {} elements", n_elements); + let bin_alloc_buf_host = device.create_buffer(12, host)?; + let bin_alloc_buf_dev = device.create_buffer(12, dev)?; + + // TODO: constants + let bin_alloc_start = 256 * 64 * N_WG; + device + .write_buffer(&bin_alloc_buf_host, &[ + n_elements as u32, + 0, + bin_alloc_start, + ]) + ?; + let bin_code = include_bytes!("../shader/binning.spv"); + let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?; + let bin_ds = device.create_descriptor_set( + &bin_pipeline, + &[&anno_buf, &bin_alloc_buf_dev, &bin_buf], + &[], + )?; /* let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?; @@ -253,14 +283,20 @@ impl Renderer { image_dev, el_pipeline, el_ds, + bin_pipeline, + bin_ds, state_buf, anno_buf, + bin_buf, + bin_alloc_buf_host, + bin_alloc_buf_dev, n_elements, }) } pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf, query_pool: &D::QueryPool) { cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev); + cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev); cmd_buf.memory_barrier(); cmd_buf.image_barrier( &self.image_dev, @@ -276,6 +312,13 @@ impl Renderer { ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &self.bin_pipeline, + &self.bin_ds, + (N_WG, 1, 1), + ); + cmd_buf.write_timestamp(&query_pool, 2); + cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } }