diff --git a/piet-gpu-derive/src/glsl.rs b/piet-gpu-derive/src/glsl.rs index 5164179..617669a 100644 --- a/piet-gpu-derive/src/glsl.rs +++ b/piet-gpu-derive/src/glsl.rs @@ -219,6 +219,7 @@ fn gen_struct_write( fields: &[(String, usize, LayoutType)], ) { writeln!(r, "void {}_write({}Ref ref, {} s) {{", name, name, name).unwrap(); + writeln!(r, " uint ix = ref.offset >> 2;").unwrap(); let coverage = crate::layout::struct_coverage(fields, true); for (i, field_ixs) in coverage.iter().enumerate() { let mut pieces = Vec::new(); @@ -254,7 +255,7 @@ fn gen_struct_write( } } if !pieces.is_empty() { - write!(r, " {}[{}] = ", bufname, i).unwrap(); + write!(r, " {}[ix + {}] = ", bufname, i).unwrap(); for (j, piece) in pieces.iter().enumerate() { if j != 0 { write!(r, " | ").unwrap(); diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index c4a07a4..e788919 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -449,13 +449,7 @@ impl crate::CmdBuf for CmdBuf { unsafe fn clear_buffer(&self, buffer: &Buffer) { let device = &self.device.device; - device.cmd_fill_buffer( - self.cmd_buf, - buffer.buffer, - 0, - vk::WHOLE_SIZE, - 0 - ); + device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, vk::WHOLE_SIZE, 0); } unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { @@ -465,10 +459,7 @@ impl crate::CmdBuf for CmdBuf { self.cmd_buf, src.buffer, dst.buffer, - &[vk::BufferCopy::builder() - .size(size) - .build() - ] + &[vk::BufferCopy::builder().size(size).build()], ); } diff --git a/piet-gpu-types/src/lib.rs b/piet-gpu-types/src/lib.rs index 60c11ab..44d4843 100644 --- a/piet-gpu-types/src/lib.rs +++ b/piet-gpu-types/src/lib.rs @@ -1,3 +1,4 @@ pub mod encoder; pub mod ptcl; pub mod scene; +pub mod tilegroup; diff --git a/piet-gpu-types/src/main.rs b/piet-gpu-types/src/main.rs index 2a20c3b..7ed941f 100644 --- a/piet-gpu-types/src/main.rs +++ b/piet-gpu-types/src/main.rs @@ -5,6 +5,7 @@ fn main() { .expect("provide a module name"); match mod_name.as_str() { "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), + "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()), _ => println!("Oops, unknown module name"), } diff --git a/piet-gpu-types/src/scene.rs b/piet-gpu-types/src/scene.rs index 8e4ec3c..5f95c40 100644 --- a/piet-gpu-types/src/scene.rs +++ b/piet-gpu-types/src/scene.rs @@ -8,8 +8,7 @@ piet_gpu! { #[rust_encode] mod scene { struct Bbox { - // TODO: this should be i16 - bbox: [u16; 4], + bbox: [i16; 4], } struct Point { xy: [f32; 2], @@ -19,6 +18,7 @@ piet_gpu! { // Note: both of the following items are actually arrays items: Ref, bboxes: Ref, + offset: Point, } struct PietCircle { rgba_color: u32, @@ -45,6 +45,7 @@ piet_gpu! { points: Ref, } enum PietItem { + Group(SimpleGroup), Circle(PietCircle), Line(PietStrokeLine), Fill(PietFill), diff --git a/piet-gpu-types/src/tilegroup.rs b/piet-gpu-types/src/tilegroup.rs new file mode 100644 index 0000000..4824178 --- /dev/null +++ b/piet-gpu-types/src/tilegroup.rs @@ -0,0 +1,18 @@ +use piet_gpu_derive::piet_gpu; + +piet_gpu! { + #[gpu_write] + mod tilegroup { + struct Instance { + // Note: a better type would be `Ref` but to do that we + // would need cross-module references. Punt for now. + item_ref: u32, + // A better type would be Point. + offset: [f32; 2], + } + enum TileGroup { + Instance(Instance), + End, + } + } +} diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 037540b..5befa7f 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -8,3 +8,5 @@ rule glsl command = $glslang_validator -V -o $out $in build image.spv: glsl image.comp | scene.h + +build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h diff --git a/piet-gpu/shader/image.comp b/piet-gpu/shader/image.comp index b3e906c..60739d5 100644 --- a/piet-gpu/shader/image.comp +++ b/piet-gpu/shader/image.comp @@ -32,7 +32,7 @@ void main() { // which is horribly wasteful, but the goal is to get *some* output and // then optimize. - SimpleGroup group = SimpleGroup_read(SimpleGroupRef(0)); + SimpleGroup group = PietItem_Group_read(PietItemRef(0)); for (uint i = 0; i < group.n_items; i++) { PietItemRef item_ref = PietItem_index(group.items, i); uint tag = PietItem_tag(item_ref); diff --git a/piet-gpu/shader/image.spv b/piet-gpu/shader/image.spv index b1b6eb7..527c9ae 100644 Binary files a/piet-gpu/shader/image.spv and b/piet-gpu/shader/image.spv differ diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp new file mode 100644 index 0000000..436b8bd --- /dev/null +++ b/piet-gpu/shader/kernel1.comp @@ -0,0 +1,83 @@ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +// It's possible we should lay this out with x and do our own math. +layout(local_size_x = 1, local_size_y = 32) in; + +layout(set = 0, binding = 0) readonly buffer SceneBuf { + uint[] scene; +}; + +layout(set = 0, binding = 1) buffer TilegroupBuf { + uint[] tilegroup; +}; + +#include "scene.h" +#include "tilegroup.h" + +// TODO: compute this +#define WIDTH_IN_TILEGROUPS 4 + +#define TILEGROUP_WIDTH 512 +#define TILEGROUP_HEIGHT 16 + +#define INITIAL_ALLOC 1024 + +#define MAX_STACK 8 + +struct StackElement { + PietItemRef group; + uint index; + vec2 offset; +}; + +void main() { + StackElement stack[MAX_STACK]; + uint stack_ix = 0; + uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x; + TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * INITIAL_ALLOC); + vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH, TILEGROUP_HEIGHT); + PietItemRef root = PietItemRef(0); + SimpleGroup group = PietItem_Group_read(root); + StackElement tos = StackElement(root, 0, group.offset.xy); + + while (true) { + if (tos.index < group.n_items) { + Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index)); + vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy; + bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH)) + && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT)); + bool is_group = false; + if (hit) { + PietItemRef item_ref = PietItem_index(group.items, tos.index); + is_group = PietItem_tag(item_ref) == PietItem_Group; + } + if (hit && !is_group) { + PietItemRef item_ref = PietItem_index(group.items, tos.index); + Instance ins = Instance(item_ref.offset, tos.offset); + TileGroup_Instance_write(tg_ref, ins); + tg_ref.offset += TileGroup_size; + // TODO: bump allocate if allocation exceeded + } + if (is_group) { + PietItemRef item_ref = PietItem_index(group.items, tos.index); + tos.index++; + if (tos.index < group.n_items) { + stack[stack_ix++] = tos; + } + group = PietItem_Group_read(item_ref); + tos = StackElement(item_ref, 0, tos.offset + group.offset.xy); + } else { + tos.index++; + } + } else { + // processed all items in this group; pop the stack + if (stack_ix == 0) { + break; + } + tos = stack[--stack_ix]; + group = PietItem_Group_read(tos.group); + } + } + TileGroup_End_write(tg_ref); +} diff --git a/piet-gpu/shader/kernel1.spv b/piet-gpu/shader/kernel1.spv new file mode 100644 index 0000000..0e9a497 Binary files /dev/null and b/piet-gpu/shader/kernel1.spv differ diff --git a/piet-gpu/shader/scene.h b/piet-gpu/shader/scene.h index 440f491..5e36abc 100644 --- a/piet-gpu/shader/scene.h +++ b/piet-gpu/shader/scene.h @@ -33,7 +33,7 @@ struct PietItemRef { }; struct Bbox { - uvec4 bbox; + ivec4 bbox; }; #define Bbox_size 8 @@ -56,9 +56,10 @@ struct SimpleGroup { uint n_items; PietItemRef items; BboxRef bboxes; + Point offset; }; -#define SimpleGroup_size 12 +#define SimpleGroup_size 20 SimpleGroupRef SimpleGroup_index(SimpleGroupRef ref, uint index) { return SimpleGroupRef(ref.offset + index * SimpleGroup_size); @@ -116,10 +117,11 @@ PietStrokePolyLineRef PietStrokePolyLine_index(PietStrokePolyLineRef ref, uint i return PietStrokePolyLineRef(ref.offset + index * PietStrokePolyLine_size); } -#define PietItem_Circle 0 -#define PietItem_Line 1 -#define PietItem_Fill 2 -#define PietItem_Poly 3 +#define PietItem_Group 0 +#define PietItem_Circle 1 +#define PietItem_Line 2 +#define PietItem_Fill 3 +#define PietItem_Poly 4 #define PietItem_size 32 PietItemRef PietItem_index(PietItemRef ref, uint index) { @@ -131,7 +133,7 @@ Bbox Bbox_read(BboxRef ref) { uint raw0 = scene[ix + 0]; uint raw1 = scene[ix + 1]; Bbox s; - s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16); + s.bbox = ivec4(int(raw0 << 16) >> 16, int(raw0) >> 16, int(raw1 << 16) >> 16, int(raw1) >> 16); return s; } @@ -153,6 +155,7 @@ SimpleGroup SimpleGroup_read(SimpleGroupRef ref) { s.n_items = raw0; s.items = PietItemRef(raw1); s.bboxes = BboxRef(raw2); + s.offset = Point_read(PointRef(ref.offset + 12)); return s; } @@ -213,6 +216,10 @@ uint PietItem_tag(PietItemRef ref) { return scene[ref.offset >> 2]; } +SimpleGroup PietItem_Group_read(PietItemRef ref) { + return SimpleGroup_read(SimpleGroupRef(ref.offset + 4)); +} + PietCircle PietItem_Circle_read(PietItemRef ref) { return PietCircle_read(PietCircleRef(ref.offset + 4)); } diff --git a/piet-gpu/shader/tilegroup.h b/piet-gpu/shader/tilegroup.h new file mode 100644 index 0000000..f1d646f --- /dev/null +++ b/piet-gpu/shader/tilegroup.h @@ -0,0 +1,64 @@ +// Code auto-generated by piet-gpu-derive + +struct InstanceRef { + uint offset; +}; + +struct TileGroupRef { + uint offset; +}; + +struct Instance { + uint item_ref; + vec2 offset; +}; + +#define Instance_size 12 + +InstanceRef Instance_index(InstanceRef ref, uint index) { + return InstanceRef(ref.offset + index * Instance_size); +} + +#define TileGroup_Instance 0 +#define TileGroup_End 1 +#define TileGroup_size 16 + +TileGroupRef TileGroup_index(TileGroupRef ref, uint index) { + return TileGroupRef(ref.offset + index * TileGroup_size); +} + +Instance Instance_read(InstanceRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = tilegroup[ix + 0]; + uint raw1 = tilegroup[ix + 1]; + uint raw2 = tilegroup[ix + 2]; + Instance s; + s.item_ref = raw0; + s.offset = vec2(uintBitsToFloat(raw1), uintBitsToFloat(raw2)); + return s; +} + +void Instance_write(InstanceRef ref, Instance s) { + uint ix = ref.offset >> 2; + tilegroup[ix + 0] = s.item_ref; + tilegroup[ix + 1] = floatBitsToUint(s.offset.x); + tilegroup[ix + 2] = floatBitsToUint(s.offset.y); +} + +uint TileGroup_tag(TileGroupRef ref) { + return tilegroup[ref.offset >> 2]; +} + +Instance TileGroup_Instance_read(TileGroupRef ref) { + return Instance_read(InstanceRef(ref.offset + 4)); +} + +void TileGroup_Instance_write(TileGroupRef ref, Instance s) { + tilegroup[ref.offset >> 2] = TileGroup_Instance; + Instance_write(InstanceRef(ref.offset + 4), s); +} + +void TileGroup_End_write(TileGroupRef ref) { + tilegroup[ref.offset >> 2] = TileGroup_End; +} + diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs index 0bc4375..72f0d3c 100644 --- a/piet-gpu/src/main.rs +++ b/piet-gpu/src/main.rs @@ -21,7 +21,7 @@ const N_CIRCLES: usize = 100; fn make_scene() -> Vec { let mut rng = rand::thread_rng(); let mut encoder = Encoder::new(); - let _reserve_root = encoder.alloc_chunk(SimpleGroup::fixed_size() as u32); + let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32); let mut items = Vec::new(); let mut bboxes = Vec::new(); @@ -36,23 +36,30 @@ fn make_scene() -> Vec { }, radius: rng.gen_range(0.0, 50.0), }; - items.push(PietItem::Circle(circle)); let bbox = Bbox { - // TODO: real bbox - bbox: [0, 0, 0, 0], + bbox: [ + (circle.center.xy[0] - circle.radius).floor() as i16, + (circle.center.xy[1] - circle.radius).floor() as i16, + (circle.center.xy[0] + circle.radius).ceil() as i16, + (circle.center.xy[1] + circle.radius).ceil() as i16, + ], }; + items.push(PietItem::Circle(circle)); bboxes.push(bbox); } let n_items = bboxes.len() as u32; let bboxes = bboxes.encode(&mut encoder).transmute(); let items = items.encode(&mut encoder).transmute(); + let offset = Point { xy: [0.0, 0.0] }; let simple_group = SimpleGroup { n_items, bboxes, items, + offset, }; - simple_group.encode_to(&mut encoder.buf_mut()[0..SimpleGroup::fixed_size()]); + let root_item = PietItem::Group(simple_group); + root_item.encode_to(&mut encoder.buf_mut()[0..PietItem::fixed_size()]); // We should avoid this clone. encoder.buf().to_owned() } @@ -66,6 +73,14 @@ fn dump_scene(buf: &[u8]) { } } +fn dump_k1_data(k1_buf: &[u32]) { + for i in 0..k1_buf.len() { + if k1_buf[i] != 0 { + println!("{:4x}: {:8x}", i, k1_buf[i]); + } + } +} + fn main() { let instance = VkInstance::new().unwrap(); unsafe { @@ -81,35 +96,57 @@ fn main() { .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev) .unwrap(); device.write_buffer(&scene_buf, &scene).unwrap(); + let tilegroup_buf = device.create_buffer(384 * 1024, host).unwrap(); let image_buf = device .create_buffer((WIDTH * HEIGHT * 4) as u64, host) .unwrap(); let image_dev = device .create_buffer((WIDTH * HEIGHT * 4) as u64, dev) .unwrap(); + + let k1_code = include_bytes!("../shader/kernel1.spv"); + let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 2).unwrap(); + let k1_ds = device + .create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf]) + .unwrap(); + let code = include_bytes!("../shader/image.spv"); let pipeline = device.create_simple_compute_pipeline(code, 2).unwrap(); let descriptor_set = device .create_descriptor_set(&pipeline, &[&scene_dev, &image_dev]) .unwrap(); - let query_pool = device.create_query_pool(2).unwrap(); + let query_pool = device.create_query_pool(3).unwrap(); let mut cmd_buf = device.create_cmd_buf().unwrap(); cmd_buf.begin(); cmd_buf.copy_buffer(&scene_buf, &scene_dev); + cmd_buf.clear_buffer(&tilegroup_buf); cmd_buf.memory_barrier(); cmd_buf.write_timestamp(&query_pool, 0); + cmd_buf.dispatch( + &k1_pipeline, + &k1_ds, + ((WIDTH / 512) as u32, (HEIGHT / 512) as u32, 1), + ); + cmd_buf.write_timestamp(&query_pool, 1); + cmd_buf.memory_barrier(); cmd_buf.dispatch( &pipeline, &descriptor_set, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); - cmd_buf.write_timestamp(&query_pool, 1); + cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.memory_barrier(); cmd_buf.copy_buffer(&image_dev, &image_buf); cmd_buf.finish(); device.run_cmd_buf(&cmd_buf).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap(); - println!("Render time: {:.3}ms", timestamps[0] * 1e3); + println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); + println!("Render time: {:.3}ms", (timestamps[1] - timestamps[0]) * 1e3); + + let mut k1_data: Vec = Default::default(); + device.read_buffer(&tilegroup_buf, &mut k1_data).unwrap(); + dump_k1_data(&k1_data); + let mut img_data: Vec = Default::default(); // Note: because png can use a `&[u8]` slice, we could avoid an extra copy // (probably passing a slice into a closure). But for now: keep it simple.