diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs index f5e42af..b6df77d 100644 --- a/piet-gpu-types/src/ptcl.rs +++ b/piet-gpu-types/src/ptcl.rs @@ -4,8 +4,9 @@ piet_gpu! { #[gpu_write] mod ptcl { struct CmdCircle { - // In existing code, this is packed; we might need an annotation for this. - bbox: [u16; 4], + center: [f32; 2], + radius: f32, + rgba_color: u32, } struct CmdLine { start: [f32; 2], diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 5befa7f..ada8694 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -10,3 +10,5 @@ rule glsl build image.spv: glsl image.comp | scene.h build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h + +build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h diff --git a/piet-gpu/shader/image.comp b/piet-gpu/shader/image.comp index 60739d5..6d84eb5 100644 --- a/piet-gpu/shader/image.comp +++ b/piet-gpu/shader/image.comp @@ -40,7 +40,7 @@ void main() { if (tag == PietItem_Circle) { PietCircle circle = PietItem_Circle_read(item_ref); float r = length(xy + vec2(0.5, 0.5) - circle.center.xy); - float alpha = clamp(circle.radius - r, 0.0, 1.0); + float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0); vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color); // TODO: sRGB rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); diff --git a/piet-gpu/shader/image.spv b/piet-gpu/shader/image.spv index 527c9ae..097add1 100644 Binary files a/piet-gpu/shader/image.spv and b/piet-gpu/shader/image.spv differ diff --git a/piet-gpu/shader/kernel1.comp b/piet-gpu/shader/kernel1.comp index 436b8bd..3a4156c 100644 --- a/piet-gpu/shader/kernel1.comp +++ b/piet-gpu/shader/kernel1.comp @@ -1,3 +1,16 @@ +// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph +// and outputs "instances" (references to item + translation) for each item +// that intersects the tilegroup. +// +// This implementation is simplistic and leaves a lot of performance on the +// table. 
A fancier implementation would use threadgroup shared memory or +// subgroups (or possibly both) to parallelize the reading of the input and +// the computation of tilegroup intersection. +// +// In addition, there are some features currently missing. One is the use of +// a bump allocator to extend the current fixed allocation. Another is support +// for clipping. + #version 450 #extension GL_GOOGLE_include_directive : enable @@ -18,10 +31,10 @@ layout(set = 0, binding = 1) buffer TilegroupBuf { // TODO: compute this #define WIDTH_IN_TILEGROUPS 4 -#define TILEGROUP_WIDTH 512 -#define TILEGROUP_HEIGHT 16 +#define TILEGROUP_WIDTH_PX 512 +#define TILEGROUP_HEIGHT_PX 16 -#define INITIAL_ALLOC 1024 +#define TILEGROUP_INITIAL_ALLOC 1024 #define MAX_STACK 8 @@ -35,8 +48,8 @@ void main() { StackElement stack[MAX_STACK]; uint stack_ix = 0; uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x; - TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * INITIAL_ALLOC); - vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH, TILEGROUP_HEIGHT); + TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); + vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX); PietItemRef root = PietItemRef(0); SimpleGroup group = PietItem_Group_read(root); StackElement tos = StackElement(root, 0, group.offset.xy); @@ -45,8 +58,8 @@ void main() { if (tos.index < group.n_items) { Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index)); vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy; - bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH)) - && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT)); + bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX)) + && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX)); bool is_group = false; if (hit) { PietItemRef item_ref = PietItem_index(group.items, tos.index); diff --git 
a/piet-gpu/shader/kernel3.comp b/piet-gpu/shader/kernel3.comp new file mode 100644 index 0000000..f9f9362 --- /dev/null +++ b/piet-gpu/shader/kernel3.comp @@ -0,0 +1,72 @@ +#version 450 +#extension GL_GOOGLE_include_directive : enable + +layout(local_size_x = 32, local_size_y = 1) in; + +layout(set = 0, binding = 0) readonly buffer SceneBuf { + uint[] scene; +}; + +// TODO: this should have a `readonly` qualifier, but then inclusion +// of ptcl.h would fail because of the writers. +layout(set = 0, binding = 1) buffer TilegroupBuf { + uint[] tilegroup; +}; + +layout(set = 0, binding = 2) buffer PtclBuf { + uint[] ptcl; +}; + +#include "scene.h" +#include "tilegroup.h" +#include "ptcl.h" + +// TODO: compute all these + +#define WIDTH_IN_TILEGROUPS 4 +#define WIDTH_IN_TILES 128 +#define TILEGROUP_WIDTH_TILES 32 +#define TILE_WIDTH_PX 16 +#define TILE_HEIGHT_PX 16 + +// Must be the same as kernel1. Might be a good idea to move these particular +// constants to their own .h file. +#define TILEGROUP_INITIAL_ALLOC 1024 + +#define PTCL_INITIAL_ALLOC 4096 + +void main() { + uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x; + uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); + vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); + TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); + + while (true) { + uint tg_tag = TileGroup_tag(tg_ref); + if (tg_tag == TileGroup_End) { + break; + } + // Assume tg_tag is `Instance`, though there will be more cases. 
+ Instance ins = TileGroup_Instance_read(tg_ref); + PietItemRef item_ref = PietItemRef(ins.item_ref); + uint item_tag = PietItem_tag(item_ref); + switch (item_tag) { + case PietItem_Circle: + PietCircle circle = PietItem_Circle_read(item_ref); + vec2 center = ins.offset + circle.center.xy; + float r = circle.radius; + if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX)) + && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX))) + { + CmdCircle cmd = CmdCircle(center, r, circle.rgba_color); + Cmd_Circle_write(cmd_ref, cmd); + cmd_ref.offset += Cmd_size; + } + break; + } + tg_ref.offset += TileGroup_size; + } + Cmd_End_write(cmd_ref); +} diff --git a/piet-gpu/shader/kernel3.spv b/piet-gpu/shader/kernel3.spv new file mode 100644 index 0000000..23a7c3e Binary files /dev/null and b/piet-gpu/shader/kernel3.spv differ diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h new file mode 100644 index 0000000..583cc10 --- /dev/null +++ b/piet-gpu/shader/ptcl.h @@ -0,0 +1,323 @@ +// Code auto-generated by piet-gpu-derive + +struct CmdCircleRef { + uint offset; +}; + +struct CmdLineRef { + uint offset; +}; + +struct CmdStrokeRef { + uint offset; +}; + +struct CmdFillRef { + uint offset; +}; + +struct CmdFillEdgeRef { + uint offset; +}; + +struct CmdDrawFillRef { + uint offset; +}; + +struct CmdSolidRef { + uint offset; +}; + +struct CmdRef { + uint offset; +}; + +struct CmdCircle { + vec2 center; + float radius; + uint rgba_color; +}; + +#define CmdCircle_size 16 + +CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) { + return CmdCircleRef(ref.offset + index * CmdCircle_size); +} + +struct CmdLine { + vec2 start; + vec2 end; +}; + +#define CmdLine_size 16 + +CmdLineRef CmdLine_index(CmdLineRef ref, uint index) { + return CmdLineRef(ref.offset + index * CmdLine_size); +} + +struct CmdStroke { + float halfWidth; + uint rgba_color; +}; + +#define CmdStroke_size 8 + +CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint 
index) { + return CmdStrokeRef(ref.offset + index * CmdStroke_size); +} + +struct CmdFill { + vec2 start; + vec2 end; +}; + +#define CmdFill_size 16 + +CmdFillRef CmdFill_index(CmdFillRef ref, uint index) { + return CmdFillRef(ref.offset + index * CmdFill_size); +} + +struct CmdFillEdge { + int sign; + float y; +}; + +#define CmdFillEdge_size 8 + +CmdFillEdgeRef CmdFillEdge_index(CmdFillEdgeRef ref, uint index) { + return CmdFillEdgeRef(ref.offset + index * CmdFillEdge_size); +} + +struct CmdDrawFill { + int backdrop; + uint rgba_color; +}; + +#define CmdDrawFill_size 8 + +CmdDrawFillRef CmdDrawFill_index(CmdDrawFillRef ref, uint index) { + return CmdDrawFillRef(ref.offset + index * CmdDrawFill_size); +} + +struct CmdSolid { + uint rgba_color; +}; + +#define CmdSolid_size 4 + +CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) { + return CmdSolidRef(ref.offset + index * CmdSolid_size); +} + +#define Cmd_End 0 +#define Cmd_Circle 1 +#define Cmd_Line 2 +#define Cmd_Fill 3 +#define Cmd_Stroke 4 +#define Cmd_FillEdge 5 +#define Cmd_DrawFill 6 +#define Cmd_Solid 7 +#define Cmd_Bail 8 +#define Cmd_size 20 + +CmdRef Cmd_index(CmdRef ref, uint index) { + return CmdRef(ref.offset + index * Cmd_size); +} + +CmdCircle CmdCircle_read(CmdCircleRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 3]; + CmdCircle s; + s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.radius = uintBitsToFloat(raw2); + s.rgba_color = raw3; + return s; +} + +void CmdCircle_write(CmdCircleRef ref, CmdCircle s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.center.x); + ptcl[ix + 1] = floatBitsToUint(s.center.y); + ptcl[ix + 2] = floatBitsToUint(s.radius); + ptcl[ix + 3] = s.rgba_color; +} + +CmdLine CmdLine_read(CmdLineRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 
3]; + CmdLine s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void CmdLine_write(CmdLineRef ref, CmdLine s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.start.x); + ptcl[ix + 1] = floatBitsToUint(s.start.y); + ptcl[ix + 2] = floatBitsToUint(s.end.x); + ptcl[ix + 3] = floatBitsToUint(s.end.y); +} + +CmdStroke CmdStroke_read(CmdStrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + CmdStroke s; + s.halfWidth = uintBitsToFloat(raw0); + s.rgba_color = raw1; + return s; +} + +void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.halfWidth); + ptcl[ix + 1] = s.rgba_color; +} + +CmdFill CmdFill_read(CmdFillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + uint raw2 = ptcl[ix + 2]; + uint raw3 = ptcl[ix + 3]; + CmdFill s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void CmdFill_write(CmdFillRef ref, CmdFill s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = floatBitsToUint(s.start.x); + ptcl[ix + 1] = floatBitsToUint(s.start.y); + ptcl[ix + 2] = floatBitsToUint(s.end.x); + ptcl[ix + 3] = floatBitsToUint(s.end.y); +} + +CmdFillEdge CmdFillEdge_read(CmdFillEdgeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + CmdFillEdge s; + s.sign = int(raw0); + s.y = uintBitsToFloat(raw1); + return s; +} + +void CmdFillEdge_write(CmdFillEdgeRef ref, CmdFillEdge s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = uint(s.sign); + ptcl[ix + 1] = floatBitsToUint(s.y); +} + +CmdDrawFill CmdDrawFill_read(CmdDrawFillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + uint raw1 = ptcl[ix + 1]; + CmdDrawFill s; + s.backdrop = int(raw0); + s.rgba_color = raw1; 
+ return s; +} + +void CmdDrawFill_write(CmdDrawFillRef ref, CmdDrawFill s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = uint(s.backdrop); + ptcl[ix + 1] = s.rgba_color; +} + +CmdSolid CmdSolid_read(CmdSolidRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = ptcl[ix + 0]; + CmdSolid s; + s.rgba_color = raw0; + return s; +} + +void CmdSolid_write(CmdSolidRef ref, CmdSolid s) { + uint ix = ref.offset >> 2; + ptcl[ix + 0] = s.rgba_color; +} + +uint Cmd_tag(CmdRef ref) { + return ptcl[ref.offset >> 2]; +} + +CmdCircle Cmd_Circle_read(CmdRef ref) { + return CmdCircle_read(CmdCircleRef(ref.offset + 4)); +} + +CmdLine Cmd_Line_read(CmdRef ref) { + return CmdLine_read(CmdLineRef(ref.offset + 4)); +} + +CmdFill Cmd_Fill_read(CmdRef ref) { + return CmdFill_read(CmdFillRef(ref.offset + 4)); +} + +CmdStroke Cmd_Stroke_read(CmdRef ref) { + return CmdStroke_read(CmdStrokeRef(ref.offset + 4)); +} + +CmdFillEdge Cmd_FillEdge_read(CmdRef ref) { + return CmdFillEdge_read(CmdFillEdgeRef(ref.offset + 4)); +} + +CmdDrawFill Cmd_DrawFill_read(CmdRef ref) { + return CmdDrawFill_read(CmdDrawFillRef(ref.offset + 4)); +} + +CmdSolid Cmd_Solid_read(CmdRef ref) { + return CmdSolid_read(CmdSolidRef(ref.offset + 4)); +} + +void Cmd_End_write(CmdRef ref) { + ptcl[ref.offset >> 2] = Cmd_End; +} + +void Cmd_Circle_write(CmdRef ref, CmdCircle s) { + ptcl[ref.offset >> 2] = Cmd_Circle; + CmdCircle_write(CmdCircleRef(ref.offset + 4), s); +} + +void Cmd_Line_write(CmdRef ref, CmdLine s) { + ptcl[ref.offset >> 2] = Cmd_Line; + CmdLine_write(CmdLineRef(ref.offset + 4), s); +} + +void Cmd_Fill_write(CmdRef ref, CmdFill s) { + ptcl[ref.offset >> 2] = Cmd_Fill; + CmdFill_write(CmdFillRef(ref.offset + 4), s); +} + +void Cmd_Stroke_write(CmdRef ref, CmdStroke s) { + ptcl[ref.offset >> 2] = Cmd_Stroke; + CmdStroke_write(CmdStrokeRef(ref.offset + 4), s); +} + +void Cmd_FillEdge_write(CmdRef ref, CmdFillEdge s) { + ptcl[ref.offset >> 2] = Cmd_FillEdge; + CmdFillEdge_write(CmdFillEdgeRef(ref.offset + 4), 
s); +} + +void Cmd_DrawFill_write(CmdRef ref, CmdDrawFill s) { + ptcl[ref.offset >> 2] = Cmd_DrawFill; + CmdDrawFill_write(CmdDrawFillRef(ref.offset + 4), s); +} + +void Cmd_Solid_write(CmdRef ref, CmdSolid s) { + ptcl[ref.offset >> 2] = Cmd_Solid; + CmdSolid_write(CmdSolidRef(ref.offset + 4), s); +} + +void Cmd_Bail_write(CmdRef ref) { + ptcl[ref.offset >> 2] = Cmd_Bail; +} + diff --git a/piet-gpu/src/main.rs b/piet-gpu/src/main.rs index 72f0d3c..56b73ca 100644 --- a/piet-gpu/src/main.rs +++ b/piet-gpu/src/main.rs @@ -73,6 +73,7 @@ fn dump_scene(buf: &[u8]) { } } +#[allow(unused)] fn dump_k1_data(k1_buf: &[u32]) { for i in 0..k1_buf.len() { if k1_buf[i] != 0 { @@ -96,7 +97,9 @@ fn main() { .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev) .unwrap(); device.write_buffer(&scene_buf, &scene).unwrap(); + // These should only be on the host if we're going to examine them from Rust. let tilegroup_buf = device.create_buffer(384 * 1024, host).unwrap(); + let ptcl_buf = device.create_buffer(12 * 1024 * 4096, host).unwrap(); let image_buf = device .create_buffer((WIDTH * HEIGHT * 4) as u64, host) .unwrap(); @@ -110,16 +113,23 @@ fn main() { .create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf]) .unwrap(); + let k3_code = include_bytes!("../shader/kernel3.spv"); + let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 3).unwrap(); + let k3_ds = device + .create_descriptor_set(&k3_pipeline, &[&scene_dev, &tilegroup_buf, &ptcl_buf]) + .unwrap(); + let code = include_bytes!("../shader/image.spv"); let pipeline = device.create_simple_compute_pipeline(code, 2).unwrap(); let descriptor_set = device .create_descriptor_set(&pipeline, &[&scene_dev, &image_dev]) .unwrap(); - let query_pool = device.create_query_pool(3).unwrap(); + let query_pool = device.create_query_pool(4).unwrap(); let mut cmd_buf = device.create_cmd_buf().unwrap(); cmd_buf.begin(); cmd_buf.copy_buffer(&scene_buf, &scene_dev); cmd_buf.clear_buffer(&tilegroup_buf); + 
cmd_buf.clear_buffer(&ptcl_buf); cmd_buf.memory_barrier(); cmd_buf.write_timestamp(&query_pool, 0); cmd_buf.dispatch( @@ -129,22 +139,36 @@ fn main() { ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &k3_pipeline, + &k3_ds, + ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1), + ); + cmd_buf.write_timestamp(&query_pool, 2); + cmd_buf.memory_barrier(); cmd_buf.dispatch( &pipeline, &descriptor_set, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); - cmd_buf.write_timestamp(&query_pool, 2); + cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); cmd_buf.copy_buffer(&image_dev, &image_buf); cmd_buf.finish(); device.run_cmd_buf(&cmd_buf).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap(); println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); - println!("Render time: {:.3}ms", (timestamps[1] - timestamps[0]) * 1e3); + println!( + "Kernel 3 time: {:.3}ms", + (timestamps[1] - timestamps[0]) * 1e3 + ); + println!( + "Render time: {:.3}ms", + (timestamps[2] - timestamps[1]) * 1e3 + ); let mut k1_data: Vec<u32> = Default::default(); - device.read_buffer(&tilegroup_buf, &mut k1_data).unwrap(); + device.read_buffer(&ptcl_buf, &mut k1_data).unwrap(); dump_k1_data(&k1_data); let mut img_data: Vec = Default::default();