Add first draft of kernel 3

A fairly simple approach, but it adds the translation (not tested yet
in scene encoding) and does bounding box culling.
This commit is contained in:
Raph Levien 2020-04-21 17:55:17 -07:00
parent 2ed89dd65e
commit 6976f877e0
9 changed files with 449 additions and 14 deletions

View file

@ -4,8 +4,9 @@ piet_gpu! {
#[gpu_write] #[gpu_write]
mod ptcl { mod ptcl {
struct CmdCircle { struct CmdCircle {
// In existing code, this is packed; we might need an annotation for this. center: [f32; 2],
bbox: [u16; 4], radius: f32,
rgba_color: u32,
} }
struct CmdLine { struct CmdLine {
start: [f32; 2], start: [f32; 2],

View file

@ -10,3 +10,5 @@ rule glsl
build image.spv: glsl image.comp | scene.h build image.spv: glsl image.comp | scene.h
build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h
build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h

View file

@ -40,7 +40,7 @@ void main() {
if (tag == PietItem_Circle) { if (tag == PietItem_Circle) {
PietCircle circle = PietItem_Circle_read(item_ref); PietCircle circle = PietItem_Circle_read(item_ref);
float r = length(xy + vec2(0.5, 0.5) - circle.center.xy); float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(circle.radius - r, 0.0, 1.0); float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color); vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color);
// TODO: sRGB // TODO: sRGB
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);

Binary file not shown.

View file

@ -1,3 +1,16 @@
// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
// and outputs "instances" (references to item + translation) for each item
// that intersects the tilegroup.
//
// This implementation is simplistic and leaves a lot of performance on the
// table. A fancier implementation would use threadgroup shared memory or
// subgroups (or possibly both) to parallelize the reading of the input and
// the computation of tilegroup intersection.
//
// In addition, there are some features currently missing. One is the use of
// a bump allocator to extend the current fixed allocation. Another is support
// for clipping.
#version 450 #version 450
#extension GL_GOOGLE_include_directive : enable #extension GL_GOOGLE_include_directive : enable
@ -18,10 +31,10 @@ layout(set = 0, binding = 1) buffer TilegroupBuf {
// TODO: compute this // TODO: compute this
#define WIDTH_IN_TILEGROUPS 4 #define WIDTH_IN_TILEGROUPS 4
#define TILEGROUP_WIDTH 512 #define TILEGROUP_WIDTH_PX 512
#define TILEGROUP_HEIGHT 16 #define TILEGROUP_HEIGHT_PX 16
#define INITIAL_ALLOC 1024 #define TILEGROUP_INITIAL_ALLOC 1024
#define MAX_STACK 8 #define MAX_STACK 8
@ -35,8 +48,8 @@ void main() {
StackElement stack[MAX_STACK]; StackElement stack[MAX_STACK];
uint stack_ix = 0; uint stack_ix = 0;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x; uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * INITIAL_ALLOC); TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH, TILEGROUP_HEIGHT); vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
PietItemRef root = PietItemRef(0); PietItemRef root = PietItemRef(0);
SimpleGroup group = PietItem_Group_read(root); SimpleGroup group = PietItem_Group_read(root);
StackElement tos = StackElement(root, 0, group.offset.xy); StackElement tos = StackElement(root, 0, group.offset.xy);
@ -45,8 +58,8 @@ void main() {
if (tos.index < group.n_items) { if (tos.index < group.n_items) {
Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index)); Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy; vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH)) bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
&& max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT)); && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
bool is_group = false; bool is_group = false;
if (hit) { if (hit) {
PietItemRef item_ref = PietItem_index(group.items, tos.index); PietItemRef item_ref = PietItem_index(group.items, tos.index);

View file

@ -0,0 +1,72 @@
// This appears to be "kernel 3" of the pipeline (kernel3.comp): it reads the
// per-tilegroup instance lists and writes a per-tile command list into the
// ptcl buffer, culling each item against the tile's bounds.
#version 450
#extension GL_GOOGLE_include_directive : enable
// One invocation per tile. A workgroup is 32 invocations wide, the same value
// as TILEGROUP_WIDTH_TILES below.
layout(local_size_x = 32, local_size_y = 1) in;
// Encoded scene; only read by this kernel.
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// TODO: this should have a `readonly` qualifier, but then inclusion of the
// header containing this buffer's writers would fail.
// NOTE(review): that header is presumably tilegroup.h rather than ptcl.h
// (ptcl.h only writes to `ptcl`) — confirm.
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
// Output: per-tile command lists, written through the accessors in ptcl.h.
layout(set = 0, binding = 2) buffer PtclBuf {
uint[] ptcl;
};
#include "scene.h"
#include "tilegroup.h"
#include "ptcl.h"
// TODO: compute all these
// Number of tilegroups in one row (row stride of the tilegroup index).
#define WIDTH_IN_TILEGROUPS 4
// Number of tiles in one row; equals WIDTH_IN_TILEGROUPS * TILEGROUP_WIDTH_TILES.
#define WIDTH_IN_TILES 128
// Number of tiles spanned horizontally by one tilegroup.
#define TILEGROUP_WIDTH_TILES 32
// Tile dimensions in pixels.
#define TILE_WIDTH_PX 16
#define TILE_HEIGHT_PX 16
// Must be the same as kernel1. Might be a good idea to move these particular
// constants to their own .h file.
// Bytes reserved per tilegroup for its instance list.
#define TILEGROUP_INITIAL_ALLOC 1024
// Bytes reserved per tile for its command list.
#define PTCL_INITIAL_ALLOC 4096
// Builds the command list for a single tile.
//
// Each invocation handles the tile at gl_GlobalInvocationID.xy. It walks the
// instance list of the tilegroup containing that tile and, for every circle
// whose translated bounding box overlaps the tile, appends a CmdCircle to the
// tile's fixed-size slice of the ptcl buffer. The list is terminated with
// Cmd_End.
void main() {
    uvec2 tile_xy = gl_GlobalInvocationID.xy;
    uint tile_ix = tile_xy.y * WIDTH_IN_TILES + tile_xy.x;
    // Row-major tilegroup index. Using the tile row directly assumes a
    // tilegroup is exactly one tile tall — TODO confirm against kernel1.
    uint tilegroup_ix = tile_xy.y * WIDTH_IN_TILEGROUPS
        + (tile_xy.x / TILEGROUP_WIDTH_TILES);
    // Top-left corner of this tile in pixels.
    vec2 xy0 = vec2(tile_xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
    // One past the last byte of this tile's slice of the ptcl buffer.
    uint cmd_end = cmd_ref.offset + PTCL_INITIAL_ALLOC;

    // Continue only while there is room for one more command plus a slot for
    // the terminating Cmd_End. Commands that do not fit are dropped rather
    // than spilling into the next tile's slice (or past the end of the
    // buffer); lifting this limit needs a real allocator.
    while (cmd_ref.offset + 2 * Cmd_size <= cmd_end) {
        uint tg_tag = TileGroup_tag(tg_ref);
        if (tg_tag == TileGroup_End) {
            break;
        }
        // Assume tg_tag is `Instance`, though there will be more cases.
        Instance ins = TileGroup_Instance_read(tg_ref);
        PietItemRef item_ref = PietItemRef(ins.item_ref);
        uint item_tag = PietItem_tag(item_ref);
        switch (item_tag) {
        case PietItem_Circle:
            PietCircle circle = PietItem_Circle_read(item_ref);
            // Apply the instance translation to get the final center.
            vec2 center = ins.offset + circle.center.xy;
            float r = circle.radius;
            // Emit the circle only if its bounding box overlaps this tile.
            if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
                && max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
            {
                CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
                Cmd_Circle_write(cmd_ref, cmd);
                cmd_ref.offset += Cmd_size;
            }
            break;
        }
        tg_ref.offset += TileGroup_size;
    }
    Cmd_End_write(cmd_ref);
}

BIN
piet-gpu/shader/kernel3.spv Normal file

Binary file not shown.

323
piet-gpu/shader/ptcl.h Normal file
View file

@ -0,0 +1,323 @@
// Code auto-generated by piet-gpu-derive
// Typed references into the `ptcl` buffer. Each one wraps a byte offset; the
// distinct struct types only keep references to different record kinds from
// being mixed up.

// Reference to a serialized CmdCircle.
struct CmdCircleRef {
uint offset;
};
// Reference to a serialized CmdLine.
struct CmdLineRef {
uint offset;
};
// Reference to a serialized CmdStroke.
struct CmdStrokeRef {
uint offset;
};
// Reference to a serialized CmdFill.
struct CmdFillRef {
uint offset;
};
// Reference to a serialized CmdFillEdge.
struct CmdFillEdgeRef {
uint offset;
};
// Reference to a serialized CmdDrawFill.
struct CmdDrawFillRef {
uint offset;
};
// Reference to a serialized CmdSolid.
struct CmdSolidRef {
uint offset;
};
// Reference to a tagged Cmd record (tag word followed by a payload).
struct CmdRef {
uint offset;
};
// Payload of the Circle command.
struct CmdCircle {
    vec2 center;
    float radius;
    uint rgba_color;
};

// Serialized size of a CmdCircle in bytes (four 32-bit words).
#define CmdCircle_size 16

// Returns a reference to element `index` of a CmdCircle array starting at `ref`.
CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) {
    uint byte_offset = index * CmdCircle_size + ref.offset;
    return CmdCircleRef(byte_offset);
}
// Payload of the Line command: two endpoints.
struct CmdLine {
    vec2 start;
    vec2 end;
};

// Serialized size of a CmdLine in bytes (four 32-bit words).
#define CmdLine_size 16

// Returns a reference to element `index` of a CmdLine array starting at `ref`.
CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
    uint byte_offset = index * CmdLine_size + ref.offset;
    return CmdLineRef(byte_offset);
}
// Payload of the Stroke command.
struct CmdStroke {
    float halfWidth;
    uint rgba_color;
};

// Serialized size of a CmdStroke in bytes (two 32-bit words).
#define CmdStroke_size 8

// Returns a reference to element `index` of a CmdStroke array starting at `ref`.
CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
    uint byte_offset = index * CmdStroke_size + ref.offset;
    return CmdStrokeRef(byte_offset);
}
// Payload of the Fill command: two endpoints.
struct CmdFill {
    vec2 start;
    vec2 end;
};

// Serialized size of a CmdFill in bytes (four 32-bit words).
#define CmdFill_size 16

// Returns a reference to element `index` of a CmdFill array starting at `ref`.
CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
    uint byte_offset = index * CmdFill_size + ref.offset;
    return CmdFillRef(byte_offset);
}
// Payload of the FillEdge command.
struct CmdFillEdge {
    int sign;
    float y;
};

// Serialized size of a CmdFillEdge in bytes (two 32-bit words).
#define CmdFillEdge_size 8

// Returns a reference to element `index` of a CmdFillEdge array starting at `ref`.
CmdFillEdgeRef CmdFillEdge_index(CmdFillEdgeRef ref, uint index) {
    uint byte_offset = index * CmdFillEdge_size + ref.offset;
    return CmdFillEdgeRef(byte_offset);
}
// Payload of the DrawFill command.
struct CmdDrawFill {
    int backdrop;
    uint rgba_color;
};

// Serialized size of a CmdDrawFill in bytes (two 32-bit words).
#define CmdDrawFill_size 8

// Returns a reference to element `index` of a CmdDrawFill array starting at `ref`.
CmdDrawFillRef CmdDrawFill_index(CmdDrawFillRef ref, uint index) {
    uint byte_offset = index * CmdDrawFill_size + ref.offset;
    return CmdDrawFillRef(byte_offset);
}
// Payload of the Solid command: a single 32-bit color word.
struct CmdSolid {
    uint rgba_color;
};

// Serialized size of a CmdSolid in bytes (one 32-bit word).
#define CmdSolid_size 4

// Returns a reference to element `index` of a CmdSolid array starting at `ref`.
CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) {
    uint byte_offset = index * CmdSolid_size + ref.offset;
    return CmdSolidRef(byte_offset);
}
// Tag values stored in the first word of a Cmd record; the tag selects which
// payload (if any) follows it.
#define Cmd_End 0
#define Cmd_Circle 1
#define Cmd_Line 2
#define Cmd_Fill 3
#define Cmd_Stroke 4
#define Cmd_FillEdge 5
#define Cmd_DrawFill 6
#define Cmd_Solid 7
#define Cmd_Bail 8

// Stride of one Cmd record in bytes: a 4-byte tag plus the largest payload
// (16 bytes).
#define Cmd_size 20

// Returns a reference to element `index` of a Cmd array starting at `ref`.
CmdRef Cmd_index(CmdRef ref, uint index) {
    uint byte_offset = index * Cmd_size + ref.offset;
    return CmdRef(byte_offset);
}
// Decodes a CmdCircle from four consecutive words of `ptcl` at `ref`.
CmdCircle CmdCircle_read(CmdCircleRef ref) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    vec2 center = uintBitsToFloat(uvec2(ptcl[base], ptcl[base + 1]));
    float radius = uintBitsToFloat(ptcl[base + 2]);
    uint rgba_color = ptcl[base + 3];
    return CmdCircle(center, radius, rgba_color);
}
// Encodes `s` into four consecutive words of `ptcl` at `ref`.
void CmdCircle_write(CmdCircleRef ref, CmdCircle s) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    uvec2 center_bits = floatBitsToUint(s.center);
    ptcl[base] = center_bits.x;
    ptcl[base + 1] = center_bits.y;
    ptcl[base + 2] = floatBitsToUint(s.radius);
    ptcl[base + 3] = s.rgba_color;
}
// Decodes a CmdLine from four consecutive words of `ptcl` at `ref`.
CmdLine CmdLine_read(CmdLineRef ref) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    vec2 start = uintBitsToFloat(uvec2(ptcl[base], ptcl[base + 1]));
    vec2 end = uintBitsToFloat(uvec2(ptcl[base + 2], ptcl[base + 3]));
    return CmdLine(start, end);
}
// Encodes `s` into four consecutive words of `ptcl` at `ref`.
void CmdLine_write(CmdLineRef ref, CmdLine s) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    ptcl[base] = start_bits.x;
    ptcl[base + 1] = start_bits.y;
    ptcl[base + 2] = end_bits.x;
    ptcl[base + 3] = end_bits.y;
}
// Decodes a CmdStroke from two consecutive words of `ptcl` at `ref`.
CmdStroke CmdStroke_read(CmdStrokeRef ref) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    float half_width = uintBitsToFloat(ptcl[base]);
    uint rgba_color = ptcl[base + 1];
    return CmdStroke(half_width, rgba_color);
}
// Encodes `s` into two consecutive words of `ptcl` at `ref`.
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    uint half_width_bits = floatBitsToUint(s.halfWidth);
    ptcl[base] = half_width_bits;
    ptcl[base + 1] = s.rgba_color;
}
// Decodes a CmdFill from four consecutive words of `ptcl` at `ref`.
CmdFill CmdFill_read(CmdFillRef ref) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    vec2 start = uintBitsToFloat(uvec2(ptcl[base], ptcl[base + 1]));
    vec2 end = uintBitsToFloat(uvec2(ptcl[base + 2], ptcl[base + 3]));
    return CmdFill(start, end);
}
// Encodes `s` into four consecutive words of `ptcl` at `ref`.
void CmdFill_write(CmdFillRef ref, CmdFill s) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    ptcl[base] = start_bits.x;
    ptcl[base + 1] = start_bits.y;
    ptcl[base + 2] = end_bits.x;
    ptcl[base + 3] = end_bits.y;
}
// Decodes a CmdFillEdge from two consecutive words of `ptcl` at `ref`.
CmdFillEdge CmdFillEdge_read(CmdFillEdgeRef ref) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    int edge_sign = int(ptcl[base]);
    float edge_y = uintBitsToFloat(ptcl[base + 1]);
    return CmdFillEdge(edge_sign, edge_y);
}
// Encodes `s` into two consecutive words of `ptcl` at `ref`.
void CmdFillEdge_write(CmdFillEdgeRef ref, CmdFillEdge s) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    uint sign_word = uint(s.sign);
    ptcl[base] = sign_word;
    ptcl[base + 1] = floatBitsToUint(s.y);
}
// Decodes a CmdDrawFill from two consecutive words of `ptcl` at `ref`.
CmdDrawFill CmdDrawFill_read(CmdDrawFillRef ref) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    int backdrop = int(ptcl[base]);
    uint rgba_color = ptcl[base + 1];
    return CmdDrawFill(backdrop, rgba_color);
}
// Encodes `s` into two consecutive words of `ptcl` at `ref`.
void CmdDrawFill_write(CmdDrawFillRef ref, CmdDrawFill s) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    uint backdrop_word = uint(s.backdrop);
    ptcl[base] = backdrop_word;
    ptcl[base + 1] = s.rgba_color;
}
// Decodes a CmdSolid from the single word of `ptcl` at `ref`.
CmdSolid CmdSolid_read(CmdSolidRef ref) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    return CmdSolid(ptcl[base]);
}
// Encodes `s` into the single word of `ptcl` at `ref`.
void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
    // Convert the byte offset into an index into the uint array.
    uint base = ref.offset >> 2;
    ptcl[base] = s.rgba_color;
}
// Returns the tag word of the Cmd record at `ref` (one of the Cmd_* values).
uint Cmd_tag(CmdRef ref) {
    uint tag_word = ref.offset >> 2;
    return ptcl[tag_word];
}
// Reads the CmdCircle payload of the Cmd at `ref`; the payload starts 4 bytes
// in, after the tag word. The tag itself is not checked.
CmdCircle Cmd_Circle_read(CmdRef ref) {
    CmdCircleRef payload = CmdCircleRef(ref.offset + 4);
    return CmdCircle_read(payload);
}
// Reads the CmdLine payload of the Cmd at `ref`; the payload starts 4 bytes
// in, after the tag word. The tag itself is not checked.
CmdLine Cmd_Line_read(CmdRef ref) {
    CmdLineRef payload = CmdLineRef(ref.offset + 4);
    return CmdLine_read(payload);
}
// Reads the CmdFill payload of the Cmd at `ref`; the payload starts 4 bytes
// in, after the tag word. The tag itself is not checked.
CmdFill Cmd_Fill_read(CmdRef ref) {
    CmdFillRef payload = CmdFillRef(ref.offset + 4);
    return CmdFill_read(payload);
}
// Reads the CmdStroke payload of the Cmd at `ref`; the payload starts 4 bytes
// in, after the tag word. The tag itself is not checked.
CmdStroke Cmd_Stroke_read(CmdRef ref) {
    CmdStrokeRef payload = CmdStrokeRef(ref.offset + 4);
    return CmdStroke_read(payload);
}
// Reads the CmdFillEdge payload of the Cmd at `ref`; the payload starts 4
// bytes in, after the tag word. The tag itself is not checked.
CmdFillEdge Cmd_FillEdge_read(CmdRef ref) {
    CmdFillEdgeRef payload = CmdFillEdgeRef(ref.offset + 4);
    return CmdFillEdge_read(payload);
}
// Reads the CmdDrawFill payload of the Cmd at `ref`; the payload starts 4
// bytes in, after the tag word. The tag itself is not checked.
CmdDrawFill Cmd_DrawFill_read(CmdRef ref) {
    CmdDrawFillRef payload = CmdDrawFillRef(ref.offset + 4);
    return CmdDrawFill_read(payload);
}
// Reads the CmdSolid payload of the Cmd at `ref`; the payload starts 4 bytes
// in, after the tag word. The tag itself is not checked.
CmdSolid Cmd_Solid_read(CmdRef ref) {
    CmdSolidRef payload = CmdSolidRef(ref.offset + 4);
    return CmdSolid_read(payload);
}
// Writes an End record at `ref`: only the tag word is stored, no payload.
void Cmd_End_write(CmdRef ref) {
    uint tag_word = ref.offset >> 2;
    ptcl[tag_word] = Cmd_End;
}
// Writes a Circle record at `ref`: the tag word, then the CmdCircle payload
// 4 bytes further in.
void Cmd_Circle_write(CmdRef ref, CmdCircle s) {
    uint tag_word = ref.offset >> 2;
    CmdCircleRef payload = CmdCircleRef(ref.offset + 4);
    ptcl[tag_word] = Cmd_Circle;
    CmdCircle_write(payload, s);
}
// Writes a Line record at `ref`: the tag word, then the CmdLine payload
// 4 bytes further in.
void Cmd_Line_write(CmdRef ref, CmdLine s) {
    uint tag_word = ref.offset >> 2;
    CmdLineRef payload = CmdLineRef(ref.offset + 4);
    ptcl[tag_word] = Cmd_Line;
    CmdLine_write(payload, s);
}
// Writes a Fill record at `ref`: the tag word, then the CmdFill payload
// 4 bytes further in.
void Cmd_Fill_write(CmdRef ref, CmdFill s) {
    uint tag_word = ref.offset >> 2;
    CmdFillRef payload = CmdFillRef(ref.offset + 4);
    ptcl[tag_word] = Cmd_Fill;
    CmdFill_write(payload, s);
}
// Writes a Stroke record at `ref`: the tag word, then the CmdStroke payload
// 4 bytes further in.
void Cmd_Stroke_write(CmdRef ref, CmdStroke s) {
    uint tag_word = ref.offset >> 2;
    CmdStrokeRef payload = CmdStrokeRef(ref.offset + 4);
    ptcl[tag_word] = Cmd_Stroke;
    CmdStroke_write(payload, s);
}
// Writes a FillEdge record at `ref`: the tag word, then the CmdFillEdge
// payload 4 bytes further in.
void Cmd_FillEdge_write(CmdRef ref, CmdFillEdge s) {
    uint tag_word = ref.offset >> 2;
    CmdFillEdgeRef payload = CmdFillEdgeRef(ref.offset + 4);
    ptcl[tag_word] = Cmd_FillEdge;
    CmdFillEdge_write(payload, s);
}
// Writes a DrawFill record at `ref`: the tag word, then the CmdDrawFill
// payload 4 bytes further in.
void Cmd_DrawFill_write(CmdRef ref, CmdDrawFill s) {
    uint tag_word = ref.offset >> 2;
    CmdDrawFillRef payload = CmdDrawFillRef(ref.offset + 4);
    ptcl[tag_word] = Cmd_DrawFill;
    CmdDrawFill_write(payload, s);
}
// Writes a Solid record at `ref`: the tag word, then the CmdSolid payload
// 4 bytes further in.
void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
    uint tag_word = ref.offset >> 2;
    CmdSolidRef payload = CmdSolidRef(ref.offset + 4);
    ptcl[tag_word] = Cmd_Solid;
    CmdSolid_write(payload, s);
}
// Writes a Bail record at `ref`: only the tag word is stored, no payload.
void Cmd_Bail_write(CmdRef ref) {
    uint tag_word = ref.offset >> 2;
    ptcl[tag_word] = Cmd_Bail;
}

View file

@ -73,6 +73,7 @@ fn dump_scene(buf: &[u8]) {
} }
} }
#[allow(unused)]
fn dump_k1_data(k1_buf: &[u32]) { fn dump_k1_data(k1_buf: &[u32]) {
for i in 0..k1_buf.len() { for i in 0..k1_buf.len() {
if k1_buf[i] != 0 { if k1_buf[i] != 0 {
@ -96,7 +97,9 @@ fn main() {
.create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev) .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
.unwrap(); .unwrap();
device.write_buffer(&scene_buf, &scene).unwrap(); device.write_buffer(&scene_buf, &scene).unwrap();
// These should only be on the host if we're going to examine them from Rust.
let tilegroup_buf = device.create_buffer(384 * 1024, host).unwrap(); let tilegroup_buf = device.create_buffer(384 * 1024, host).unwrap();
let ptcl_buf = device.create_buffer(12 * 1024 * 4096, host).unwrap();
let image_buf = device let image_buf = device
.create_buffer((WIDTH * HEIGHT * 4) as u64, host) .create_buffer((WIDTH * HEIGHT * 4) as u64, host)
.unwrap(); .unwrap();
@ -110,16 +113,23 @@ fn main() {
.create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf]) .create_descriptor_set(&k1_pipeline, &[&scene_dev, &tilegroup_buf])
.unwrap(); .unwrap();
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 3).unwrap();
let k3_ds = device
.create_descriptor_set(&k3_pipeline, &[&scene_dev, &tilegroup_buf, &ptcl_buf])
.unwrap();
let code = include_bytes!("../shader/image.spv"); let code = include_bytes!("../shader/image.spv");
let pipeline = device.create_simple_compute_pipeline(code, 2).unwrap(); let pipeline = device.create_simple_compute_pipeline(code, 2).unwrap();
let descriptor_set = device let descriptor_set = device
.create_descriptor_set(&pipeline, &[&scene_dev, &image_dev]) .create_descriptor_set(&pipeline, &[&scene_dev, &image_dev])
.unwrap(); .unwrap();
let query_pool = device.create_query_pool(3).unwrap(); let query_pool = device.create_query_pool(4).unwrap();
let mut cmd_buf = device.create_cmd_buf().unwrap(); let mut cmd_buf = device.create_cmd_buf().unwrap();
cmd_buf.begin(); cmd_buf.begin();
cmd_buf.copy_buffer(&scene_buf, &scene_dev); cmd_buf.copy_buffer(&scene_buf, &scene_dev);
cmd_buf.clear_buffer(&tilegroup_buf); cmd_buf.clear_buffer(&tilegroup_buf);
cmd_buf.clear_buffer(&ptcl_buf);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.write_timestamp(&query_pool, 0); cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.dispatch( cmd_buf.dispatch(
@ -129,22 +139,36 @@ fn main() {
); );
cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.dispatch(
&k3_pipeline,
&k3_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
cmd_buf.dispatch( cmd_buf.dispatch(
&pipeline, &pipeline,
&descriptor_set, &descriptor_set,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
); );
cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.copy_buffer(&image_dev, &image_buf); cmd_buf.copy_buffer(&image_dev, &image_buf);
cmd_buf.finish(); cmd_buf.finish();
device.run_cmd_buf(&cmd_buf).unwrap(); device.run_cmd_buf(&cmd_buf).unwrap();
let timestamps = device.reap_query_pool(query_pool).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap();
println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
println!("Render time: {:.3}ms", (timestamps[1] - timestamps[0]) * 1e3); println!(
"Kernel 3 time: {:.3}ms",
(timestamps[1] - timestamps[0]) * 1e3
);
println!(
"Render time: {:.3}ms",
(timestamps[2] - timestamps[1]) * 1e3
);
let mut k1_data: Vec<u32> = Default::default(); let mut k1_data: Vec<u32> = Default::default();
device.read_buffer(&tilegroup_buf, &mut k1_data).unwrap(); device.read_buffer(&ptcl_buf, &mut k1_data).unwrap();
dump_k1_data(&k1_data); dump_k1_data(&k1_data);
let mut img_data: Vec<u8> = Default::default(); let mut img_data: Vec<u8> = Default::default();