Start writing tiles

This is the first checkpoint where it actually runs a pipeline end to
end, though it's far from accurate.
This commit is contained in:
Raph Levien 2020-05-15 12:28:29 -07:00
parent 06cad48dca
commit 3a6428238b
6 changed files with 56 additions and 94 deletions

View file

@ -41,7 +41,7 @@ fn main() -> Result<(), Error> {
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
let query_pool = device.create_query_pool(4)?;
let query_pool = device.create_query_pool(5)?;
let mut ctx = PietGpuRenderContext::new();
render_scene(&mut ctx);
@ -62,10 +62,11 @@ fn main() -> Result<(), Error> {
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
/*
let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.bin_buf, &mut data).unwrap();
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data);
let mut data: Vec<u32> = Default::default();

View file

@ -46,6 +46,17 @@ shared uint sh_bitmaps[N_SLICE][N_TILE];
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
// Make sure `cmd_ref` points at a location where a command may be written.
//
// While `cmd_ref.offset` is at or below `cmd_limit` this is a no-op. Once the
// offset has moved past the limit, a fresh chunk of PTCL_INITIAL_ALLOC bytes
// is reserved by atomically bumping the shared `alloc` counter, a jump command
// targeting that chunk is written at the current position, and both `cmd_ref`
// and `cmd_limit` are updated to describe the new chunk.
//
// The limit is kept 2 * Cmd_size short of the end of the chunk (presumably so
// that one more command plus the chaining jump always fit -- confirm against
// the callers).
//
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    // Fast path: the current chunk still has room.
    if (cmd_ref.offset <= cmd_limit) {
        return;
    }

    // atomicAdd yields the counter's value before the addition, i.e. the
    // start offset of the chunk reserved here.
    uint chunk_start = atomicAdd(alloc, PTCL_INITIAL_ALLOC);

    // Chain the old chunk to the new one, then continue writing there.
    Cmd_Jump_write(cmd_ref, CmdJump(chunk_start));
    cmd_ref = CmdRef(chunk_start);
    cmd_limit = chunk_start + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
@ -53,6 +64,13 @@ void main() {
// Top left coordinates of this bin.
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
uint th_ix = gl_LocalInvocationID.x;
uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
uint wr_ix = 0;
uint rd_ix = 0;
uint first_el;
@ -172,6 +190,7 @@ void main() {
y++;
}
}
barrier();
// Output elements for this tile, based on bitmaps.
uint slice_ix = 0;
@ -193,13 +212,25 @@ void main() {
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag).
probe += 1;
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
switch (tag) {
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
break;
}
// clear LSB
bitmap &= bitmap - 1;
}
rd_ix += N_TILE;
break;
} while (wr_ix > rd_ix);
ptcl[bin_ix * N_TILE + th_ix] = probe;
}

Binary file not shown.

View file

@ -39,7 +39,8 @@ void main() {
uvec2 xy_uint = gl_GlobalInvocationID.xy;
vec2 xy = vec2(xy_uint);
vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
vec3 rgb = uv.xyy;
//vec3 rgb = uv.xyy;
vec3 rgb = vec3(0.75);
while (true) {
uint tag = Cmd_tag(cmd_ref);

Binary file not shown.

View file

@ -30,7 +30,7 @@ const PTCL_INITIAL_ALLOC: usize = 1024;
const K2_PER_TILE_SIZE: usize = 8;
const N_CIRCLES: usize = 1;
const N_CIRCLES: usize = 0;
const N_WG: u32 = 16;
@ -47,11 +47,13 @@ pub fn render_scene(rc: &mut impl RenderContext) {
rc.fill(circle, &color);
}
let mut path = BezPath::new();
/*
path.move_to((100.0, 1150.0));
path.line_to((200.0, 1200.0));
path.line_to((150.0, 1250.0));
path.close_path();
rc.fill(path, &Color::rgb8(128, 0, 128));
*/
rc.stroke(
Line::new((100.0, 100.0), (200.0, 150.0)),
&Color::WHITE,
@ -134,29 +136,9 @@ pub struct Renderer<D: Device> {
coarse_alloc_buf_host: D::Buffer,
coarse_alloc_buf_dev: D::Buffer,
/*
k1_alloc_buf_host: D::Buffer,
k1_alloc_buf_dev: D::Buffer,
k2s_alloc_buf_host: D::Buffer,
k2s_alloc_buf_dev: D::Buffer,
k2f_alloc_buf_host: D::Buffer,
k2f_alloc_buf_dev: D::Buffer,
k3_alloc_buf_host: D::Buffer,
k3_alloc_buf_dev: D::Buffer,
tilegroup_buf: D::Buffer,
ptcl_buf: D::Buffer,
k1_pipeline: D::Pipeline,
k1_ds: D::DescriptorSet,
k2s_pipeline: D::Pipeline,
k2s_ds: D::DescriptorSet,
k2f_pipeline: D::Pipeline,
k2f_ds: D::DescriptorSet,
k3_pipeline: D::Pipeline,
k3_ds: D::DescriptorSet,
k4_pipeline: D::Pipeline,
k4_ds: D::DescriptorSet,
*/
n_elements: usize,
}
@ -213,10 +195,10 @@ impl<D: Device> Renderer<D> {
let coarse_alloc_buf_host = device.create_buffer(4, host)?;
let coarse_alloc_buf_dev = device.create_buffer(4, dev)?;
let coarse_alloc_start = 256 * 64 * N_WG;
let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device
.write_buffer(&coarse_alloc_buf_host, &[
coarse_alloc_start,
coarse_alloc_start as u32,
])
?;
let coarse_code = include_bytes!("../shader/coarse.spv");
@ -227,72 +209,11 @@ impl<D: Device> Renderer<D> {
&[],
)?;
/*
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
// These will probably be combined with the ptcl buf, as they're all written by the
// same kernel now.
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let k1_alloc_buf_host = device.create_buffer(4, host)?;
let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
let k1_code = include_bytes!("../shader/kernel1.spv");
let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?;
let k1_ds = device.create_descriptor_set(
&k1_pipeline,
&[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
&[],
)?;
let k2s_alloc_buf_host = device.create_buffer(4, host)?;
let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?;
let k2s_code = include_bytes!("../shader/kernel2s.spv");
let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?;
let k2s_ds = device.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
&[],
)?;
let k2f_alloc_buf_host = device.create_buffer(4, host)?;
let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?;
let k2f_code = include_bytes!("../shader/kernel2f.spv");
let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
let k2f_ds = device.create_descriptor_set(
&k2f_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&fill_seg_buf,
&k2f_alloc_buf_dev,
],
&[],
)?;
let k3_alloc_buf_host = device.create_buffer(4, host)?;
let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?;
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
let k3_ds = device.create_descriptor_set(
&k3_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&fill_seg_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
&[],
)?;
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
let k4_ds = device.create_descriptor_set(
@ -300,7 +221,6 @@ impl<D: Device> Renderer<D> {
&[&ptcl_buf, &segment_buf, &fill_seg_buf],
&[&image_dev],
)?;
*/
Ok(Renderer {
scene_buf,
@ -312,6 +232,8 @@ impl<D: Device> Renderer<D> {
bin_ds,
coarse_pipeline,
coarse_ds,
k4_pipeline,
k4_ds,
state_buf,
anno_buf,
bin_buf,
@ -339,7 +261,7 @@ impl<D: Device> Renderer<D> {
cmd_buf.dispatch(
&self.el_pipeline,
&self.el_ds,
((self.n_elements / 128) as u32, 1, 1),
(((self.n_elements + 127) / 128) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
@ -357,6 +279,13 @@ impl<D: Device> Renderer<D> {
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k4_pipeline,
&self.k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
}