diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 73f33ee..672b42d 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -41,7 +41,7 @@ fn main() -> Result<(), Error> { let fence = device.create_fence(false)?; let mut cmd_buf = device.create_cmd_buf()?; - let query_pool = device.create_query_pool(4)?; + let query_pool = device.create_query_pool(5)?; let mut ctx = PietGpuRenderContext::new(); render_scene(&mut ctx); @@ -62,10 +62,11 @@ fn main() -> Result<(), Error> { println!("Element kernel time: {:.3}ms", ts[0] * 1e3); println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); + println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); /* let mut data: Vec = Default::default(); - device.read_buffer(&renderer.bin_buf, &mut data).unwrap(); + device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap(); piet_gpu::dump_k1_data(&data); let mut data: Vec = Default::default(); diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 4e4ff19..da25ce4 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -46,6 +46,17 @@ shared uint sh_bitmaps[N_SLICE][N_TILE]; #define SX (1.0 / float(TILE_WIDTH_PX)) #define SY (1.0 / float(TILE_HEIGHT_PX)) +// Perhaps cmd_limit should be a global? This is a style question. +void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) { + if (cmd_ref.offset > cmd_limit) { + uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC); + CmdJump jump = CmdJump(new_cmd); + Cmd_Jump_write(cmd_ref, jump); + cmd_ref = CmdRef(new_cmd); + cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + } +} + void main() { // Could use either linear or 2d layouts for both dispatch and // invocations within the workgroup. We'll use variables to abstract. @@ -53,6 +64,13 @@ void main() { // Top left coordinates of this bin. vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y); uint th_ix = gl_LocalInvocationID.x; + + uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X; + uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X; + uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x; + CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); + uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + uint wr_ix = 0; uint rd_ix = 0; uint first_el; @@ -172,6 +190,7 @@ void main() { y++; } } + barrier(); // Output elements for this tile, based on bitmaps. uint slice_ix = 0; @@ -193,13 +212,25 @@ void main() { // At this point, we read the element again from global memory. // If that turns out to be expensive, maybe we can pack it into // shared memory (or perhaps just the tag). - probe += 1; + ref = AnnotatedRef(element_ix * Annotated_size); + tag = Annotated_tag(ref); + + switch (tag) { + case Annotated_Fill: + case Annotated_Stroke: + // Note: we take advantage of the fact that fills and strokes + // have compatible layout. + AnnoFill fill = Annotated_Fill_read(ref); + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); + break; + } // clear LSB bitmap &= bitmap - 1; } rd_ix += N_TILE; + break; } while (wr_ix > rd_ix); - ptcl[bin_ix * N_TILE + th_ix] = probe; } diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index ded68da..ed005fc 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index d6f33b7..2df43ec 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -39,7 +39,8 @@ void main() { uvec2 xy_uint = gl_GlobalInvocationID.xy; vec2 xy = vec2(xy_uint); vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT); - vec3 rgb = uv.xyy; + //vec3 rgb = uv.xyy; + vec3 rgb = vec3(0.75); while (true) { uint tag = Cmd_tag(cmd_ref); diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index d9dacc0..00e1ac3 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index cca0b0b..2527b50 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -30,7 +30,7 @@ const PTCL_INITIAL_ALLOC: usize = 1024; const K2_PER_TILE_SIZE: usize = 8; -const N_CIRCLES: usize = 1; +const N_CIRCLES: usize = 0; const N_WG: u32 = 16; @@ -47,11 +47,13 @@ pub fn render_scene(rc: &mut impl RenderContext) { rc.fill(circle, &color); } let mut path = BezPath::new(); + /* path.move_to((100.0, 1150.0)); path.line_to((200.0, 1200.0)); path.line_to((150.0, 1250.0)); path.close_path(); rc.fill(path, &Color::rgb8(128, 0, 128)); + */ rc.stroke( Line::new((100.0, 100.0), (200.0, 150.0)), &Color::WHITE, @@ -134,29 +136,9 @@ pub struct Renderer { coarse_alloc_buf_host: D::Buffer, coarse_alloc_buf_dev: D::Buffer, - /* - k1_alloc_buf_host: D::Buffer, - k1_alloc_buf_dev: D::Buffer, - k2s_alloc_buf_host: D::Buffer, - k2s_alloc_buf_dev: D::Buffer, - k2f_alloc_buf_host: D::Buffer, - k2f_alloc_buf_dev: D::Buffer, - k3_alloc_buf_host: D::Buffer, - k3_alloc_buf_dev: D::Buffer, - tilegroup_buf: D::Buffer, - ptcl_buf: D::Buffer, - - k1_pipeline: D::Pipeline, - k1_ds: D::DescriptorSet, - k2s_pipeline: D::Pipeline, - k2s_ds: D::DescriptorSet, - k2f_pipeline: D::Pipeline, - k2f_ds: D::DescriptorSet, - k3_pipeline: D::Pipeline, - k3_ds: D::DescriptorSet, k4_pipeline: D::Pipeline, k4_ds: D::DescriptorSet, - */ + n_elements: usize, } @@ -213,10 +195,10 @@ impl Renderer { let coarse_alloc_buf_host = device.create_buffer(4, host)?; let coarse_alloc_buf_dev = device.create_buffer(4, dev)?; - let coarse_alloc_start = 256 * 64 * N_WG; + let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; device .write_buffer(&coarse_alloc_buf_host, &[ - coarse_alloc_start, + coarse_alloc_start as u32, ]) ?; let coarse_code = include_bytes!("../shader/coarse.spv"); @@ -227,72 +209,11 @@ impl Renderer { &[], )?; - /* - let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?; - let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?; + // These will probably be combined with the ptcl buf, as they're all written by the + // same kernel now. let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?; - let k1_alloc_buf_host = device.create_buffer(4, host)?; - let k1_alloc_buf_dev = device.create_buffer(4, dev)?; - let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE; - device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?; - let k1_code = include_bytes!("../shader/kernel1.spv"); - let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?; - let k1_ds = device.create_descriptor_set( - &k1_pipeline, - &[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev], - &[], - )?; - - let k2s_alloc_buf_host = device.create_buffer(4, host)?; - let k2s_alloc_buf_dev = device.create_buffer(4, dev)?; - let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE; - device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?; - let k2s_code = include_bytes!("../shader/kernel2s.spv"); - let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?; - let k2s_ds = device.create_descriptor_set( - &k2s_pipeline, - &[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev], - &[], - )?; - - let k2f_alloc_buf_host = device.create_buffer(4, host)?; - let k2f_alloc_buf_dev = device.create_buffer(4, dev)?; - let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE; - device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?; - let k2f_code = include_bytes!("../shader/kernel2f.spv"); - let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?; - let k2f_ds = device.create_descriptor_set( - &k2f_pipeline, - &[ - &scene_dev, - &tilegroup_buf, - &fill_seg_buf, - &k2f_alloc_buf_dev, - ], - &[], - )?; - - let k3_alloc_buf_host = device.create_buffer(4, host)?; - let k3_alloc_buf_dev = device.create_buffer(4, dev)?; - let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; - device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?; - let k3_code = include_bytes!("../shader/kernel3.spv"); - let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?; - let k3_ds = device.create_descriptor_set( - &k3_pipeline, - &[ - &scene_dev, - &tilegroup_buf, - &segment_buf, - &fill_seg_buf, - &ptcl_buf, - &k3_alloc_buf_dev, - ], - &[], - )?; - let k4_code = include_bytes!("../shader/kernel4.spv"); let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?; let k4_ds = device.create_descriptor_set( @@ -300,7 +221,6 @@ impl Renderer { &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev], )?; - */ Ok(Renderer { scene_buf, @@ -312,6 +232,8 @@ impl Renderer { bin_ds, coarse_pipeline, coarse_ds, + k4_pipeline, + k4_ds, state_buf, anno_buf, bin_buf, @@ -339,7 +261,7 @@ impl Renderer { cmd_buf.dispatch( &self.el_pipeline, &self.el_ds, - ((self.n_elements / 128) as u32, 1, 1), + (((self.n_elements + 127) / 128) as u32, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); @@ -357,6 +279,13 @@ impl Renderer { ); cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &self.k4_pipeline, + &self.k4_ds, + ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), + ); + cmd_buf.write_timestamp(&query_pool, 4); + cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } }