diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 2fac015..402b13d 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -1016,7 +1016,7 @@ unsafe fn choose_compute_device( devices: &[vk::PhysicalDevice], surface: Option<&VkSurface>, ) -> Option<(vk::PhysicalDevice, u32)> { - for pdevice in devices { + for pdevice in &devices[1..] { let props = instance.get_physical_device_queue_family_properties(*pdevice); for (ix, info) in props.iter().enumerate() { // Check for surface presentation support diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index fe8c4ac..cc4cb44 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -181,12 +181,10 @@ fn main() -> Result<(), Error> { println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); - /* let mut data: Vec = Default::default(); - device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap(); - //piet_gpu::dump_k1_data(&data); - trace_ptcl(&data); - */ + device.read_buffer(&renderer.bin_buf, &mut data).unwrap(); + piet_gpu::dump_k1_data(&data); + //trace_ptcl(&data); let mut img_data: Vec = Default::default(); // Note: because png can use a `&[u8]` slice, we could avoid an extra copy diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index 713a654..d193dd2 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -45,8 +45,6 @@ shared uint bitmaps[N_SLICE][N_TILE]; shared uint count[N_SLICE][N_TILE]; shared uint sh_my_tile; shared uint sh_chunk_start[N_TILE]; -shared uint sh_chunk_end[N_TILE]; -shared uint sh_chunk_jump[N_TILE]; shared float sh_right_edge[N_TILE]; @@ -57,8 +55,6 @@ uint state_right_edge_index(uint partition_ix) { } void main() { - BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); - uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC; uint chunk_n = 0; uint my_n_elements = n_elements; while (true) { @@ -169,41 +165,15 @@ void main() { count[i][gl_LocalInvocationID.x] = element_count; } // element_count is number of elements covering bin for this invocation. + uint chunk_start = 0; if (element_count != 0) { - uint chunk_end; - uint chunk_new_start; - // Refactor to reduce code duplication? - if (chunk_n > 0) { - uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * BinInstance_size; - if (next_chunk + BinChunk_size + min(24, element_count * BinInstance_size) > wr_limit) { - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * BinInstance_size); - // could try to reduce fragmentation if BIN_ALLOC is only a bit above needed - next_chunk = atomicAdd(alloc, alloc_amount); - wr_limit = next_chunk + alloc_amount; - } - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk))); - chunk_ref = BinChunkRef(next_chunk); - } - BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); - if (instance_ref.offset + element_count * BinInstance_size > wr_limit) { - chunk_end = wr_limit; - chunk_n = (wr_limit - instance_ref.offset) / BinInstance_size; - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * BinInstance_size); - chunk_new_start = atomicAdd(alloc, alloc_amount); - wr_limit = chunk_new_start + alloc_amount; - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start))); - chunk_ref = BinChunkRef(chunk_new_start); - chunk_new_start += BinChunk_size; - chunk_n = element_count - chunk_n; - } else { - chunk_end = ~0; - chunk_new_start = ~0; - chunk_n = element_count; - } - sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset; - sh_chunk_end[gl_LocalInvocationID.x] = chunk_end; - sh_chunk_jump[gl_LocalInvocationID.x] = chunk_new_start - chunk_end; + // TODO: aggregate atomic adds (subgroup is probably fastest) + chunk_start = atomicAdd(alloc, element_count * BinInstance_size); + sh_chunk_start[gl_LocalInvocationID.x] = chunk_start; } + uint out_ix = (my_tile * N_TILE + gl_LocalInvocationID.x) * 2; + bins[out_ix] = element_count; + bins[out_ix + 1] = chunk_start; barrier(); // Use similar strategy as Laine & Karras paper; loop over bbox of bins @@ -219,9 +189,6 @@ void main() { idx += count[my_slice - 1][bin_ix]; } uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size; - if (out_offset >= sh_chunk_end[bin_ix]) { - out_offset += sh_chunk_jump[bin_ix]; - } BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge)); } x++; @@ -231,5 +198,4 @@ void main() { } } } - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0))); } diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index e932e4d..9f22a33 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index d951b24..b58a327 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 65bbe5c..5f0f6be 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -160,7 +160,7 @@ impl Renderer { let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?; let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?; - let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?; + let bin_buf = device.create_buffer(64 * 1024 * 1024, host)?; let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?; let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; @@ -176,12 +176,12 @@ impl Renderer { let bin_alloc_buf_dev = device.create_buffer(12, dev)?; // TODO: constants - let bin_alloc_start = 256 * 64 * N_WG; + let bin_alloc_start = ((n_elements + 255) & !255) * 8; device .write_buffer(&bin_alloc_buf_host, &[ n_elements as u32, 0, - bin_alloc_start, + bin_alloc_start as u32, ]) ?; let bin_code = include_bytes!("../shader/binning.spv"); @@ -268,18 +268,22 @@ impl Renderer { ); cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.memory_barrier(); + /* cmd_buf.dispatch( &self.coarse_pipeline, &self.coarse_ds, (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1), ); + */ cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); + /* cmd_buf.dispatch( &self.k4_pipeline, &self.k4_ds, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); + */ cmd_buf.write_timestamp(&query_pool, 4); cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);