Reuse command buffers

Reuse submitted command buffers rather than continually allocating them.

This patch also improves the story across the different backends. On
DX12 it was reusing allocators without resetting them, which could be a
leak. And on Metal the reset "fails," so there's always a new alloc.
This commit is contained in:
Raph Levien 2021-10-21 18:07:46 -07:00
parent b423e6c25d
commit 59e850a7b1
10 changed files with 131 additions and 70 deletions

View file

@ -173,6 +173,9 @@ pub trait CmdBuf<D: Device> {
unsafe fn finish(&mut self);
/// Return true if the command buffer is suitable for reuse.
unsafe fn reset(&mut self) -> bool;
unsafe fn dispatch(
&mut self,
pipeline: &D::Pipeline,

View file

@ -3,7 +3,6 @@
mod error;
mod wrappers;
use std::sync::{Arc, Mutex, Weak};
use std::{cell::Cell, convert::TryInto, mem, ptr};
use winapi::shared::minwindef::TRUE;
@ -33,7 +32,6 @@ pub struct Dx12Swapchain {
pub struct Dx12Device {
device: Device,
free_allocators: Arc<Mutex<Vec<CommandAllocator>>>,
command_queue: CommandQueue,
ts_freq: u64,
gpu_info: GpuInfo,
@ -54,10 +52,8 @@ pub struct Image {
pub struct CmdBuf {
c: wrappers::GraphicsCommandList,
allocator: Option<CommandAllocator>,
// One for resetting, one to put back into the allocator pool
allocator_clone: CommandAllocator,
free_allocators: Weak<Mutex<Vec<CommandAllocator>>>,
allocator: CommandAllocator,
needs_reset: bool,
}
pub struct Pipeline {
@ -184,11 +180,9 @@ impl Dx12Instance {
has_memory_model: false,
use_staging_buffers,
};
let free_allocators = Default::default();
Ok(Dx12Device {
device,
command_queue,
free_allocators,
ts_freq,
memory_arch,
gpu_info,
@ -295,23 +289,18 @@ impl crate::backend::Device for Dx12Device {
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
let list_type = d3d12::D3D12_COMMAND_LIST_TYPE_DIRECT;
let allocator = self.free_allocators.lock().unwrap().pop();
let allocator = if let Some(allocator) = allocator {
allocator
} else {
let allocator =
unsafe { self.device.create_command_allocator(list_type)? }
};
;
let node_mask = 0;
unsafe {
let c = self
.device
.create_graphics_command_list(list_type, &allocator, None, node_mask)?;
let free_allocators = Arc::downgrade(&self.free_allocators);
Ok(CmdBuf {
c,
allocator: Some(allocator.clone()),
allocator_clone: allocator,
free_allocators,
allocator,
needs_reset: false,
})
}
}
@ -364,9 +353,6 @@ impl crate::backend::Device for Dx12Device {
.map(|c| c.c.as_raw_command_list())
.collect::<SmallVec<[_; 4]>>();
self.command_queue.execute_command_lists(&lists);
for c in cmd_bufs {
c.c.reset(&c.allocator_clone, None);
}
if let Some(fence) = fence {
let val = fence.val.get() + 1;
fence.val.set(val);
@ -464,19 +450,18 @@ impl Dx12Device {
}
impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
unsafe fn begin(&mut self) {}
unsafe fn begin(&mut self) {
if self.needs_reset {
}
}
unsafe fn finish(&mut self) {
let _ = self.c.close();
// This is a bit of a mess. Returning the allocator to the free pool
// makes sense if the command list will be dropped, but not if it will
// be reused. Probably need to implement some logic on drop.
if let Some(free_allocators) = self.free_allocators.upgrade() {
free_allocators
.lock()
.unwrap()
.push(self.allocator.take().unwrap());
}
self.needs_reset = true;
}
unsafe fn reset(&mut self) -> bool {
self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
}
unsafe fn dispatch(

View file

@ -849,6 +849,12 @@ impl Drop for Event {
}
}
impl CommandAllocator {
pub unsafe fn reset(&self) -> Result<(), Error> {
error::error_if_failed_else_unit(self.0.Reset())
}
}
impl GraphicsCommandList {
pub unsafe fn as_raw_command_list(&self) -> *mut d3d12::ID3D12CommandList {
self.0.as_raw() as *mut d3d12::ID3D12CommandList
@ -858,10 +864,9 @@ impl GraphicsCommandList {
explain_error(self.0.Close(), "error closing command list")
}
pub unsafe fn reset(&self, allocator: &CommandAllocator, initial_pso: Option<&PipelineState>) {
pub unsafe fn reset(&self, allocator: &CommandAllocator, initial_pso: Option<&PipelineState>) -> Result<(), Error> {
let p_initial_state = initial_pso.map(|p| p.0.as_raw()).unwrap_or(ptr::null_mut());
error::error_if_failed_else_unit(self.0.Reset(allocator.0.as_raw(), p_initial_state))
.expect("could not reset command list");
}
pub unsafe fn set_compute_pipeline_root_signature(&self, signature: &RootSignature) {

View file

@ -48,8 +48,12 @@ struct SessionInner {
/// Actual work done by the GPU is encoded into a command buffer and then
/// submitted to the session in a batch.
pub struct CmdBuf {
cmd_buf: mux::CmdBuf,
fence: Fence,
// The invariant is that these options are always populated except
// when the struct is being destroyed. It would be possible to get
// rid of them by using this unsafe trick:
// https://phaazon.net/blog/blog/rust-no-drop
cmd_buf: Option<mux::CmdBuf>,
fence: Option<Fence>,
resources: Vec<RetainResource>,
session: Weak<SessionInner>,
}
@ -158,8 +162,8 @@ impl Session {
(cmd_buf, fence)
};
Ok(CmdBuf {
cmd_buf,
fence,
cmd_buf: Some(cmd_buf),
fence: Some(fence),
resources: Vec::new(),
session: Arc::downgrade(&self.0),
})
@ -202,23 +206,23 @@ impl Session {
// some cases.
staging.memory_barrier();
staging.finish();
cmd_bufs.push(&staging.cmd_buf);
cmd_bufs.push(staging.cmd_buf.as_ref().unwrap());
}
cmd_bufs.push(&cmd_buf.cmd_buf);
cmd_bufs.push(cmd_buf.cmd_buf.as_ref().unwrap());
self.0.device.run_cmd_bufs(
&cmd_bufs,
wait_semaphores,
signal_semaphores,
Some(&mut cmd_buf.fence),
Some(cmd_buf.fence.as_mut().unwrap()),
)?;
Ok(SubmittedCmdBuf(
Some(SubmittedCmdBufInner {
cmd_buf: cmd_buf.cmd_buf,
fence: cmd_buf.fence,
resources: cmd_buf.resources,
cmd_buf: cmd_buf.cmd_buf.take().unwrap(),
fence: cmd_buf.fence.take().unwrap(),
resources: std::mem::take(&mut cmd_buf.resources),
staging_cmd_buf,
}),
cmd_buf.session,
std::mem::replace(&mut cmd_buf.session, Weak::new()),
))
}
@ -397,22 +401,24 @@ impl SessionInner {
let _should_handle_err = self.device.destroy_fence(item.fence);
std::mem::drop(item.resources);
if let Some(staging_cmd_buf) = item.staging_cmd_buf {
let _should_handle_err = self.device.destroy_cmd_buf(staging_cmd_buf.cmd_buf);
let _should_handle_err = self.device.destroy_fence(staging_cmd_buf.fence);
std::mem::drop(staging_cmd_buf.resources);
if let Some(mut staging_cmd_buf) = item.staging_cmd_buf {
staging_cmd_buf.destroy(self);
}
}
}
impl CmdBuf {
fn cmd_buf(&mut self) -> &mut mux::CmdBuf {
self.cmd_buf.as_mut().unwrap()
}
/// Begin recording into a command buffer.
///
/// Always call this before encoding any actual work.
///
/// Discussion question: can this be subsumed?
pub unsafe fn begin(&mut self) {
self.cmd_buf.begin();
self.cmd_buf().begin();
}
/// Finish recording into a command buffer.
@ -420,7 +426,7 @@ impl CmdBuf {
/// Always call this as the last method before submitting the command
/// buffer.
pub unsafe fn finish(&mut self) {
self.cmd_buf.finish();
self.cmd_buf().finish();
}
/// Dispatch a compute shader.
@ -438,7 +444,7 @@ impl CmdBuf {
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf
self.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
}
@ -447,7 +453,7 @@ impl CmdBuf {
/// Compute kernels (and other actions) after this barrier may read from buffers
/// that were written before this barrier.
pub unsafe fn memory_barrier(&mut self) {
self.cmd_buf.memory_barrier();
self.cmd_buf().memory_barrier();
}
/// Insert a barrier for host access to buffers.
@ -458,7 +464,7 @@ impl CmdBuf {
/// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/
/// ("Host memory reads") for an explanation of this barrier.
pub unsafe fn host_barrier(&mut self) {
self.cmd_buf.memory_barrier();
self.cmd_buf().memory_barrier();
}
/// Insert an image barrier, transitioning image layout.
@ -475,7 +481,7 @@ impl CmdBuf {
src_layout: ImageLayout,
dst_layout: ImageLayout,
) {
self.cmd_buf
self.cmd_buf()
.image_barrier(image.mux_image(), src_layout, dst_layout);
}
@ -483,21 +489,22 @@ impl CmdBuf {
///
/// When the size is not specified, it clears the whole buffer.
pub unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
self.cmd_buf.clear_buffer(buffer.mux_buffer(), size);
self.cmd_buf().clear_buffer(buffer.mux_buffer(), size);
}
/// Copy one buffer to another.
///
/// When the buffers differ in size, the minimum of the sizes is used.
pub unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
self.cmd_buf.copy_buffer(src.mux_buffer(), dst.mux_buffer());
self.cmd_buf()
.copy_buffer(src.mux_buffer(), dst.mux_buffer());
}
/// Copy an image to a buffer.
///
/// The size of the image and buffer must match.
pub unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
self.cmd_buf
self.cmd_buf()
.copy_image_to_buffer(src.mux_image(), dst.mux_buffer());
// TODO: change the backend signature to allow failure, as in "not
// implemented" or "unaligned", and fall back to compute shader
@ -508,7 +515,7 @@ impl CmdBuf {
///
/// The size of the image and buffer must match.
pub unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
self.cmd_buf
self.cmd_buf()
.copy_buffer_to_image(src.mux_buffer(), dst.mux_image());
// See above.
}
@ -521,7 +528,7 @@ impl CmdBuf {
/// Discussion question: we might have a specialized version of this
/// function for copying to the swapchain image, and a separate type.
pub unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
self.cmd_buf.blit_image(src.mux_image(), dst.mux_image());
self.cmd_buf().blit_image(src.mux_image(), dst.mux_image());
}
/// Reset the query pool.
@ -530,14 +537,14 @@ impl CmdBuf {
/// This is annoying, and we could tweak the API to make it implicit, doing
/// the reset before the first timestamp write.
pub unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
self.cmd_buf.reset_query_pool(pool);
self.cmd_buf().reset_query_pool(pool);
}
/// Write a timestamp.
///
/// The query index must be less than the size of the query pool on creation.
pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
self.cmd_buf.write_timestamp(pool, query);
self.cmd_buf().write_timestamp(pool, query);
}
/// Prepare the timestamps for reading. This isn't required on Vulkan but
@ -546,7 +553,7 @@ impl CmdBuf {
/// It's possible we'll make this go away, by implicitly including it
/// on command buffer submission when a query pool has been written.
pub unsafe fn finish_timestamps(&mut self, pool: &QueryPool) {
self.cmd_buf.finish_timestamps(pool);
self.cmd_buf().finish_timestamps(pool);
}
/// Make sure the resource lives until the command buffer completes.
@ -574,16 +581,52 @@ impl SubmittedCmdBuf {
///
/// Resources for which destruction was deferred through
/// [`add_resource`][`CmdBuf::add_resource`] will actually be dropped here.
pub fn wait(mut self) -> Result<(), Error> {
///
/// If the command buffer is still available for reuse, it is returned.
pub fn wait(mut self) -> Result<Option<CmdBuf>, Error> {
let mut item = self.0.take().unwrap();
if let Some(session) = Weak::upgrade(&self.1) {
unsafe {
session.device.wait_and_reset(vec![&mut item.fence])?;
session.cleanup_submitted_cmd_buf(item);
if let Some(mut staging_cmd_buf) = item.staging_cmd_buf {
staging_cmd_buf.destroy(&session);
}
if item.cmd_buf.reset() {
return Ok(Some(CmdBuf {
cmd_buf: Some(item.cmd_buf),
fence: Some(item.fence),
resources: Vec::new(),
session: std::mem::take(&mut self.1),
}));
} else {
return Ok(None);
}
}
}
// else session dropped error?
Ok(())
Ok(None)
}
}
impl Drop for CmdBuf {
fn drop(&mut self) {
if let Some(session) = Weak::upgrade(&self.session) {
unsafe {
self.destroy(&session);
}
}
}
}
impl CmdBuf {
unsafe fn destroy(&mut self, session: &SessionInner) {
if let Some(cmd_buf) = self.cmd_buf.take() {
let _ = session.device.destroy_cmd_buf(cmd_buf);
}
if let Some(fence) = self.fence.take() {
let _ = session.device.destroy_fence(fence);
}
self.resources.clear();
}
}

View file

@ -407,6 +407,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
unsafe fn finish(&mut self) {}
unsafe fn reset(&mut self) -> bool {
false
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,

View file

@ -622,6 +622,14 @@ impl CmdBuf {
}
}
pub unsafe fn reset(&mut self) -> bool {
mux_match! { self;
CmdBuf::Vk(c) => c.reset(),
CmdBuf::Dx12(c) => c.reset(),
CmdBuf::Mtl(c) => c.reset(),
}
}
/// Dispatch a compute shader.
///
/// Note that both the number of workgroups (`workgroup_count`) and the number of

View file

@ -862,6 +862,10 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
self.device.device.end_command_buffer(self.cmd_buf).unwrap();
}
unsafe fn reset(&mut self) -> bool {
true
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,

View file

@ -39,3 +39,6 @@ ndk = "0.3"
ndk-sys = "0.2.0"
ndk-glue = "0.3"
raw-window-handle = "0.3"
[package.metadata.android.application]
debuggable = true

View file

@ -12,7 +12,7 @@ use ndk::native_window::NativeWindow;
use ndk_glue::Event;
use piet_gpu_hal::{
Error, ImageLayout, Instance, QueryPool, Semaphore, Session, SubmittedCmdBuf, Surface,
CmdBuf, Error, ImageLayout, Instance, QueryPool, Semaphore, Session, SubmittedCmdBuf, Surface,
Swapchain,
};
@ -37,6 +37,7 @@ struct GfxState {
swapchain: Swapchain,
current_frame: usize,
submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES],
cmd_bufs: [Option<CmdBuf>; NUM_FRAMES],
query_pools: Vec<QueryPool>,
present_semaphores: Vec<Semaphore>,
}
@ -112,6 +113,7 @@ impl GfxState {
.map(|_| session.create_query_pool(8))
.collect::<Result<Vec<_>, Error>>()?;
let submitted = Default::default();
let cmd_bufs = Default::default();
let renderer = Renderer::new(&session, width, height, NUM_FRAMES)?;
@ -121,6 +123,7 @@ impl GfxState {
swapchain,
current_frame,
submitted,
cmd_bufs,
query_pools,
present_semaphores,
})
@ -134,7 +137,7 @@ impl GfxState {
let mut info_string = String::new();
if let Some(submitted) = self.submitted[frame_idx].take() {
submitted.wait().unwrap();
self.cmd_bufs[frame_idx] = submitted.wait().unwrap();
let ts = self
.session
.fetch_query_pool(&self.query_pools[frame_idx])
@ -152,7 +155,9 @@ impl GfxState {
let (image_idx, acquisition_semaphore) = self.swapchain.next().unwrap();
let swap_image = self.swapchain.image(image_idx);
let query_pool = &self.query_pools[frame_idx];
let mut cmd_buf = self.session.cmd_buf().unwrap();
let mut cmd_buf = self.cmd_bufs[frame_idx]
.take()
.unwrap_or_else(|| self.session.cmd_buf().unwrap());
cmd_buf.begin();
self.renderer.record(&mut cmd_buf, &query_pool, frame_idx);

View file

@ -1,6 +1,6 @@
use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu_hal::{Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
@ -53,6 +53,7 @@ fn main() -> Result<(), Error> {
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(8))
.collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
@ -76,7 +77,7 @@ fn main() -> Result<(), Error> {
let frame_idx = current_frame % NUM_FRAMES;
if let Some(submitted) = submitted[frame_idx].take() {
submitted.wait().unwrap();
cmd_bufs[frame_idx] = submitted.wait().unwrap();
let ts = session.fetch_query_pool(&query_pools[frame_idx]).unwrap();
info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
@ -112,7 +113,7 @@ fn main() -> Result<(), Error> {
let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
let swap_image = swapchain.image(image_idx);
let query_pool = &query_pools[frame_idx];
let mut cmd_buf = session.cmd_buf().unwrap();
let mut cmd_buf = cmd_bufs[frame_idx].take().unwrap_or_else(|| session.cmd_buf().unwrap());
cmd_buf.begin();
renderer.record(&mut cmd_buf, &query_pool, frame_idx);