Start applying compute pass to tests

Use the compute pass API for the tests in the tests subdir. This also shakes out some issues that weren't apparent from the collatz example alone.

In particular, we need more autorelease pools to prevent things from leaking. As of this commit, the "clear" test runs correctly but the others haven't yet been converted to the compute_pass format.
Raph Levien 2022-04-20 10:21:49 -07:00
parent 58836244a4
commit 5a9b8d9243
5 changed files with 96 additions and 43 deletions
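
In outline, each test's timing changes from writing timestamps directly on the command buffer to opening a timed compute pass and ending it explicitly. A minimal before/after sketch using the runner names from the diff below (surrounding setup elided):

    // Before: paired timestamps around the recorded dispatch.
    let mut commands = runner.commands();
    commands.write_timestamp(0);
    stage.record(&mut commands, &code, &binding);
    commands.write_timestamp(1);

    // After: the pass carries the timer queries (0, 1) and must be ended.
    let mut commands = runner.commands();
    let mut pass = commands.compute_pass(0, 1);
    stage.record(&mut pass, &code, &binding);
    pass.end();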

@@ -21,8 +21,8 @@ pub use crate::mux::{
 };
 pub use bufwrite::BufWrite;
 pub use hub::{
-    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
-    Session, SubmittedCmdBuf,
+    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
+    RetainResource, Session, SubmittedCmdBuf,
 };
 // TODO: because these are conditionally included, "cargo fmt" does not

@@ -381,8 +381,25 @@ impl crate::backend::Device for MtlDevice {
     fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
         let cmd_queue = self.cmd_queue.lock().unwrap();
+        // A discussion about autorelease pools.
+        //
+        // Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
+        // you can have any two of correctness, ergonomics, and performance. Here we've
+        // chosen the first two, using the pattern of a fine grained autorelease pool
+        // to give the Obj-C object Rust-like lifetime semantics whenever objects are
+        // created as autorelease (by convention, this is any object creation with an
+        // Obj-C method name that doesn't begin with "new" or "alloc").
+        //
+        // To gain back some of the performance, we'd need a way to wrap an autorelease
+        // pool over a chunk of work - that could be one frame of rendering, but for
+        // tests that iterate a number of command buffer submissions, it would need to
+        // be around that. On non-mac platforms, it would be a no-op.
+        //
+        // In any case, this way, the caller doesn't need to worry, and the performance
+        // hit might not be so bad (perhaps we should measure).
         // consider new_command_buffer_with_unretained_references for performance
-        let cmd_buf = cmd_queue.new_command_buffer().to_owned();
+        let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
         let helpers = self.helpers.clone();
         let cur_encoder = Encoder::None;
         let time_calibration = Default::default();
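
The comment above leaves pool amortization as future work: one pool per chunk of work rather than one per object. A hedged sketch of what such a wrapper could look like (`with_autorelease_scope` is an invented name, not part of this commit):

    // Hypothetical cross-platform scope: drains an autorelease pool on macOS,
    // compiles down to a plain call everywhere else.
    #[cfg(target_os = "macos")]
    fn with_autorelease_scope<T>(f: impl FnOnce() -> T) -> T {
        objc::rc::autoreleasepool(f)
    }

    #[cfg(not(target_os = "macos"))]
    fn with_autorelease_scope<T>(f: impl FnOnce() -> T) -> T {
        f()
    }

    // A test loop would then wrap each command buffer submission:
    // for _ in 0..n_iter {
    //     with_autorelease_scope(|| run_one_iteration());
    // }
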
@@ -555,32 +572,38 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
     }
 
     unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
-        debug_assert!(matches!(self.cur_encoder, Encoder::None));
-        let encoder = if let Some(queries) = &desc.timer_queries {
-            let descriptor: id = msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
-            let attachments: id = msg_send![descriptor, sampleBufferAttachments];
-            let index: NSUInteger = 0;
-            let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
-            // Here we break the hub/mux separation a bit, for expedience
-            #[allow(irrefutable_let_patterns)]
-            if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
-                if let Some(sample_buf) = &query_pool.counter_sample_buf {
-                    let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+        // TODO: we might want to get better about validation but the following
+        // assert is likely to trigger, and also a case can be made that
+        // validation should be done at the hub level, for consistency.
+        //debug_assert!(matches!(self.cur_encoder, Encoder::None));
+        self.flush_encoder();
+        autoreleasepool(|| {
+            let encoder = if let Some(queries) = &desc.timer_queries {
+                let descriptor: id =
+                    msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
+                let attachments: id = msg_send![descriptor, sampleBufferAttachments];
+                let index: NSUInteger = 0;
+                let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
+                // Here we break the hub/mux separation a bit, for expedience
+                #[allow(irrefutable_let_patterns)]
+                if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
+                    if let Some(sample_buf) = &query_pool.counter_sample_buf {
+                        let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
+                    }
                 }
-            }
-            let start_index = queries.1 as NSUInteger;
-            let end_index = queries.2 as NSInteger;
-            let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
-            let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
-            let encoder = msg_send![
-                self.cmd_buf,
-                computeCommandEncoderWithDescriptor: descriptor
-            ];
-            encoder
-        } else {
-            self.cmd_buf.new_compute_command_encoder()
-        };
-        self.cur_encoder = Encoder::Compute(encoder.to_owned());
+                let start_index = queries.1 as NSUInteger;
+                let end_index = queries.2 as NSInteger;
+                let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
+                let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
+                msg_send![
+                    self.cmd_buf,
+                    computeCommandEncoderWithDescriptor: descriptor
+                ]
+            } else {
+                self.cmd_buf.new_compute_command_encoder()
+            };
+            self.cur_encoder = Encoder::Compute(encoder.to_owned());
+        });
     }
 
     unsafe fn dispatch(

@@ -67,6 +67,12 @@ impl CounterSampleBuffer {
     }
 }
 
+impl Drop for CounterSet {
+    fn drop(&mut self) {
+        unsafe { msg_send![self.id, release] }
+    }
+}
+
 impl CounterSet {
     pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
         unsafe {
@@ -86,6 +92,19 @@ impl CounterSet {
     }
 }
 
+// copied from metal-rs; should be in common utilities maybe?
+fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
+    let bytes = unsafe {
+        let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
+        bytes as *const u8
+    };
+    let len: NSUInteger = unsafe { msg_send![nsstr, length] };
+    unsafe {
+        let bytes = std::slice::from_raw_parts(bytes, len as usize);
+        std::str::from_utf8(bytes).unwrap()
+    }
+}
+
 impl CounterSampleBuffer {
     pub fn new(
         device: &DeviceRef,
@@ -107,6 +126,11 @@
         let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
         let () = msg_send![descriptor, release];
         if !error.is_null() {
+            let description = msg_send![error, localizedDescription];
+            println!(
+                "error allocating sample buffer, code = {}",
+                nsstring_as_str(description)
+            );
             let () = msg_send![error, release];
             return None;
         }
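
One subtlety in the error path above: the `&str` returned by `nsstring_as_str` borrows the NSString's UTF-8 buffer, so it must be used while that object is still alive; here it is consumed by `println!` before `error` is released. If the string needed to outlive the release, copying it first would be the safe pattern, e.g. (a sketch reusing the names above):

    // Take an owned copy before the Obj-C objects are released.
    let msg = nsstring_as_str(description).to_string();
    let () = msg_send![error, release];
    println!("error allocating sample buffer, code = {}", msg);
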
@@ -138,7 +162,7 @@ impl TimeCalibration {
         let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
         let adj_ts = if delta_gpu > 0 {
             let scale = delta_cpu as f64 / delta_gpu as f64;
-            self.cpu_start_ts as f64 + (raw_ts - self.gpu_start_ts) as f64 * scale
+            self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
         } else {
             // Default is ns on Apple Silicon; on other hardware this will be wrong
             raw_ts as f64
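
The one-line change in the last hunk is an underflow fix: the timestamps are u64, so the old `(raw_ts - self.gpu_start_ts) as f64` subtracts in unsigned integer space first, which panics in debug builds (and wraps in release) whenever a sample precedes the calibration start. Casting before subtracting keeps the math in f64, where a negative delta is representable. A tiny illustration with invented values:

    let gpu_start_ts: u64 = 1_000;
    let raw_ts: u64 = 900; // sample taken before calibration start
    // Old form: u64 subtraction underflows (panic in debug, wrap in release).
    let wrapped = raw_ts.wrapping_sub(gpu_start_ts);
    assert!(wrapped > u64::MAX / 2); // a nonsense-huge value, not -100
    // New form: cast first, subtract in f64.
    assert_eq!(raw_ts as f64 - gpu_start_ts as f64, -100.0);
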

@@ -16,11 +16,11 @@
 
 //! Utilities (and a benchmark) for clearing buffers with compute shaders.
 
-use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
+use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
 use piet_gpu_hal::{Buffer, Pipeline};
 
 use crate::config::Config;
-use crate::runner::{Commands, Runner};
+use crate::runner::Runner;
 use crate::test_result::TestResult;
 
 const WG_SIZE: u64 = 256;
@@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
-        commands.write_timestamp(0);
-        stage.record(&mut commands, &code, &binding);
-        commands.write_timestamp(1);
+        let mut pass = commands.compute_pass(0, 1);
+        stage.record(&mut pass, &code, &binding);
+        pass.end();
         if i == 0 || config.verify_all {
             commands.cmd_buf.memory_barrier();
             commands.download(&out_buf);
@@ -108,17 +108,12 @@ impl ClearStage {
         ClearBinding { descriptor_set }
     }
 
-    pub unsafe fn record(
-        &self,
-        commands: &mut Commands,
-        code: &ClearCode,
-        bindings: &ClearBinding,
-    ) {
+    pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
         let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
         // An issue: for clearing large buffers (>16M), we need to check the
        // number of workgroups against the (dynamically detected) limit, and
         // potentially issue multiple dispatches.
-        commands.cmd_buf.dispatch(
+        pass.dispatch(
             &code.pipeline,
             &bindings.descriptor_set,
             (n_workgroups as u32, 1, 1),
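
The comment in `record` flags a real limitation: a single dispatch can't exceed the device's workgroup-count limit, so clearing very large buffers would need several dispatches. A hedged sketch of the split it anticipates (the limit constant and per-chunk offset handling are assumptions, and the trailing workgroup-size argument mirrors the call that the diff truncates):

    // Hypothetical splitting of an oversized 1-D dispatch.
    const MAX_WG_PER_DISPATCH: u64 = 65_535; // assumed device limit
    let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
    let mut issued = 0;
    while issued < n_workgroups {
        let chunk = (n_workgroups - issued).min(MAX_WG_PER_DISPATCH);
        // Each chunk would also need a base offset (e.g. via push constants)
        // so the shader clears a distinct range of the buffer.
        pass.dispatch(
            &code.pipeline,
            &bindings.descriptor_set,
            (chunk as u32, 1, 1),
            (WG_SIZE as u32, 1, 1),
        );
        issued += chunk;
    }
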

@@ -20,8 +20,8 @@ use std::ops::RangeBounds;
 use bytemuck::Pod;
 
 use piet_gpu_hal::{
-    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
-    QueryPool, Session,
+    BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
+    ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
 };
 
 pub struct Runner {
@@ -118,10 +118,21 @@ impl Runner {
 }
 
 impl Commands {
+    #[deprecated(note = "use compute_pass instead")]
     pub unsafe fn write_timestamp(&mut self, query: u32) {
        self.cmd_buf.write_timestamp(&self.query_pool, query);
     }
 
+    /// Start a compute pass with timer queries.
+    pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
+        self.cmd_buf
+            .begin_compute_pass(&ComputePassDescriptor::timer(
+                &self.query_pool,
+                start_query,
+                end_query,
+            ))
+    }
+
     pub unsafe fn upload(&mut self, buf: &BufStage) {
         self.cmd_buf.copy_buffer(&buf.stage_buf, &buf.dev_buf);
     }
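
For orientation, `begin_compute_pass` consumes a descriptor whose timer queries the Metal backend reads as an `Option` of a `(pool, start, end)` triple (`queries.0`, `.1`, `.2` in the backend hunk above). A plausible shape inferred from those uses — the real definition lives in piet-gpu-hal and is not part of this diff, so field and lifetime names are guesses:

    pub struct ComputePassDescriptor<'a> {
        /// (query pool, start index, end index), when timing is requested.
        pub timer_queries: Option<(&'a QueryPool, u32, u32)>,
    }

    impl<'a> ComputePassDescriptor<'a> {
        pub fn timer(pool: &'a QueryPool, start: u32, end: u32) -> ComputePassDescriptor<'a> {
            ComputePassDescriptor {
                timer_queries: Some((pool, start, end)),
            }
        }
    }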